In [24]:
import xgboost as xgb
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# add the path to my packages to system paths so they can be imported
import sys
sys.path.append('/home/yasamanparhizkar/Documents/yorku/01_thesis/simgraph/code/my_packages')
# sys.path.append('F:/Users/yasam/Documents/GitHub/simgraph/code/my_packages')
# sys.path.append('/home/yasamanparhizkar/Documents/thesis/code/my_packages')

import dataprocess.data_handler_03 as dh

# XGBoost
Regression with XGBoost to predict the summed spike activities from movie features.

## Load spike data

In [25]:
# load all spike data from file
# spikes_dp is the directory holding the spike files; it is concatenated with the
# file name as-is, so it must end with a path separator
spikes_dp = '../../../local_data/original_files/'
grouped_data = np.load(spikes_dp+'summed_spikes.npy') # alternative files from the same directory can be loaded by changing the file name

# NOTE(review): I_order_10 is not referenced anywhere else in the visible part of
# this notebook - presumably a selection of 10 indices; confirm whether it is still needed
I_order_10 = [54, 35, 10, 60, 74, 9, 61, 56, 91, 104]

In [26]:
# inspect the dimensions of the loaded spike array
grouped_data.shape

(297, 1141, 1)

In [27]:
# Summarise the response data: print, for every integer label between the
# smallest and largest value in grouped_data, the percentage of entries equal to it.

# total number of entries; .size stays correct whatever the rank of the array
length = grouped_data.size
lbl_min = int(np.min(grouped_data))
lbl_max = int(np.max(grouped_data))
print('labels range from {} to {}.'.format(lbl_min, lbl_max))

print('label      | percentage belonging to label   ')
print('---------------------------------------------')
sanity = 0
for lbl in range(lbl_min, lbl_max+1):
    # np.count_nonzero returns a plain Python int for the whole array in one
    # vectorised pass; this avoids nesting Python's builtin sum() over a
    # multi-dimensional array and converting a 1-element array with float(),
    # which NumPy has deprecated.
    class_perc = 100 * np.count_nonzero(grouped_data == lbl) / length
    print('label #{:3} | {:.2f} %'.format(lbl, class_perc))
    sanity += class_perc

# the percentages must add up to 100 (up to floating point rounding)
print("total percentage (should be %100): ", sanity)

labels range from 0 to 26.
label      | percentage belonging to label   
---------------------------------------------
label #  0 | 31.53 %
label #  1 | 15.38 %
label #  2 | 8.77 %
label #  3 | 7.07 %
label #  4 | 6.27 %
label #  5 | 5.61 %
label #  6 | 4.82 %
label #  7 | 4.22 %
label #  8 | 3.67 %
label #  9 | 3.12 %
label # 10 | 2.60 %
label # 11 | 2.07 %
label # 12 | 1.61 %
label # 13 | 1.19 %
label # 14 | 0.80 %
label # 15 | 0.52 %
label # 16 | 0.31 %
label # 17 | 0.18 %
label # 18 | 0.11 %
label # 19 | 0.06 %
label # 20 | 0.04 %
label # 21 | 0.02 %
label # 22 | 0.01 %
label # 23 | 0.01 %
label # 24 | 0.00 %
label # 25 | 0.00 %
label # 26 | 0.00 %
total percentage (should be %100):  100.00000000000003


## Load movie feature set

In [29]:
def get_mnist_labels(data_params):
    """
    Load the labels stored in 'lbls.csv' inside the features directory.

    Input: data_params
    data_params - dictionary; only its 'features_dp' entry is read here. The file
                  name is appended to it verbatim, so it must end with a path separator.

    Output: the array np.loadtxt parses from '<features_dp>lbls.csv'
    """
    labels_path = data_params['features_dp'] + 'lbls.csv'
    labels = np.loadtxt(labels_path)
    return labels

def transform_mnistsift(fv):
    """Subsample a feature vector: keep every 4th entry, starting at index 0."""
    keep_every = 4
    return fv[::keep_every]

def transform_slowfast(fv):
    """
    Subsample a feature vector to reduce run time and memory usage.

    Input: fv
    fv - feature vector; anything that supports extended slicing works
         (earlier notes describe it as a torch tensor of Df features - TODO confirm
         the actual type and shape against the data handler)

    Output: fvv
    fvv - every 200th entry of fv along its first axis, starting at index 0
    """
    stride = 200
    fvv = fv[::stride]

    # optional rescaling for numerical stability during GD (currently disabled)
    # fvv = fvv * 10

    return fvv

def transform_sift3d(fv):
    """Keep every 10th entry of fv (starting at index 0) and multiply the result by 10000."""
    subsampled = fv[::10]
    return subsampled * 10000

def transform_soenet(fv):
    """Subsample a feature vector: keep every 5th entry, starting at index 0."""
    keep_every = 5
    return fv[::keep_every]

# data retrieval params
# data_params = {'func': dh.datapoint_sift, 'lbl_func': get_mnist_labels, 'features_dp': '../../data/fe_exp/mnist-sift/', \
#                'spike_data': None, 'group_id': None, 'transform': transform_mnistsift, 'ind_min': 0, 'ind_max': 13203, 'feature_id':'mnist-sift'}

# data_params = {'func': dh.datapoint_numpy, 'lbl_func': dh.get_labels, 'features_dp': '../../data/features/slowfast/slowfast_4732_numpy/', \
#                'spike_data': grouped_data, 'group_id': 0, 'transform': transform_slowfast, 'ind_min': 1*1141+0, 'ind_max': 2*1141-1, 'feature_id':'slowfast'}

# active configuration: sift3d features together with the spike data of group 0
data_params = {
    'func': dh.datapoint_numpy,
    'lbl_func': dh.get_labels,
    'features_dp': '../../data/features/sift3d/fvs_s1_with_kp/desc/',
    'spike_data': grouped_data,
    'group_id': 0,
    'transform': transform_sift3d,
    'ind_min': 1*1141+0,
    'ind_max': 2*1141-1,
    'feature_id': 'sift3d',
}

# data_params = {'func': dh.datapoint_numpy, 'lbl_func': dh.get_labels, 'features_dp': '../../data/features/soenet/soenet3/features_2layer/', \
#                'spike_data': grouped_data, 'group_id': 0, 'transform': transform_soenet, 'ind_min': 1*1141+41, 'ind_max': 2*1141-1, 'feature_id':'soenet'}

In [31]:
# requested sizes of the training and validation sets
train_num = 100
val_num = 50

# build both sets with a fixed seed; the sizes returned by the data handler
# overwrite the requested ones
(train_num, val_num, train_data, val_data) = dh.random_train_val(
    train_num, val_num, data_params, seed=0
)

# optional normalization
# train_data['des'] = dh.normalize(train_data['des'])
# val_data['des'] = dh.normalize(val_data['des'])

# show statistics
print('feature_id: ', data_params['feature_id'])
print('train_num = ', train_num, ', val_num = ', val_num)
print('number of features: ', train_data['des'].shape[1])

# sample indices, labels and feature arrays of both sets
print('train_smpls = ', train_data['smpls'], '\nval_smpls = ', val_data['smpls'])
print('train_lbls = ', train_data['lbls'], '\nval_lbls = ', val_data['lbls'])
print('train_des = ', train_data['des'], '\nval_des = ', val_data['des'])

# summary statistics over all training feature values
for message, reducer in (
    ('minimum value of train features: ', np.min),
    ('mean value of train features: ', np.mean),
    ('median value of train features: ', np.median),
    ('maximum value of train features: ', np.max),
):
    print(message, reducer(train_data['des']))

feature_id:  sift3d
train_num =  100 , val_num =  50
number of features:  77
train_smpls =  [1994 2025 2012 1146 2253 1414 1163 1475 1365 1553 1143 1306 2086 1195
 1427 1984 1449 2192 1149 1591 2270 1364 1705 1283 1228 1967 1905 1622
 1557 1781 1316 1915 1706 2147 1921 1175 1578 1641 1658 1686 2167 2244
 1269 1181 1836 1774 1170 1698 2056 1937 1505 1862 1885 1711 1754 1396
 1837 1387 1421 2139 1718 2193 2090 1320 1447 1689 1883 1865 1697 1895
 1907 1216 2130 1193 2185 1539 1853 2022 1408 2251 1824 1753 1409 2138
 1524 1646 1992 2007 1552 1403 1541 1564 1790 1198 1218 2217 2049 1232
 1813 1362] 
val_smpls =  [1487 1576 1511 1205 2234 2153 1875 1636 2141 1680 1879 1955 2229 1750
 1773 2001 1638 1709 2242 2197 1962 1778 2117 1821 1242 2228 1496 1818
 1916 1223 1983 1791 2008 2006 1470 1860 2037 1538 2102 1922 2065 1649
 2202 1684 1606 2183 2105 1260 1551 1157]
train_lbls =  [ 1.  3.  0.  1.  1.  3.  0.  5.  4.  0.  0. 10.  1.  1.  7.  0.  1.  6.
  3.  1.  1.  3.  5.  2.  0.  2.  8.  0.  1

## Create DMatrices from numpy arrays

In [43]:
# create xgb Dmatrices directly from numpy arrays
# ('des' holds the feature array and 'lbls' the matching labels of each set)
# NOTE(review): the model further down is fitted through the scikit-learn style
# XGBRegressor on DataFrames, so these DMatrices are only inspected, not trained on
dtrain = xgb.DMatrix(train_data['des'], label=train_data['lbls'])
dval = xgb.DMatrix(val_data['des'], label=val_data['lbls'])

# alternative kept for reference: separate DMatrices for features and labels
# dtrain_x = xgb.DMatrix(train_data['des'])
# dtrain_y = xgb.DMatrix(train_data['lbls'])
# dval_x = xgb.DMatrix(val_data['des'])
# dval_y = xgb.DMatrix(val_data['lbls'])

In [44]:
# sanity-check the DMatrices: their dimensions and the labels stored inside them
print('Train Dmatrix shape: {} x {}'.format(dtrain.num_row(), dtrain.num_col()))
print('Val Dmatrix shape: {} x {}'.format(dval.num_row(), dval.num_col()))
print('Train Dmatrix labels: ', dtrain.get_label())
print('Val Dmatrix labels: ', dval.get_label())

Train Dmatrix shape: 100 x 77
Val Dmatrix shape: 50 x 77
Train Dmatrix labels:  [ 1.  3.  0.  1.  1.  3.  0.  5.  4.  0.  0. 10.  1.  1.  7.  0.  1.  6.
  3.  1.  1.  3.  5.  2.  0.  2.  8.  0.  1.  4. 12.  2.  6.  9.  3.  0.
  1.  1.  8.  3.  8.  1.  3.  0.  9.  2.  0.  2.  0.  7.  1.  0.  2. 13.
  8.  7.  5.  5.  1. 11.  9.  3.  0.  3.  1.  5.  4.  3.  1.  4. 11.  0.
 18.  0. 12.  0.  0.  0.  0.  0.  6. 10.  3. 13.  5.  1.  0.  3.  0.  9.
  0.  0.  2.  0.  0.  3.  0.  0.  3.  3.]
Val Dmatrix labels:  [ 4.  0.  3.  1.  0.  2.  3.  1.  8.  7.  2.  2.  0. 13.  3.  1.  1.  9.
  0. 10.  1.  2.  8.  6.  0.  0.  4.  7.  1.  1.  0.  6.  4.  1. 10.  0.
  0.  0.  1.  2.  0.  3.  4.  2.  1. 11.  3.  0.  0.  0.]


In [45]:
# printing a DMatrix only shows its object representation, not its contents;
# use num_row() / num_col() / get_label() to inspect it
print(dtrain)

<xgboost.core.DMatrix object at 0x7f4a18965c10>


## Create Pandas DataFrames and Series from numpy arrays

In [48]:
# create a dataframe from the features array of each set
# (no column names are given, so columns are labelled by position)
train_df = pd.DataFrame(data = train_data['des'])
val_df = pd.DataFrame(data = val_data['des'])
# preview the first rows of the training features
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,67,68,69,70,71,72,73,74,75,76
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,67.61,0.0,77.16,132.39,43.49,0.0,0.0,120.87,209.94,27.88
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.27,0.0,0.57,29.9,0.0,0.28,0.04,14.84,6.53,3.04
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.25,113.18,190.23,692.9,0.21,10.11,122.15,349.08,120.23,49.76
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,308.91,0.0,281.79,583.57,194.92,6.77,0.1,457.96,780.29,98.12
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,5.15,1.15,0.0,0.38,0.1,8.64,2.03,0.0


In [49]:
# wrap the labels of each set in a pandas object
# NOTE: these are single-column DataFrames, not Series - later cells flatten them
# with .to_numpy().reshape(-1) before comparing against predictions
train_y = pd.DataFrame(data = train_data['lbls'])
val_y = pd.DataFrame(data = val_data['lbls'])
# preview the first training labels
train_y.head()

Unnamed: 0,0
0,1.0
1,3.0
2,0.0
3,1.0
4,1.0


## Train the XGBoost model

In [50]:
# Creating an XGBoost regressor; no arguments are passed, so the library's
# default hyperparameters are used
model = xgb.XGBRegressor()

In [51]:
# Training the model on the training data
# (features: train_df, targets: the single-column label frame train_y)
model.fit(train_df, train_y)

In [62]:
# Making predictions on the training set (the same data the model was fitted on)
train_predictions = model.predict(train_df)

In [66]:
# Score the fit on the training data: mean squared error, R-squared score and
# accuracy, where a prediction counts as correct when it rounds to the true label
train_mse = mean_squared_error(y_true=train_y, y_pred=train_predictions)
train_r2 = r2_score(y_true=train_y, y_pred=train_predictions)

# flatten the single-column label frame so it lines up with the 1-D predictions
is_exact = np.round(train_predictions) == train_y.to_numpy().reshape(-1)
train_acc = sum(is_exact) / train_num

print("Mean Squared Error on the TRAIN DATA:", train_mse)
print("R-squared Score on the TRAIN DATA:", train_r2)
print("Prediction Accuracy on the TRAIN DATA:", train_acc)

Mean Squared Error on the TRAIN DATA: 2.659758424535528e-07
R-squared Score on the TRAIN DATA: 0.9999999819284108


## Validate the XGBoost model

In [68]:
# Making predictions on the validation set (data the model has not been fitted on)
val_predictions = model.predict(val_df)

In [70]:
# Calculate the mean squared error, R-squared score and accuracy on the validation data
val_mse = mean_squared_error(val_y, val_predictions)
val_r2 = r2_score(val_y, val_predictions)
# Accuracy = share of validation predictions that round to the true label.
# This must use the VALIDATION predictions and labels; recomputing train_acc here
# (as the cell did before) left the validation accuracy unmeasured.
# The single-column label frame is flattened to line up with the 1-D predictions.
val_acc = sum(np.round(val_predictions) == val_y.to_numpy().reshape(-1)) / val_num

print("Mean Squared Error on the VALIDATION DATA:", val_mse)
print("R-squared Score on the VALIDATION DATA:", val_r2)
print("Prediction Accuracy on the VALIDATION DATA:", val_acc)

Mean Squared Error on the VALIDATION DATA: 14.477066295684443
R-squared Score on the VALIDATION DATA: -0.26124427582977083


# 

1.0

In [84]:
# display the number of training samples (the denominator of the train accuracy)
train_num

100