# Create pseudo data to simulate different fMRI parcellations

In [None]:
# Assuming there are 500000 voxels in an fMRI scan
# We create 3 different parcellations:
# 1. 50 parcels, each parcel has 10000 voxels
# 2. 500 parcels, each parcel has 1000 voxels
# 3. 500 parcels, each parcel has a different number of voxels (random sizes drawn from 1 to 1000, then rescaled so that the total number of voxels is still 500000)

# Pseudo data is created by scikit-learn's make_classification function

import numpy as np
import pandas as pd
import time
import pickle

from sklearn.datasets import make_classification

# Hyperparameters shared by every synthetic parcellation
N_SAMPLES = 800  # samples (rows) generated per parcel
N_INFORMATIVE_RATIO = 0.01  # fraction of a parcel's features that are informative

# Create 50 parcels, each parcel has 10000 voxels.
# Every entry is the (X, y) tuple returned by make_classification. The parcels
# are kept in a plain list: X is 2-D and y is 1-D, so np.array() cannot stack
# the tuples into a regular array and raises ValueError instead.
parcel_50 = [
    make_classification(
        n_samples=N_SAMPLES,
        n_features=10000,
        n_informative=int(10000 * N_INFORMATIVE_RATIO),
        n_classes=8,
    )
    for _ in range(50)
]

# Create 500 parcels, each parcel has 1000 voxels.
# Every entry is the (X, y) tuple returned by make_classification. The parcels
# are kept in a plain list: X is 2-D and y is 1-D, so np.array() cannot stack
# the tuples into a regular array and raises ValueError instead.
parcel_500 = [
    make_classification(
        n_samples=N_SAMPLES,
        n_features=1000,
        n_informative=int(1000 * N_INFORMATIVE_RATIO),
        n_classes=8,
    )
    for _ in range(500)
]

# Create 500 parcels whose sizes (number of voxels/features) differ from parcel
# to parcel while the total number of voxels is exactly 500000.

# make_classification enforces two constraints that small parcels would break:
#   n_classes * n_clusters_per_class <= 2 ** n_informative  -> 8 * 2 needs >= 4
#   n_informative + n_redundant + n_repeated <= n_features  -> default n_redundant is 2
MIN_INFORMATIVE = 4
MIN_FEATURES = MIN_INFORMATIVE + 2

# Draw random relative sizes and rescale them so that they sum to 500000.
random_features_num = np.random.randint(1, 1000, 500)
scaled_sizes = random_features_num / random_features_num.sum() * 500000
random_features_num = np.maximum(np.floor(scaled_sizes).astype(int), MIN_FEATURES)

# Flooring drops the fractional part of every size, so the total falls short
# of 500000 by fewer than 500 voxels (one per parcel at most). Hand those
# voxels back, one each, to the parcels that lost the largest fraction. In the
# rare case where the MIN_FEATURES floor overshoots the total instead, take
# the excess from the largest parcel.
deficit = 500000 - int(random_features_num.sum())
if deficit > 0:
    by_lost_fraction = np.argsort(scaled_sizes - np.floor(scaled_sizes))[::-1]
    random_features_num[by_lost_fraction[:deficit]] += 1
elif deficit < 0:
    random_features_num[np.argmax(random_features_num)] += deficit
print('Total number of voxels is: ', random_features_num.sum())

# Every entry is the (X, y) tuple returned by make_classification. The parcels
# are kept in a plain list: the X matrices have different widths, so they
# cannot be stacked into one regular NumPy array.
parcel_500_diff = [
    make_classification(
        n_samples=N_SAMPLES,
        n_features=int(n_features),
        n_informative=max(MIN_INFORMATIVE, int(n_features * N_INFORMATIVE_RATIO)),
        n_classes=8,
    )
    for n_features in random_features_num
]

# Persist every parcellation to its own pickle file in the working directory
datasets_to_save = (
    ('parcel_50.pickle', parcel_50),
    ('parcel_500.pickle', parcel_500),
    ('parcel_500_diff.pickle', parcel_500_diff),
)
for pickle_path, parcellation in datasets_to_save:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(parcellation, handle)

# Compare the performance of sklearn and cuML (No parallelization)

In [None]:
# Compare the performance of sklearn and cuML by evaluating the accuracy of the model and 
# the time it takes to train the model of different parcellations

# # Load the data
# with open('parcel_50.pickle', 'rb') as f:
#     parcel_50 = pickle.load(f)
# with open('parcel_500.pickle', 'rb') as f:
#     parcel_500 = pickle.load(f)
# with open('parcel_500_diff.pickle', 'rb') as f:
#     parcel_500_diff = pickle.load(f)

# Parcellations to benchmark; a parcel's position in this list is the value
# stored in the 'parcel' column of the log below
parcellations = [
    parcel_50,
    parcel_500,
    parcel_500_diff,
]

# Empty performance log: data set description followed by the timing and
# accuracy measured for each library
df = pd.DataFrame(
    columns=[
        'parcel',
        'n_features',
        'n_informative',
        'n_classes',
        'n_samples',
        'sklearn_time',
        'sklearn_accuracy',
        'cuml_time',
        'cuml_accuracy',
    ]
)



In [None]:
# Train and evaluate with sklearn and cuML with cross validation
# (RandomForestClassifier lives in sklearn.ensemble, not sklearn.linear_model)
from sklearn.ensemble import RandomForestClassifier as skRandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score

from cuml.ensemble import RandomForestClassifier as cuRandomForestClassifier


def _timed_cv_accuracy(model, X, y):
    """Run 5-fold cross validation and return (elapsed seconds, mean accuracy).

    perf_counter is used instead of time.time because it is monotonic and has
    the highest available resolution, which is what a benchmark needs.
    """
    start = time.perf_counter()
    scores = cross_validate(model, X, y, cv=5, scoring='accuracy')
    elapsed = time.perf_counter() - start
    return elapsed, scores['test_score'].mean()


# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
log_rows = []
for parcel_idx, parcellation in enumerate(parcellations):
    for dataset in parcellation:
        # Each data set is the (X, y) tuple returned by make_classification
        X, y = dataset[0], dataset[1]
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # The tuple does not carry n_informative, so it is re-derived from
        # N_INFORMATIVE_RATIO. make_classification cannot generate data with
        # fewer than log2(n_classes * n_clusters_per_class) informative
        # features (2 clusters per class by default), hence the lower bound.
        min_informative = int(np.ceil(np.log2(n_classes * 2)))
        n_informative = max(min_informative, int(n_features * N_INFORMATIVE_RATIO))

        # sklearn
        sk_model = skRandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        sklearn_time, sklearn_accuracy = _timed_cv_accuracy(sk_model, X, y)

        # cuML
        cu_model = cuRandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        cuml_time, cuml_accuracy = _timed_cv_accuracy(cu_model, X, y)

        # log
        log_rows.append({
            'parcel': parcel_idx,
            'n_features': n_features,
            'n_informative': n_informative,
            'n_classes': n_classes,
            'n_samples': n_samples,
            'sklearn_time': sklearn_time,
            'sklearn_accuracy': sklearn_accuracy,
            'cuml_time': cuml_time,
            'cuml_accuracy': cuml_accuracy,
        })

# Keep rows already present in df (e.g. from an earlier run of this cell);
# concatenating onto an empty frame is skipped to avoid pandas' dtype warning.
new_rows = pd.DataFrame(log_rows, columns=df.columns)
df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)