# Create pseudo data to stimulate different fMRI parcellations

In [1]:
# Assuming there are 500000 voxels in a fMRI scan
# We create 3 different parcellations:
# 1. 50 parcels, each parcel has 10000 voxels
# 2. 500 parcels, each parcel has 1000 voxels
# 3. 500 parcels, each parcel has different number of voxels, from 1 to 1000, but the total number of voxels is still 500000

# Pseudo data is created by sci-kit learn make_classification function

import numpy as np
import pandas as pd
import time
import pickle

from sklearn.datasets import make_classification

# hyperparameters
N_SAMPLES = 800*5
N_INFORMATIVE_RATIO = 0.01 # 1% of voxels are informative
N_CLASSES = 8
DATA_TYPE = np.float32 # set data type to float32 to leaverage GPU


# Create 50 parcels, each parcel has 10000 voxels
parcel_50 = []
for i in range(50):
    X, y = make_classification(n_samples=N_SAMPLES, 
                                n_features=10000, 
                                n_informative=int(10000*N_INFORMATIVE_RATIO),
                                n_classes=N_CLASSES,
    )
    parcel_50.append((X.astype(DATA_TYPE),y.astype(DATA_TYPE)))

# Create 500 parcels, each parcel has 1000 voxels
parcel_500 = []
for i in range(500):
    X, y = make_classification(n_samples=N_SAMPLES, 
                                n_features=1000, 
                                n_informative=int(1000*N_INFORMATIVE_RATIO),
                                n_classes=N_CLASSES,
    )
    parcel_500.append((X.astype(DATA_TYPE),y.astype(DATA_TYPE)))
    
                                          

# Create 500 parcels, each parcel has different number of voxels(features), from 1 to 1000, but the total number of voxels(features) is still 500000
parcel_500_diff = []

# get random number of features for each parcel, sum for all parcels is 500000
random_features_num = np.random.randint(500, 1000, 500)
random_features_num = random_features_num / sum(random_features_num) * 500000
random_features_num = random_features_num.astype(int)
print('Total number of voxels is: ', sum(random_features_num))

for i in range(500):

    X, y = make_classification(n_samples=N_SAMPLES,
                                 n_features=random_features_num[i],
                                    n_informative=int(random_features_num[i]*N_INFORMATIVE_RATIO),
                                    n_classes=N_CLASSES,
    )
    parcel_500_diff.append((X.astype(DATA_TYPE),y.astype(DATA_TYPE)))      

# Save the data
with open('parcel_50.pickle', 'wb') as f:
    pickle.dump(parcel_50, f)
with open('parcel_500.pickle', 'wb') as f:
    pickle.dump(parcel_500, f)
with open('parcel_500_diff.pickle', 'wb') as f:
    pickle.dump(parcel_500_diff, f)
    

    


Total number of voxels is:  499751


: 

: 

# Compare the performance of sklearn and cuML (No parallelization)

In [None]:
# Compare the performance of sklearn and cuML by evaluating the accuracy of the model and 
# the time it takes to train the model of different parcellations

# Load the data
with open('parcel_50.pickle', 'rb') as f:
    parcel_50 = pickle.load(f)
with open('parcel_500.pickle', 'rb') as f:
    parcel_500 = pickle.load(f)
with open('parcel_500_diff.pickle', 'rb') as f:
    parcel_500_diff = pickle.load(f)

# Create a list of different parcellations
parcellations = [parcel_50, parcel_500, parcel_500_diff]
parcellations = [parcel_50]

# Performance log
df = pd.DataFrame(columns=['parcel', 'n_features', 'n_informative', 'n_classes', 'n_samples', 'sklearn_time', 'sklearn_accuracy', 'cuml_time', 'cuml_accuracy'])



In [None]:
# Train and evaluate with sklearn with cross validation
from sklearn.ensemble import RandomForestClassifier as skRandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from cuml.metrics import accuracy_score

from tqdm import tqdm


for i in range(len(parcellations)):
    for j in tqdm(range(len(parcellations[i]))):
        # sklearn
        start = time.time()
        sk_model = skRandomForestClassifier(n_estimators=100, max_depth=2, random_state=0, n_jobs=-1)
        scores = cross_validate(sk_model, parcellations[i][j][0], parcellations[i][j][1], cv=5, scoring='accuracy')
        end = time.time()
        sklearn_time = end - start
        sklearn_accuracy = scores['test_score'].mean()

        
        # log
        df = pd.concat([df, pd.DataFrame.from_records([{'parcel': i,
                                            'n_features': parcellations[i][j][0].shape[1],
                                            'n_informative': N_INFORMATIVE_RATIO*parcellations[i][j][0].shape[1],
                                            'n_classes': N_CLASSES,
                                            'n_samples': N_SAMPLES,
                                            'sklearn_time': sklearn_time,
                                            'sklearn_accuracy': sklearn_accuracy,
                                            }])], ignore_index=True)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:01<00:00,  1.24s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [08:32<00:00,  1.03s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [08:25<00:00,  1.01s/it]


In [None]:
# Train and evaluate with cuML with cross validation
from cuml.ensemble import RandomForestClassifier as cuRandomForestClassifier
from sklearn.model_selection import KFold
import cupy as cp
import cudf

# Transform data to cp.ndarray
for parcel in parcellations:
    for i in range(len(parcel)):
        parcel[i] = (cp.array(parcel[i][0]), cp.array(parcel[i][1]))

kfold = KFold(n_splits=5)

for i in range(len(parcellations)):
    for j in tqdm(range(len(parcellations[i]))):

        # cuML
        fold_accuracy = cp.array([])
        start = time.time()
        for train_idx, test_idx in kfold.split(X=parcellations[i][j][0], y=parcellations[i][j][1]):
            cu_model = cuRandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
            cu_model.fit(parcellations[i][j][0][train_idx], parcellations[i][j][1][train_idx])
            fold_accuracy = cp.append(fold_accuracy, accuracy_score(parcellations[i][j][1][test_idx], cu_model.predict(parcellations[i][j][0][test_idx])))
        end = time.time()
        cuml_time = end - start
        cuml_accuracy = cp.asnumpy(fold_accuracy).mean()
        
        # log
        df = pd.concat([df, pd.DataFrame.from_records([{'parcel': i,
                            'n_features': parcellations[i][j][0].shape[1],
                            'n_informative': N_INFORMATIVE_RATIO*parcellations[i][j][0].shape[1],
                            'n_classes': N_CLASSES,
                            'n_samples': N_SAMPLES,
                            'cuml_time': cuml_time,
                            'cuml_accuracy': cuml_accuracy,
                            }])], ignore_index=True)

  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**

In [None]:
# Save the log
df.to_csv('performance_log_no_parallel.csv', index=False)

# Compare the performance of sklearn and cuML (Parallelization)

In [None]:
#TODO: parallelize the training and evaluation process