# Create pseudo data to simulate different fMRI parcellations

In [1]:
# Assuming there are 500000 voxels in an fMRI scan
# We create 3 different parcellations:
# 1. 50 parcels, each parcel has 10000 voxels
# 2. 500 parcels, each parcel has 1000 voxels
# 3. 500 parcels, each parcel has different number of voxels, from 1 to 1000, but the total number of voxels is still 500000

# Pseudo data is created with cuML's make_classification function (the GPU counterpart of scikit-learn's)

import numpy as np
import cupy as cp
import pandas as pd
import time
import pickle

from cuml import make_classification

# hyperparameters
N_SAMPLES = 800
N_FEATURES = 500000  # total number of voxels (features) in the simulated scan
N_INFORMATIVE_RATIO = 0.01  # 1% of voxels are informative
N_CLASSES = 8
DATA_TYPE = np.float32  # set data type to float32 to leverage GPU


# Generate the full pseudo data set once.
# The feature count is named (N_FEATURES) so the informative-feature count is
# always derived from the same number, and DATA_TYPE is passed on explicitly
# (as its dtype name string) so the constant actually controls the generated data.
X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=int(N_FEATURES * N_INFORMATIVE_RATIO),
    n_classes=N_CLASSES,
    dtype=np.dtype(DATA_TYPE).name,
)


# Get k parcels(patches) with same number of voxels(features)
def get_parcels(X, y, k, dtype='cp'):
    """
    Split the feature axis of X into k contiguous parcels of equal width.

    X: numpy or cupy array, shape = (n_samples, n_features)
    y: numpy or cupy array, shape = (n_samples, )
    k: int, number of parcels
    dtype: 'cp' returns slices of X and y exactly as they were passed in;
           any other value returns host copies obtained through .get()
           (arrays without a .get() method, e.g. numpy arrays, are used as is)

    Returns a list of k tuples (X_parcel, y). Every parcel holds
    n_features // k columns, so the trailing n_features % k columns are
    not assigned to any parcel. The same y object is shared by all tuples.
    """
    # get number of voxels in each parcel (integer division, no float round trip)
    n_voxels_per_parcel = X.shape[1] // k
    keep_as_is = dtype == 'cp'

    # Convert y once up front instead of once per parcel; only objects that
    # expose .get() need (and support) the conversion.
    if keep_as_is or not hasattr(y, 'get'):
        y_out = y
    else:
        y_out = y.get()

    # get k parcels
    parcels = []
    for i in range(k):
        X_parcel = X[:, i * n_voxels_per_parcel:(i + 1) * n_voxels_per_parcel]
        if not keep_as_is and hasattr(X_parcel, 'get'):
            X_parcel = X_parcel.get()
        parcels.append((X_parcel, y_out))

    return parcels
    
    
    

# Get k parcels(patches) with random number of voxels(features) in each parcel
def get_parcels_diff(X, y, k, least_voxels_per_parcel=100, dtype='cp'):
    """
    Split the feature axis of X into k contiguous, non-overlapping parcels
    whose widths are drawn at random.

    X: numpy or cupy array, shape = (n_samples, n_features)
    y: numpy or cupy array, shape = (n_samples, )
    k: int, number of parcels
    least_voxels_per_parcel: int, the least number of voxels in each parcel
    dtype: 'cp' returns slices of X and y exactly as they were passed in;
           any other value returns host copies obtained through .get()
           (arrays without a .get() method, e.g. numpy arrays, are used as is)

    Returns a list of k tuples (X_parcel, y). Each width is drawn uniformly
    from [least_voxels_per_parcel, n_features // k), so the parcels never
    run past the last column; columns after the final parcel are unused.

    Raises ValueError (from np.random.randint) when
    least_voxels_per_parcel >= n_features // k.
    """
    max_voxels_per_parcel = X.shape[1] // k  # exclusive upper bound of a width
    keep_as_is = dtype == 'cp'

    # Convert y once; only objects exposing .get() need the conversion.
    if keep_as_is or not hasattr(y, 'get'):
        y_out = y
    else:
        y_out = y.get()

    # get k parcels
    parcels = []
    start = 0  # first column of the next parcel
    for _ in range(k):
        # Widths are slice bounds, i.e. plain host integers, so they are drawn
        # with numpy regardless of where X lives.
        n_voxels_per_parcel = int(
            np.random.randint(least_voxels_per_parcel, max_voxels_per_parcel)
        )
        stop = start + n_voxels_per_parcel
        X_parcel = X[:, start:stop]
        if not keep_as_is and hasattr(X_parcel, 'get'):
            X_parcel = X_parcel.get()
        parcels.append((X_parcel, y_out))
        # Continue where this parcel ended so parcels neither overlap nor skip columns.
        start = stop

    return parcels




# Compare the performance of sklearn and cuML (No parallelization)

In [2]:
# Compare the performance of sklearn and cuML by evaluating the accuracy of the model and 
# the time it takes to train the model of different parcellations


# Parcellations to benchmark: one list entry per parcellation scheme, each
# entry being whatever get_parcels returns for that scheme.
equal_size_parcels = get_parcels(X, y, 50)
parcellations = [equal_size_parcels]

# Performance log: starts out empty, with one column per logged quantity.
log_columns = [
    'parcel',
    'n_features',
    'n_informative',
    'n_classes',
    'n_samples',
    'sklearn_time',
    'sklearn_accuracy',
    'cuml_time',
    'cuml_accuracy',
]
df = pd.DataFrame(columns=log_columns)



In [3]:
# Train and evaluate with sklearn with cross validation
from sklearn.ensemble import RandomForestClassifier as skRandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from cuml.metrics import accuracy_score

from tqdm import tqdm


# Benchmark scikit-learn's random forest on every parcel with 5-fold cross validation.
# Rows are collected in a list and appended to the log with a single concat
# after the loop instead of rebuilding the DataFrame on every iteration.
sklearn_records = []
for i, parcellation in enumerate(parcellations):
    for X_parcel, y_parcel in tqdm(parcellation):
        # Copy the data to the host before starting the clock, so that
        # sklearn_time covers model fitting/scoring only and not the transfer.
        X_host = X_parcel.get()
        y_host = y_parcel.get()

        # sklearn
        # perf_counter is a monotonic clock meant for measuring intervals.
        start = time.perf_counter()
        sk_model = skRandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        scores = cross_validate(sk_model, X_host, y_host, cv=5, scoring='accuracy')
        sklearn_time = time.perf_counter() - start
        sklearn_accuracy = scores['test_score'].mean()

        # log
        n_parcel_features = X_parcel.shape[1]
        sklearn_records.append({
            'parcel': i,
            'n_features': n_parcel_features,
            # ratio-based estimate, not a count of the actual informative columns
            'n_informative': N_INFORMATIVE_RATIO * n_parcel_features,
            'n_classes': N_CLASSES,
            'n_samples': N_SAMPLES,
            'sklearn_time': sklearn_time,
            'sklearn_accuracy': sklearn_accuracy,
        })

df = pd.concat([df, pd.DataFrame.from_records(sklearn_records)], ignore_index=True)
        



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:25<00:00,  4.11s/it]


In [4]:


# Train and evaluate with cuML with cross validation
from cuml.ensemble import RandomForestClassifier as cuRandomForestClassifier
from sklearn.model_selection import KFold

from sklearn.model_selection import StratifiedKFold

# The sklearn benchmark calls cross_validate(..., cv=5) on a classifier, which
# makes scikit-learn use stratified folds. Use the same splitter here so both
# libraries are scored on the same kind of train/test splits.
kfold = StratifiedKFold(n_splits=5)

# Rows are collected in a list and appended to the log with a single concat.
cuml_records = []
for i, parcellation in enumerate(parcellations):
    for X_parcel, y_parcel in tqdm(parcellation):
        # The splitter inspects the label values, so it gets a host copy of y.
        # Only y determines the folds; X is a placeholder of matching length.
        y_host = cp.asnumpy(y_parcel)
        X_placeholder = np.zeros(len(y_host))

        # cuML
        fold_accuracy = []
        # perf_counter is a monotonic clock meant for measuring intervals.
        start = time.perf_counter()
        for train_idx, test_idx in kfold.split(X_placeholder, y_host):
            cu_model = cuRandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
            cu_model.fit(X_parcel[train_idx], y_parcel[train_idx])
            predictions = cu_model.predict(X_parcel[test_idx])
            fold_accuracy.append(float(accuracy_score(y_parcel[test_idx], predictions)))
        cuml_time = time.perf_counter() - start
        cuml_accuracy = float(np.mean(fold_accuracy))

        # log
        n_parcel_features = X_parcel.shape[1]
        cuml_records.append({
            'parcel': i,
            'n_features': n_parcel_features,
            # ratio-based estimate, not a count of the actual informative columns
            'n_informative': N_INFORMATIVE_RATIO * n_parcel_features,
            'n_classes': N_CLASSES,
            'n_samples': N_SAMPLES,
            'cuml_time': cuml_time,
            'cuml_accuracy': cuml_accuracy,
        })

df = pd.concat([df, pd.DataFrame.from_records(cuml_records)], ignore_index=True)

  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**kwargs)
  return func(**

In [5]:
# Write the collected benchmark rows to a CSV file (row index left out)
log_path = 'performance_log_no_parallel.csv'
df.to_csv(log_path, index=False)

# Compare the performance of sklearn and cuML (Parallelization)

In [6]:
#TODO: parallelize the training and evaluation process