In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
from utils.data_loader import Dataset
from utils.helpers import * 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR 
import xgboost
from sklearn.ensemble import RandomForestRegressor

In [68]:
# Combine the two test-set cc CSV exports into a single file on disk.
cc_test_paths = ['./data/cc_testing_v2.csv', './data/cc_testing2_v2.csv']
cc1, cc2 = (pd.read_csv(path) for path in cc_test_paths)
# No `on=` is given, so pandas joins on every column name the two frames share;
# how='outer' keeps rows that appear in only one of them.
cc_merged = cc1.merge(cc2, how='outer')
cc_merged.to_csv('./data/cc_testing_merged.csv', index=False)

In [69]:
# Input files for each split; the btsv01 file is shared by all three.
btsv01_file = './data/btsv01_ALL.txt'
training_filenames = [
    './data/training_fluid_intelligenceV1.csv',
    btsv01_file,
    './data/cc_training_v2.csv',
]
validation_filenames = [
    './data/validation_fluid_intelligenceV1.csv',
    btsv01_file,
    './data/cc_validation_v2.csv',
]
# The test split lists no fluid-intelligence score file.
test_filenames = [btsv01_file, './data/cc_testing_merged.csv']

# Columns handed to Dataset as its drop-list.
cols_to_drop = [
    'btsv01_id', 'interview_date', 'collection_id', 'dataset_id',
    'collection_title', 'src_subject_id', 'gender',
]

# Name of the target column.
label_col = 'residual_fluid_intelligence_score'

In [70]:

# Build one Dataset (imported from utils.data_loader) per split from its file
# list, the shared drop-list and the label column name. Later cells read
# attributes such as .data, .labels, .subjects and .meta_data from these objects.
training = Dataset(training_filenames, cols_to_drop, label_col)
validation = Dataset(validation_filenames, cols_to_drop, label_col)
# test=True is passed only for the test split — presumably because that split
# carries no label column; confirm against Dataset's implementation.
test = Dataset(test_filenames, cols_to_drop, label_col, test=True)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [72]:
# Fit the standardizer on the training features only, then apply that same
# fitted transform to the validation and test features.
scaler = StandardScaler()
train_data = scaler.fit_transform(training.data)
val_data = scaler.transform(validation.data)
test_data = scaler.transform(test.data)
# Reference to the column-name collection held in training.meta_data (no copy is made).
dataset_cols = training.meta_data['final_dataset']['columns']

In [73]:
# generating custom columns
feature_names = training.meta_data['final_dataset']['columns']
n_features = train_data.shape[1]

# Column positions whose name mentions the given structure.
is_frontal = np.array([idx for idx in range(n_features) if 'frontal' in feature_names[idx]])
is_suptent = np.array([idx for idx in range(n_features) if 'supratentorium' in feature_names[idx]])

# A column is kept as "cortex" only when its name contains none of these substrings.
structures_to_delete = ['interview_age', 'thalamus', 'caudate', 'putamen', 'pallidum', 'volume', 'wm', 'supratentorium', 'csf']

# first 123 are in the initial dataset
cortex_indices = np.array([
    position
    for position, col_name in enumerate(feature_names[:123])
    if not any(structure in col_name for structure in structures_to_delete)
])
def generate_frontal_ratio(frontal_inds, reference_inds, data):
    """Return, for every observation, sum(frontal columns) / sum(reference columns).

    Parameters
    ----------
    frontal_inds : array-like of int
        Column positions summed to form the numerator.
    reference_inds : array-like of int
        Column positions summed to form the denominator.
    data : array-like, shape (n_observations, n_features)
        One row per observation; nested lists are accepted.

    Returns
    -------
    numpy.ndarray, shape (n_observations, 1)
        Column vector of ratios. A zero denominator produces inf/nan under
        NumPy's floating-point rules instead of raising.
    """
    data = np.asarray(data)
    if data.shape[0] == 0:
        # No observations: keep the (0, 1) float result the row loop used to give.
        return np.empty((0, 1))
    # Cast explicitly: an empty selection built with np.array([]) is float64,
    # which NumPy refuses to use as an index.
    frontal_inds = np.asarray(frontal_inds, dtype=np.intp)
    reference_inds = np.asarray(reference_inds, dtype=np.intp)
    # Row-wise sums over the selected columns, computed for all rows at once.
    frontal_volume = data[:, frontal_inds].sum(axis=1)
    reference_volume = data[:, reference_inds].sum(axis=1)
    return (frontal_volume / reference_volume).reshape(-1, 1)


# Standardize each derived ratio with its own scaler, fitted on the training
# split and then applied to validation and test. Dedicated scaler objects are
# used so that `scaler` keeps the fit it was given on the full feature matrix;
# refitting it here on a single column would make any later
# scaler.transform(<full feature matrix>) fail or mis-scale.
# NOTE(review): train_data / val_data / test_data are already standardized at
# this point, so the ratios are formed from z-scores — confirm that is intended.
suptent_ratio_scaler = StandardScaler()
frontal_suptent_train = suptent_ratio_scaler.fit_transform(generate_frontal_ratio(is_frontal, is_suptent, train_data))
frontal_suptent_val = suptent_ratio_scaler.transform(generate_frontal_ratio(is_frontal, is_suptent, val_data))
frontal_suptent_test = suptent_ratio_scaler.transform(generate_frontal_ratio(is_frontal, is_suptent, test_data))

cortex_ratio_scaler = StandardScaler()
frontal_cortex_train = cortex_ratio_scaler.fit_transform(generate_frontal_ratio(is_frontal, cortex_indices, train_data))
frontal_cortex_val = cortex_ratio_scaler.transform(generate_frontal_ratio(is_frontal, cortex_indices, val_data))
frontal_cortex_test = cortex_ratio_scaler.transform(generate_frontal_ratio(is_frontal, cortex_indices, test_data))

# append to cols and to data
# NOTE: dataset_cols is the very list stored in training.meta_data, so the
# extend below also lengthens the metadata's column list in place; running
# this cell a second time appends the two names (and the two columns) again.
dataset_cols = training.meta_data['final_dataset']['columns']
dataset_cols.extend(['frontal_suptent_ratio', 'frontal_cortex_ratio'])
# The two ratio columns go on the right-hand side, in the same order as the names above.
train_data = np.concatenate((train_data, frontal_suptent_train, frontal_cortex_train), axis=1)
val_data = np.concatenate((val_data, frontal_suptent_val, frontal_cortex_val), axis=1)
test_data = np.concatenate((test_data, frontal_suptent_test, frontal_cortex_test), axis=1)

In [106]:
import datetime

# NOTE: the tag embeds today's date, so the three files below are only found
# when they were written on the same calendar day this cell is run.
run_suffix = "_default+cc"
today = datetime.date.today().isoformat() + run_suffix

results_file = './data/results_{}.pkl'.format(today)
columns_variants = './data/variants_mapping_{}.pkl'.format(today)
features_imp_file = './data/feature_importance_{}randfor.pkl'.format(today)

# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
results_df = pd.read_pickle(results_file)
cols_variants = pd.read_pickle(columns_variants)
feature_imp_df = pd.read_pickle(features_imp_file)

In [107]:
# Scan every (feature combination, model) score and keep the smallest one.
# Entries of results_df that are not dicts are skipped.
best_features_comb = 0
# Start from +inf rather than a fixed number so the minimum is found even when
# every stored score is large.
best_val = float('inf')
best_model = None
for feature_comb in results_df.keys():
    model_scores = results_df[feature_comb]
    if not isinstance(model_scores, dict):
        continue
    # The loop variable is deliberately not called `model`: that name is bound
    # to the fitted estimator elsewhere in the notebook and must not be
    # overwritten when this cell is re-run.
    for model_name, score in model_scores.items():
        # Strict '<' keeps the first entry encountered when scores tie.
        if score < best_val:
            best_val = score
            best_model = model_name
            best_features_comb = feature_comb

In [108]:
# Show the winning feature-combination key, its score and the model key.
print(best_features_comb, best_val, best_model)

3 70.98327127079955 randfor


In [112]:
# get the columns
# NOTE(review): valid_inds is assigned here but the fit/predict cell visible in
# this notebook indexes with inds_restricted only — confirm that is intended.
valid_inds = cols_variants[best_features_comb]

# Keep the features whose stored importance is at least this threshold,
# ordered from most to least important.
importance_threshold = 0.01
n_best = int((feature_imp_df['feature_importance'] >= importance_threshold).sum())
inds_restricted = np.argsort(-feature_imp_df['feature_importance'])[:n_best]

model = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=42)

In [113]:
# Train on the selected feature columns, then score the validation and test
# splits using the same column selection.
train_selected = train_data[:, inds_restricted]
model.fit(train_selected, training.labels)
val_predictions = model.predict(val_data[:, inds_restricted])
test_predictions = model.predict(test_data[:, inds_restricted])

In [128]:
# Error of the refit model on the validation split. `mse` is not defined in
# this notebook — presumably it comes from `from utils.helpers import *`.
mse(val_predictions, validation.labels)

68.40265187472134

In [117]:
# One row per subject: its id and the model's predicted score.
# Each frame is built from a column -> values mapping so the ids and the float
# predictions keep their own dtypes; stacking both into a single ndarray first
# would force them to one common dtype.
result_columns = ['subject', 'predicted_score']
test_results_df = pd.DataFrame(
    {'subject': test.subjects.values, 'predicted_score': test_predictions},
    columns=result_columns,
)
val_results_df = pd.DataFrame(
    {'subject': validation.subjects.values, 'predicted_score': val_predictions},
    columns=result_columns,
)

In [120]:
# Template CSVs that the predictions are merged into further down.
test_template_path = './data/abcdnp_testing_template.csv'
val_template_path = './data/pred_validation_template.csv'
test_template = pd.read_csv(test_template_path)
val_template = pd.read_csv(val_template_path)

In [None]:
# Guard for the to_csv calls in the following cells: nothing is written to disk unless this is True.
save = False

In [147]:
# Attach predictions to the template rows by subject. On a column-name clash
# the template's column receives the '_' suffix and the prediction keeps its
# name, so dropping every column that ends in '_' leaves the predicted values.
# how='inner' keeps only subjects present in both frames.
test_final = test_template.merge(test_results_df, on='subject', how='inner', suffixes=('_', ''))
stale_test_cols = [col for col in test_final.columns if col.endswith('_')]
test_final = test_final.drop(stale_test_cols, axis=1)
if save:
    # Writes back to the same path the test template was read from.
    test_final.to_csv('./data/abcdnp_testing_template.csv', index=False)

In [148]:
# Same procedure for the validation split: merge by subject (pandas' default
# join type applies since `how` is not given), then discard the '_'-suffixed
# columns that came from the template side.
val_final = val_template.merge(val_results_df, on='subject', suffixes=('_', ''))
stale_val_cols = [col for col in val_final.columns if col.endswith('_')]
val_final = val_final.drop(stale_val_cols, axis=1)
if save:
    # Writes back to the same path the validation template was read from.
    val_final.to_csv('./data/pred_validation_template.csv', index=False)