In [2]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
from sklearn import datasets
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
import datetime
import seaborn as sns

WORK_SPACE = "/home/olle/PycharmProjects/LODE/workspace"

longitudinal_pd = pd.read_csv(os.path.join(WORK_SPACE, "sequence_data/longitudinal_data.csv"))
segmentation_statistics_pd = pd.read_csv(os.path.join(WORK_SPACE, "sequence_data/segmentation_statistics_vol.csv"))

id_cols = segmentation_statistics_pd.record.str.split("_", expand=True)[[0, 1 , 2, 3]]

segmentation_statistics_pd["patient_id"] = id_cols[0]
segmentation_statistics_pd["study_date"] = id_cols[1]
segmentation_statistics_pd["laterality"] = id_cols[2]

longitudinal_pd["study_date"] = longitudinal_pd.study_date.str.replace("-", "")

# cast data types
keys = ["patient_id", "study_date", "laterality"] 
for key in keys:
    longitudinal_pd[key] = longitudinal_pd[key].astype(str)
    segmentation_statistics_pd[key] = segmentation_statistics_pd[key].astype(str)
    
longitudinal_abt = pd.merge(longitudinal_pd, segmentation_statistics_pd, how="inner", 
                            left_on = keys, right_on = keys)

feature_columns = list(longitudinal_abt.columns[17:])

In [3]:
### split data based on group

In [4]:
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
X = longitudinal_abt[feature_columns]
y = longitudinal_abt.logMAR
groups = longitudinal_abt.patient_id

gss = GroupShuffleSplit(n_splits=1, train_size=.8, random_state=42)

train_idx, eval_idx = next(gss.split(X, y, groups))

train_x = X.iloc[train_idx]
train_y = y.iloc[train_idx]

test_x = X.iloc[eval_idx]
test_y = y.iloc[eval_idx]

train_x.shape, test_x.shape

((18549, 126), (4411, 126))

In [5]:
groups

0         34537
1         34537
2         34537
3         34537
4         34537
          ...  
22955    121331
22956    121685
22957    122659
22958    126011
22959    126011
Name: patient_id, Length: 22960, dtype: object

In [None]:
#### 

In [42]:
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR',LinearRegression())])))
pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()),('LASSO', Lasso())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingRegressor())])))

results = []
names = []
for name, model in pipelines:
    kfoldGroup = GroupKFold(n_splits=5)
    cv_results = cross_val_score(model, X, y, groups, cv=kfoldGroup, scoring='r2')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

ScaledLR: 0.362298 (0.043424)
ScaledLASSO: -0.004347 (0.003840)
ScaledEN: -0.004347 (0.003840)
ScaledKNN: 0.250898 (0.049855)
ScaledCART: -0.117768 (0.124853)
ScaledGBM: 0.444334 (0.029202)


In [None]:
#### perform grid search over GradientBooster

In [44]:
from sklearn.model_selection import GridSearchCV

scaler = StandardScaler().fit(train_x)
rescaledX = scaler.transform(train_x)
param_grid = dict(n_estimators=np.array([50,100,200,300,400]))
model = GradientBoostingRegressor(random_state=21)
kfold = KFold(n_splits=5)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=kfold)
grid_result = grid.fit(rescaledX, train_y)

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

0.449095 (0.044104) with: {'n_estimators': 50}
0.454284 (0.045447) with: {'n_estimators': 100}
0.451467 (0.046835) with: {'n_estimators': 200}
0.447170 (0.047009) with: {'n_estimators': 300}
0.443561 (0.047088) with: {'n_estimators': 400}
Best: 0.454284 using {'n_estimators': 100}


In [None]:
### finalize model

In [43]:
from sklearn.metrics import mean_absolute_error

scaler = StandardScaler().fit(train_x)
rescaled_X_train = scaler.transform(train_x)
model = GradientBoostingRegressor(random_state=21, n_estimators=100)
model.fit(rescaled_X_train, train_y)

# transform the validation dataset
rescaled_X_test = scaler.transform(test_x)
predictions = model.predict(rescaled_X_test)
print(mean_absolute_error(test_y, predictions))

0.2454949512099583
