In [21]:
import os
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from pathlib import Path

tadpoleD1D2File = Path("../data/TADPOLE_D1_D2.csv")

class SimpleSVM:
    def __init__(self):
        self.diagnosis_model = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', svm.SVC(kernel="linear", probability=True)),
        ])
        self.adas_model = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', svm.SVR(kernel="linear")),
        ])
        self.ventricles_model = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', svm.SVR(kernel="linear")),
        ])
    
    def train_model(self, model, train_df, X_train, var_name):
        # remove rows with NaN future values
        Y_train_var = train_df[var_name]
        not_nans = np.logical_not(np.isnan(Y_train_var))
        X_train_var = X_train[not_nans]
        Y_train_var = Y_train_var[not_nans]
        
        model.fit(X_train_var, Y_train_var)
    
    def pre_process(self, train_df):
        train_df = train_df.copy()
        train_df.replace({'DXCHANGE': {4: 2, 5: 3, 6: 3, 7: 1, 8: 2, 9: 1}})
        train_df = train_df.rename(columns={"DXCHANGE": "Diagnosis"})

        # Sort the dataframe based on age for each subject
        train_df = train_df.sort_values(by=['RID', 'Years_bl'])
        
        train_df["Ventricles_ICV"] = train_df["Ventricles"].values / train_df["ICV_bl"].values
        
        # Select features
        train_df = train_df[
            ["RID", "Diagnosis", "ADAS13", "Ventricles_ICV"]
        ]

        # Force values to numeric
        train_df = train_df.astype("float64", errors='ignore')
        
        # Get future value from each row's next row
        for predictor in ["Diagnosis", "ADAS13", 'Ventricles_ICV']:
            train_df["Future_"+predictor] = np.append(train_df[predictor].values[1:], np.NaN)
        
        # Drop each last row per patient
        train_df = train_df.drop(train_df.groupby('RID').tail(1).index.values)
        return train_df
    
    def train(self, train_df: pd.DataFrame):
        train_df = self.pre_process(train_df)
        
        # Select columns for training
        X_train = train_df[["Diagnosis", "ADAS13", "Ventricles_ICV"]]
        
        # fill NaNs with mean
        X_train = X_train.fillna(X_train.mean())
        
        self.train_model(self.diagnosis_model,  train_df, X_train, "Future_Diagnosis")
        self.train_model(self.adas_model,       train_df, X_train, "Future_ADAS13")
        self.train_model(self.ventricles_model, train_df, X_train, "Future_Ventricles_ICV")

    def predict(predict_df: pd.DataFrame):
        pass


df = pd.read_csv(tadpoleD1D2File)

model = SimpleSVM()
model.train(df)

  interactivity=interactivity, compiler=compiler, result=result)


In [22]:
model.adas_model.predict([[1.0, 18.67, 0.059573]])

array([19.70281387])

In [23]:
model.pre_process(df)

Unnamed: 0,RID,Diagnosis,ADAS13,Ventricles_ICV,Future_Diagnosis,Future_ADAS13,Future_Ventricles_ICV
0,2.0,1.0,18.67,0.059573,1.0,19.67,
5723,2.0,1.0,19.67,,1.0,20.00,
5724,2.0,1.0,20.00,,1.0,23.00,
5725,2.0,1.0,23.00,,,,
5726,2.0,,,,1.0,21.00,
5727,2.0,1.0,21.00,,,,
5728,2.0,,,,4.0,14.00,
5729,2.0,4.0,14.00,,,,
5730,2.0,,,,1.0,18.00,
5731,2.0,1.0,18.00,,,,
