In [2]:
from multiprocessing import Pool  # For parallel processing

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC


In [3]:
def preprocess_data(data):
  """Turn the raw table into a model-ready feature matrix and target.

  Steps, in order: drop identifier/bookkeeping columns, mean-impute
  AGE / PTEDUCAT / MMSE, one-hot encode the categorical columns,
  standardize AGE and MMSE, drop rows without an `imputed_genotype`
  value, then split off the `DX.bl` target.

  NOTE: the imputer, encoder and scaler are fitted on every row passed in.
  When this is called before the train/test split, statistics from the
  held-out rows influence the training features.

  Args:
    data: DataFrame holding the raw columns listed in `remove_columns`
      and `categorical_features` below, plus AGE, PTEDUCAT, MMSE,
      imputed_genotype and DX.bl. The caller's frame is not modified.

  Returns:
    (X, y): X is the feature DataFrame with string column labels (the
    one-hot columns are labelled "0", "1", ...); y is the `DX.bl` Series
    for the same rows.
  """
  # Identifier / bookkeeping columns that carry no predictive signal.
  remove_columns = ['directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Modality', 'Visit', 'Acq.Date', 'EXAMDATE', 'Dx Codes for Submission']
  print('Removing columns:', remove_columns)
  data = data.drop(columns=remove_columns)

  # Handle missing values in the numeric columns with the column mean.
  numeric_features = ["AGE", "PTEDUCAT", "MMSE"]
  imputer = SimpleImputer(strategy="mean")
  data[numeric_features] = imputer.fit_transform(data[numeric_features])

  # One-hot encode categorical features.
  categorical_features = ["PTGENDER", "PTETHCAT", "PTRACCAT", "APOE Genotype"]
  # The dense-output keyword was renamed between scikit-learn releases
  # (`sparse` -> `sparse_output`), so rely on the default output and
  # densify it here instead of passing either keyword.
  encoder = OneHotEncoder()
  encoded = encoder.fit_transform(data[categorical_features])
  if hasattr(encoded, "toarray"):
    encoded = encoded.toarray()
  # Reuse the input index: pd.concat(axis=1) aligns on index labels, so a
  # fresh 0..n-1 index would misalign rows of any frame that was filtered
  # or re-indexed before being passed in.
  encoded_frame = pd.DataFrame(encoded, index=data.index)
  data = pd.concat([data.drop(columns=categorical_features), encoded_frame], axis=1)

  # Feature scaling (consider for specific models)
  scaled_features = ["AGE", "MMSE"]
  scaler = StandardScaler()
  data[scaled_features] = scaler.fit_transform(data[scaled_features])

  # Rows without a genotype flag cannot be used downstream.
  data = data.dropna(subset=['imputed_genotype'])

  # Separate features (X) and target variable (y)
  X = data.drop(columns="DX.bl")
  y = data["DX.bl"]

  # The one-hot columns carry integer labels; scikit-learn expects all
  # column labels to be of one type, so make them all strings.
  X.columns = X.columns.astype(str)

  return X, y

# Read the raw table from disk.
data = pd.read_csv(
    "data/ADNI_Training_Q3_APOE_CollectionADNI1Complete 1Yr 1.5T_July22.2014.csv"
)

# Build features and target from a copy so `data` keeps its raw columns.
X, y = preprocess_data(data.copy())

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Removing columns: ['directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Modality', 'Visit', 'Acq.Date', 'EXAMDATE', 'Dx Codes for Submission']




In [4]:
# Per-column count of missing values remaining in the processed features.
X.isna().sum()

AGE                 0
PTEDUCAT            0
APOE4               0
MMSE                0
imputed_genotype    0
0                   0
1                   0
2                   0
3                   0
4                   0
5                   0
6                   0
7                   0
8                   0
9                   0
10                  0
11                  0
12                  0
13                  0
dtype: int64

In [5]:
# X = data
# Y = data['DX.bl']
# # del data

# remove_columns = list(X.columns)[0:9]
# remove_columns.append('Dx Codes for Submission')
# print('Removing columns:', remove_columns)

# X = X.drop(remove_columns, axis=1)

# features = list(X.columns)
# X.head(5)

In [6]:
# Raw column labels first, then a five-row preview of the processed features.
print(data.columns)
X.head(n=5)
# Optional target inspection:
# y.info()
# y.value_counts()

Index(['directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Modality', 'Visit',
       'Acq.Date', 'DX.bl', 'EXAMDATE', 'AGE', 'PTGENDER', 'PTEDUCAT',
       'PTETHCAT', 'PTRACCAT', 'APOE4', 'MMSE', 'imputed_genotype',
       'APOE Genotype', 'Dx Codes for Submission'],
      dtype='object')


Unnamed: 0,AGE,PTEDUCAT,APOE4,MMSE,imputed_genotype,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.917438,18.0,1,-2.703157,True,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-1.14778,10.0,0,0.031099,False,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.219929,16.0,0,0.812315,True,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.78275,13.0,0,-0.750117,True,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.189998,12.0,1,-1.140725,True,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
def train_stage1_model(X_train, y_train):
  """Grid-search a random forest for Stage 1 and return the winning estimator.

  Args:
    X_train: training feature matrix.
    y_train: training labels.

  Returns:
    The `best_estimator_` of a 5-fold GridSearchCV over tree count and depth.
  """
  forest_grid = {
      "n_estimators": [100, 200, 300],
      "max_depth": [5, 10, 15],
  }
  search = GridSearchCV(
      RandomForestClassifier(random_state=42),
      forest_grid,
      cv=5,
  )
  # fit() hands back the search object itself, so the lookup can be chained.
  return search.fit(X_train, y_train).best_estimator_

# Train the Stage 1 model on the training split; `model_stage1` holds the
# estimator returned by train_stage1_model.
model_stage1 = train_stage1_model(X_train, y_train)


In [8]:
# Spot-check: the Stage 1 prediction for the first row of the test split.
model_stage1.predict(X_test)[0]

'LMCI'

In [52]:
def _class_probability(model, row, label):
  """Return `model`'s predicted probability of `label` for the one-row matrix `row`."""
  classes = list(model.classes_)
  if label not in classes:
    raise ValueError(f"model was not trained on class {label!r}; known classes: {classes}")
  # predict_proba columns follow the order of `classes_`, so index by label
  # rather than by a fixed position.
  return float(model.predict_proba(row)[0, classes.index(label)])


def predict_stage2(datapoint, model_lmc, model_ad):
  """Label a single non-CN sample as "LMCI" or "AD".

  Args:
    datapoint: one sample's feature values (NumPy array, pandas Series or
      plain sequence).
    model_lmc: fitted classifier exposing `classes_` and `predict_proba`;
      supplies the probability of "LMCI".
    model_ad: fitted classifier exposing `classes_` and `predict_proba`;
      supplies the probability of "AD".

  Returns:
    "LMCI" when model_lmc's P(LMCI) is strictly greater than model_ad's
    P(AD), otherwise "AD".

  Raises:
    ValueError: if a model's `classes_` lacks the label it is asked about.
  """
  # np.asarray accepts Series and lists as well as arrays; the models expect
  # a 2-D (1, n_features) input.
  row = np.asarray(datapoint).reshape(1, -1)
  proba_lmc = _class_probability(model_lmc, row, "LMCI")
  proba_ad = _class_probability(model_ad, row, "AD")
  # Assign class based on higher probability
  return "LMCI" if proba_lmc > proba_ad else "AD"

def parallel_stage2_prediction(X_not_cn, model_lmc, model_ad):
  """Label every non-CN sample as "LMCI" or "AD" in one batched pass.

  Each model scores the whole matrix with a single `predict_proba` call.
  This replaces the earlier per-row process pool, which iterated the
  DataFrame directly (that yields column labels, not rows). The function
  name is kept so existing callers keep working.

  Args:
    X_not_cn: 2-D feature matrix (DataFrame or array), one row per sample.
    model_lmc: fitted classifier exposing `classes_` and `predict_proba`;
      supplies the probability of "LMCI".
    model_ad: fitted classifier exposing `classes_` and `predict_proba`;
      supplies the probability of "AD".

  Returns:
    A list with one label per input row: "LMCI" when model_lmc's P(LMCI) is
    strictly greater than model_ad's P(AD), otherwise "AD".

  Raises:
    ValueError: if a model's `classes_` lacks the label it is asked about.
  """
  # Nothing to score; estimators typically reject zero-row input.
  if len(X_not_cn) == 0:
    return []

  # predict_proba columns follow the order of `classes_`, so look the
  # column up by label instead of assuming a fixed position.
  lmci_column = list(model_lmc.classes_).index("LMCI")
  ad_column = list(model_ad.classes_).index("AD")

  proba_lmc = model_lmc.predict_proba(X_not_cn)[:, lmci_column]
  proba_ad = model_ad.predict_proba(X_not_cn)[:, ad_column]

  return ["LMCI" if p_lmc > p_ad else "AD" for p_lmc, p_ad in zip(proba_lmc, proba_ad)]

def train_stage2_models(X_train_not_cn, y_train_not_cn):
  """Grid-search the two Stage 2 SVMs and return their best estimators.

  Args:
    X_train_not_cn: training features for the non-CN samples.
    y_train_not_cn: matching training labels.

  Returns:
    (best_model_lmc, best_model_ad): the `best_estimator_` of each 5-fold
    grid search. Both are built with `probability=True` so they expose
    `predict_proba`, which the Stage 2 prediction step calls.

  NOTE(review): both searches currently use the same grid, data and seed,
  so the two returned models are trained identically — confirm whether they
  are meant to differ.
  """
  # Train separate models for LMCI and AD with hyperparameter tuning
  param_grid_lmc = {"kernel": ["linear", "rbf"], "C": [0.1, 1, 10]}
  param_grid_ad = {"kernel": ["linear", "rbf"], "C": [0.1, 1, 10]}
  # SVC only provides predict_proba when constructed with probability=True.
  model_lmc = GridSearchCV(SVC(probability=True, random_state=42), param_grid_lmc, cv=5)
  model_ad = GridSearchCV(SVC(probability=True, random_state=42), param_grid_ad, cv=5)
  model_lmc.fit(X_train_not_cn, y_train_not_cn)
  model_ad.fit(X_train_not_cn, y_train_not_cn)
  best_model_lmc = model_lmc.best_estimator_
  best_model_ad = model_ad.best_estimator_
  return best_model_lmc, best_model_ad

# Stage 2 trains only on the training rows whose label is not "CN".
train_not_cn = y_train != "CN"
model_lmc, model_ad = train_stage2_models(
    X_train[train_not_cn],
    y_train[train_not_cn],
)

# Label the test rows whose true diagnosis is not "CN" with the Stage 2 models.
test_not_cn = y_test != "CN"
y_pred_stage2 = parallel_stage2_prediction(X_test[test_not_cn], model_lmc, model_ad)


In [None]:
def evaluate_model(y_true, y_pred):
  """Print the accuracy and the classification report for a set of predictions.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, in the same order as `y_true`.
  """
  # Compute both metrics before printing anything, so a failure in either
  # one leaves no partial output.
  summary = (
      ("Accuracy:", accuracy_score(y_true, y_pred)),
      ("Classification Report:\n", classification_report(y_true, y_pred)),
  )
  for heading, value in summary:
    print(heading, value)

# Stage 1: score the model's predictions against every test label.
stage1_predictions = model_stage1.predict(X_test)
evaluate_model(y_test, stage1_predictions)

# Stage 2: score only the test rows whose true label is not "CN".
stage2_truth = y_test[y_test != "CN"]
evaluate_model(stage2_truth, y_pred_stage2)
