In [13]:
# ==============================================================================
# title           : Prediction models
# description     : Fit logistic-regression prediction models on simulated cohort data and save the predictions
# author          : V.-R. Bourque
# date            : 2024-12
# version         : 1
# usage           : Python 3.11.10
# notes           : example with simulated data
# ==============================================================================

In [14]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

from train_models import train_k_fold_eval
# Seed NumPy's legacy global random state so that re-running the notebook
# from a fresh kernel draws the same random numbers.
# NOTE(review): presumably simulate() relies on this global state - confirm.
np.random.seed(123)

In [15]:
# Load data
from simulation_dataset import simulate

# Cohort labels and the sample sizes handed to the simulator
# (two lists of equal length, passed in the same order).
cohort_names = ['SSC', 'MSSNG', 'SPARK']
cohort_sizes = [500, 500, 5000]
df = simulate(cohorts=cohort_names, sample_sizes=cohort_sizes)

In [16]:
# List all models
# Each model uses a longer prefix of the same ordered predictor list,
# so every model contains all predictors of the previous one.
all_vars = ['sex', 'PGS', 'DEL', 'DUP', 'LOF', 'MIS']
var_list = [all_vars[:n_vars] for n_vars in (2, 4, 6)]

In [17]:
# Define training and testing datasets
# SPARK rows form the training frame; SSC and MSSNG rows form the evaluation frame.
# reset_index() keeps the previous index as an extra 'index' column.
is_train = df['Cohort'] == 'SPARK'
is_eval = df['Cohort'].isin(['SSC', 'MSSNG'])
df_train = df[is_train].reset_index()
df_eval = df[is_eval].reset_index()

# Classifier: logistic regression without a penalty term
clf = LogisticRegression(penalty=None, max_iter=1000, random_state=0)

# Train all models and predict
# The 'sex'-only call provides the base table; each later call contributes
# one column named 'mod_' followed by its variables joined with '+'.
ndf = train_k_fold_eval(df_train, df_eval, clf, ['sex'])
for var in var_list:
    print(var)
    model_col = ['mod_' + '+'.join(var)]
    model_out = train_k_fold_eval(df_train, df_eval, clf, var)
    ndf[model_col] = model_out[model_col]

['sex', 'PGS']
['sex', 'PGS', 'DEL', 'DUP']
['sex', 'PGS', 'DEL', 'DUP', 'LOF', 'MIS']


In [18]:
# Save
# Write the prediction table as CSV into the project's 'data' folder.
filepath = "~/Documents/prediction_in_autism/package_version/"
output_dir = Path(filepath).expanduser() / 'data'
# to_csv does not create missing folders, so make sure the target exists first.
output_dir.mkdir(parents=True, exist_ok=True)
ndf.to_csv(output_dir / 'simulated_data_predictions.csv')