In [1]:
import pandas as pd
import warnings

#warnings.filterwarnings("ignore")

from epilepsy_prediction import load_prediction_model, data_preprocess,fuse_string,evaluate_model,load_imputation_model
from epilepsy_prediction.imputation import column_imputer
from copy import deepcopy
from pandas import read_csv
from pandas import DataFrame
from pandas import melt
from pandas import merge
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression

def load_data(change_name_file,data_file_name,outcome_col="outcome"):
    """Read the data table, rename its columns and split it into train/test parts.

    Parameters
    ----------
    change_name_file : str or path
        CSV read with its first column as the index; it must have a
        ``marketscan`` column. The mapping index -> ``marketscan`` is used to
        rename the columns of the data table.
    data_file_name : str or path
        CSV with the data, read with its first column as the index. After
        renaming it must contain ``test_train`` and ``outcome_col``.
    outcome_col : str
        Name of the label column.

    Returns
    -------
    X_train, y_train, X_test, y_test
        Rows whose ``test_train`` value equals ``"train"`` form the training
        part; every other row goes to the test part. The feature frames hold
        all columns except ``outcome_col`` and ``test_train``.
    """
    raw_df = pd.read_csv(data_file_name, index_col=0)
    column_map = pd.read_csv(change_name_file, index_col=0)['marketscan'].to_dict()
    data_df = raw_df.rename(columns=column_map)

    # Boolean masks: which rows are training rows, which columns are features.
    train_rows = data_df["test_train"] == "train"
    test_rows = ~train_rows
    feature_cols = ~data_df.columns.isin([outcome_col, "test_train"])

    X_train = data_df.loc[train_rows, feature_cols]
    y_train = data_df.loc[train_rows, outcome_col]
    X_test = data_df.loc[test_rows, feature_cols]
    y_test = data_df.loc[test_rows, outcome_col]
    return X_train,y_train,X_test,y_test

# Names of the six drug columns (not referenced by the cells visible in this notebook).
variables = ["carba", "valpo", "topi", "lamo", "oxca", "leve"]

def get_one_hot_column(Xs, y_test, col_name):
    """Return the rows of ``Xs`` whose ``col_name`` value equals 1, with their labels.

    Parameters
    ----------
    Xs : DataFrame containing the column ``col_name``.
    y_test : Series that is indexed with the index of the selected rows.
    col_name : name of the column to filter on.

    Returns
    -------
    (test_set, y_set) : the selected rows of ``Xs`` and ``y_test`` looked up by
        the index of those rows.
    """
    is_flagged = Xs[col_name] == 1
    test_set = Xs.loc[is_flagged]
    return test_set, y_test[test_set.index]


## all drugs, outcome any

In [2]:
# Inputs for this section: column-name dictionary and the "all drugs" train/test table.
# NOTE(review): absolute local paths; consider a configurable data directory.
change_name_file = r"C:\\Users\Anastassia Kolde\\Documents\\Israel secondment\\dictionary_19.01.23.csv"
data_file_name = r"C:\\Users\Anastassia Kolde\\Documents\\Israel secondment\\Data\test_train_drugs_all.csv"
X_train, y_train, X_test, y_test = load_data(
    change_name_file, data_file_name, outcome_col="outcome_any"
)

# Last six columns of X_train (which still contains "outcome_primary" at this point);
# presumably the six drug indicator columns — TODO confirm.
cols = X_train.columns[-6:]

# Feature matrices without the other outcome column, rows in index order.
X = X_train.drop(columns="outcome_primary").sort_index()
Xs = X_test.drop(columns="outcome_primary").sort_index()

# One row per drug column: 'count' is the column sum over the training rows and
# 'weight' is a copy of 'count'.
# NOTE(review): with 0/1 indicators this gives rows of more common drugs larger
# weights — confirm this is intended rather than an inverse-frequency weight.
stbl = (
    X_train[cols].sum()
    .rename_axis('drug')
    .reset_index(name='count')
    .assign(weight=lambda tbl: tbl['count'])
)

# Long format (id, drug) for every indicator equal to 1, joined to the per-drug weight.
# Relies on the index of X_train being named 'id'.
wtbl = (
    melt(X_train[cols].reset_index(), id_vars=['id'], var_name='drug')
    .loc[lambda long_df: long_df['value'] == 1, ['id', 'drug']]
    .merge(stbl, on='drug', how='left')
    .loc[:, ['id', 'weight']]
    .set_index('id')
)

# Weights looked up by the (sorted) training ids so they line up with X.
weights = wtbl['weight'].loc[X.index.values]

# XGBoost classifier with default settings, fitted with the per-row weights.
xgbc = XGBClassifier()
model_xg = xgbc.fit(X, y_train.loc[X.index], sample_weight=weights)

# Logistic-regression alternative (disabled):
# lr = LogisticRegression()
# model_lr = lr.fit(X, y_train.loc[X.index], sample_weight=weights)

In [3]:
# AUC on the whole test split.
# Scores are column 1 of predict_proba (second class column; presumably the positive
# outcome — confirm against model_xg.classes_). Labels are looked up with Xs.index so
# they follow the same row order as Xs.
probs_xg = model_xg.predict_proba(Xs)[:, 1]
auc_xg = roc_auc_score(y_true=y_test.loc[Xs.index], y_score=probs_xg)
auc_xg

0.5421315404074025

In [4]:
# AUC factorization: AUC of the fitted model on each single-drug slice of the test set.
# The tuple order is the order in which the AUC values are printed, one per line.
drug_print_order = ("carba", "valpo", "lamo", "leve", "topi", "oxca")

auc_by_drug = {}
for drug in drug_print_order:
    # Test rows whose indicator for this drug equals 1, with the matching labels.
    drug_X, drug_y = get_one_hot_column(Xs, y_test, drug)
    # Column 1 of predict_proba is used as the score, as for the overall test AUC.
    drug_probs = model_xg.predict_proba(drug_X)[:, 1]
    auc_by_drug[drug] = roc_auc_score(drug_y, drug_probs)

# Print only after every AUC has been computed, so an error leaves no partial output.
for drug in drug_print_order:
    print(auc_by_drug[drug])

0.5461609620721554
0.5538061158100195
0.5055495882563552
0.6340909090909091
0.7843137254901961
0.4705882352941176


## all drugs, outcome primary

In [5]:
# Inputs for this section: column-name dictionary and the "all drugs" train/test table.
# NOTE(review): absolute local paths; consider a configurable data directory.
change_name_file = r"C:\\Users\Anastassia Kolde\\Documents\\Israel secondment\\dictionary_19.01.23.csv"
data_file_name = r"C:\\Users\Anastassia Kolde\\Documents\\Israel secondment\\Data\test_train_drugs_all.csv"
X_train, y_train, X_test, y_test = load_data(
    change_name_file, data_file_name, outcome_col="outcome_primary"
)

# Last six columns of X_train (which still contains "outcome_any" at this point);
# presumably the six drug indicator columns — TODO confirm.
cols = X_train.columns[-6:]

# Feature matrices without the other outcome column, rows in index order.
X = X_train.drop(columns="outcome_any").sort_index()
Xs = X_test.drop(columns="outcome_any").sort_index()

# One row per drug column: 'count' is the column sum over the training rows and
# 'weight' is a copy of 'count'.
# NOTE(review): with 0/1 indicators this gives rows of more common drugs larger
# weights — confirm this is intended rather than an inverse-frequency weight.
stbl = (
    X_train[cols].sum()
    .rename_axis('drug')
    .reset_index(name='count')
    .assign(weight=lambda tbl: tbl['count'])
)

# Long format (id, drug) for every indicator equal to 1, joined to the per-drug weight.
# Relies on the index of X_train being named 'id'.
wtbl = (
    melt(X_train[cols].reset_index(), id_vars=['id'], var_name='drug')
    .loc[lambda long_df: long_df['value'] == 1, ['id', 'drug']]
    .merge(stbl, on='drug', how='left')
    .loc[:, ['id', 'weight']]
    .set_index('id')
)

# Weights looked up by the (sorted) training ids so they line up with X.
weights = wtbl['weight'].loc[X.index.values]

# XGBoost classifier with default settings, fitted with the per-row weights.
xgbc = XGBClassifier()
model_xg = xgbc.fit(X, y_train.loc[X.index], sample_weight=weights)

# Logistic-regression alternative (disabled):
# lr = LogisticRegression()
# model_lr = lr.fit(X, y_train.loc[X.index], sample_weight=weights)

In [6]:
# AUC on the whole test split.
# Scores are column 1 of predict_proba (second class column; presumably the positive
# outcome — confirm against model_xg.classes_). Labels are looked up with Xs.index so
# they follow the same row order as Xs.
probs_xg = model_xg.predict_proba(Xs)[:, 1]
auc_xg = roc_auc_score(y_true=y_test.loc[Xs.index], y_score=probs_xg)
auc_xg

0.5830064402810304

In [7]:
# AUC factorization: AUC of the fitted model on each single-drug slice of the test set.
# The tuple order is the order in which the AUC values are printed, one per line.
drug_print_order = ("carba", "valpo", "lamo", "leve", "topi", "oxca")

auc_by_drug = {}
for drug in drug_print_order:
    # Test rows whose indicator for this drug equals 1, with the matching labels.
    drug_X, drug_y = get_one_hot_column(Xs, y_test, drug)
    # Column 1 of predict_proba is used as the score, as for the overall test AUC.
    drug_probs = model_xg.predict_proba(drug_X)[:, 1]
    auc_by_drug[drug] = roc_auc_score(drug_y, drug_probs)

# Print only after every AUC has been computed, so an error leaves no partial output.
for drug in drug_print_order:
    print(auc_by_drug[drug])

0.6147876393396359
0.6323979591836735
0.6498084291187739
0.6333333333333333
0.1454545454545455
0.4377062706270627


## 1st index, outcome any

In [8]:
# Inputs for this section: column-name dictionary and the "1st index" train/test table.
# NOTE(review): absolute local paths; consider a configurable data directory.
change_name_file = r"C:\\Users\Anastassia Kolde\\Documents\\Israel secondment\\dictionary_19.01.23.csv"
data_file_name = r"C:\\Users\Anastassia Kolde\\Documents\\Israel secondment\\Data\test_train_drugs_1st_index.csv"
X_train, y_train, X_test, y_test = load_data(
    change_name_file, data_file_name, outcome_col="outcome_any"
)

# Last six columns of X_train (which still contains "outcome_primary" at this point);
# presumably the six drug indicator columns — TODO confirm.
cols = X_train.columns[-6:]

# Feature matrices without the other outcome column, rows in index order.
X = X_train.drop(columns="outcome_primary").sort_index()
Xs = X_test.drop(columns="outcome_primary").sort_index()

# One row per drug column: 'count' is the column sum over the training rows and
# 'weight' is a copy of 'count'.
# NOTE(review): with 0/1 indicators this gives rows of more common drugs larger
# weights — confirm this is intended rather than an inverse-frequency weight.
stbl = (
    X_train[cols].sum()
    .rename_axis('drug')
    .reset_index(name='count')
    .assign(weight=lambda tbl: tbl['count'])
)

# Long format (id, drug) for every indicator equal to 1, joined to the per-drug weight.
# Relies on the index of X_train being named 'id'.
wtbl = (
    melt(X_train[cols].reset_index(), id_vars=['id'], var_name='drug')
    .loc[lambda long_df: long_df['value'] == 1, ['id', 'drug']]
    .merge(stbl, on='drug', how='left')
    .loc[:, ['id', 'weight']]
    .set_index('id')
)

# Weights looked up by the (sorted) training ids so they line up with X.
weights = wtbl['weight'].loc[X.index.values]

# XGBoost classifier with default settings, fitted with the per-row weights.
xgbc = XGBClassifier()
model_xg = xgbc.fit(X, y_train.loc[X.index], sample_weight=weights)

# Logistic-regression alternative (disabled):
# lr = LogisticRegression()
# model_lr = lr.fit(X, y_train.loc[X.index], sample_weight=weights)

In [9]:
# AUC on the whole test split.
# Scores are column 1 of predict_proba (second class column; presumably the positive
# outcome — confirm against model_xg.classes_). Labels are looked up with Xs.index so
# they follow the same row order as Xs.
probs_xg = model_xg.predict_proba(Xs)[:, 1]
auc_xg = roc_auc_score(y_true=y_test.loc[Xs.index], y_score=probs_xg)
auc_xg

0.5206204718558425

In [10]:
# AUC factorization: AUC of the fitted model on each single-drug slice of the test set.
# The tuple order is the order in which the AUC values are printed, one per line.
drug_print_order = ("carba", "valpo", "lamo", "leve", "topi", "oxca")

auc_by_drug = {}
for drug in drug_print_order:
    # Test rows whose indicator for this drug equals 1, with the matching labels.
    drug_X, drug_y = get_one_hot_column(Xs, y_test, drug)
    # Column 1 of predict_proba is used as the score, as for the overall test AUC.
    drug_probs = model_xg.predict_proba(drug_X)[:, 1]
    auc_by_drug[drug] = roc_auc_score(drug_y, drug_probs)

# Print only after every AUC has been computed, so an error leaves no partial output.
for drug in drug_print_order:
    print(auc_by_drug[drug])

0.5603525211958947
0.5263480392156863
0.521487204249155
0.4624113475177305
0.5694444444444444
0.5024509803921569


## 1st index, outcome primary

In [11]:
# Inputs for this section: column-name dictionary and the "1st index" train/test table.
# NOTE(review): absolute local paths; consider a configurable data directory.
change_name_file = r"C:\\Users\Anastassia Kolde\\Documents\\Israel secondment\\dictionary_19.01.23.csv"
data_file_name = r"C:\\Users\Anastassia Kolde\\Documents\\Israel secondment\\Data\test_train_drugs_1st_index.csv"
X_train, y_train, X_test, y_test = load_data(
    change_name_file, data_file_name, outcome_col="outcome_primary"
)

# Last six columns of X_train (which still contains "outcome_any" at this point);
# presumably the six drug indicator columns — TODO confirm.
cols = X_train.columns[-6:]

# Feature matrices without the other outcome column, rows in index order.
X = X_train.drop(columns="outcome_any").sort_index()
Xs = X_test.drop(columns="outcome_any").sort_index()

# One row per drug column: 'count' is the column sum over the training rows and
# 'weight' is a copy of 'count'.
# NOTE(review): with 0/1 indicators this gives rows of more common drugs larger
# weights — confirm this is intended rather than an inverse-frequency weight.
stbl = (
    X_train[cols].sum()
    .rename_axis('drug')
    .reset_index(name='count')
    .assign(weight=lambda tbl: tbl['count'])
)

# Long format (id, drug) for every indicator equal to 1, joined to the per-drug weight.
# Relies on the index of X_train being named 'id'.
wtbl = (
    melt(X_train[cols].reset_index(), id_vars=['id'], var_name='drug')
    .loc[lambda long_df: long_df['value'] == 1, ['id', 'drug']]
    .merge(stbl, on='drug', how='left')
    .loc[:, ['id', 'weight']]
    .set_index('id')
)

# Weights looked up by the (sorted) training ids so they line up with X.
weights = wtbl['weight'].loc[X.index.values]

# XGBoost classifier with default settings, fitted with the per-row weights.
xgbc = XGBClassifier()
model_xg = xgbc.fit(X, y_train.loc[X.index], sample_weight=weights)

# Logistic-regression alternative (disabled):
# lr = LogisticRegression()
# model_lr = lr.fit(X, y_train.loc[X.index], sample_weight=weights)

In [12]:
# AUC on the whole test split.
# Scores are column 1 of predict_proba (second class column; presumably the positive
# outcome — confirm against model_xg.classes_). Labels are looked up with Xs.index so
# they follow the same row order as Xs.
probs_xg = model_xg.predict_proba(Xs)[:, 1]
auc_xg = roc_auc_score(y_true=y_test.loc[Xs.index], y_score=probs_xg)
auc_xg

0.5849262317899229

In [13]:
# AUC factorization: AUC of the fitted model on each single-drug slice of the test set.
# The tuple order is the order in which the AUC values are printed, one per line.
drug_print_order = ("carba", "valpo", "lamo", "leve", "topi", "oxca")

auc_by_drug = {}
for drug in drug_print_order:
    # Test rows whose indicator for this drug equals 1, with the matching labels.
    drug_X, drug_y = get_one_hot_column(Xs, y_test, drug)
    # Column 1 of predict_proba is used as the score, as for the overall test AUC.
    drug_probs = model_xg.predict_proba(drug_X)[:, 1]
    auc_by_drug[drug] = roc_auc_score(drug_y, drug_probs)

# Print only after every AUC has been computed, so an error leaves no partial output.
for drug in drug_print_order:
    print(auc_by_drug[drug])

0.5443773234200744
0.6432160804020101
0.7394957983193278
0.5416666666666667
0.717948717948718
0.5027210884353741
