# ROC AUC of the LR model

In [2]:
import numpy as np
import pandas as pd 
import feather 

from scipy import stats
from scipy.sparse import hstack, coo_matrix, save_npz, load_npz

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import chi2, f_classif, SelectPercentile
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
import joblib

import pymysql
import pymysql.cursors
from database import Database

from utils import Utils

# Project-level helpers.
# `db` is a handle from the project's Database class; the string argument is
# presumably a connection/profile name -- TODO confirm against database.py.
db = Database('Mimir from Munnin')
# `u` exposes the loaders (load_df, load_np, load_feature) and the constants
# (RANDOM_STATE, NUM_PATIENTS, DATA_PATH) used by this script.
u = Utils()
# Seed NumPy's global RNG with the project-wide constant.
np.random.seed(u.RANDOM_STATE)

In [4]:
# Evaluate the saved LR propensity-score model: rebuild the feature matrix,
# re-apply chi2 feature selection, predict scores and report ROC-AUC.

# load data
# Patients are sorted by PID; presumably this matches the row order of the
# stored feature matrices -- TODO confirm.
df_patients = u.load_df('df_patients').sort_values(by='PID')

# build features
drugs = u.load_np('drugs')

# Only the first 1458 drug entries are used; features are stored under their
# positional index ('0', '1', ...), not under the drug ID itself.
# NOTE(review): the origin of 1458 is not visible here -- confirm.
NUM_DRUG_FEATURES = 1458
drug_features = [
    u.load_feature(str(i)) for i in range(len(drugs[:NUM_DRUG_FEATURES]))
]

# Age as a sparse (NUM_PATIENTS, 1) column.
age_feature = coo_matrix(
    df_patients.get('Age').values.reshape(u.NUM_PATIENTS, 1)
)

# Row-wise sum over all drug feature columns (a per-patient drug count if
# the drug features are 0/1 indicators -- TODO confirm).
num_drugs_feature = coo_matrix(hstack(drug_features).sum(1))

# Column order: age, drug-sum, then one block per drug feature.
features = [age_feature, num_drugs_feature, *drug_features]

X = hstack(features)
y = u.load_feature('label').toarray().reshape(-1)

# Keep the top 10% of columns by chi2 score.
# NOTE(review): the selector is re-fitted here rather than loaded, so this
# assumes it reproduces the exact columns the saved classifier was trained on.
X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)

# load model and predict propensity scores

model = joblib.load(u.DATA_PATH+'PSM_models/LR/Best.pkl')
clf = model['classifier']
# Column 1 of predict_proba is the probability of the class at index 1.
p_scores = clf.predict_proba(X_new)[:, 1]

# Encode Sex as F -> 1, M -> 0. The explicit integer cast avoids relying on
# pandas' deprecated implicit downcasting in `replace` (which would otherwise
# leave an object-dtype array) and fails loudly on any other value.
# NOTE(review): the AUC is scored against Sex while feature selection used
# the stored 'label' feature -- confirm both encode the same target.
y_true = df_patients.get('Sex').replace({'F': 1, 'M': 0}).astype(int).values
y_pred = p_scores
roc_auc = roc_auc_score(y_true, y_pred)
print('ROC-AUC: {:.2f}'.format(roc_auc))

ROC-AUC: 0.69
