In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import glob
import sklearn
import pandas as pd
import os
import sys
import time
import re

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, accuracy_score
from joblib import dump, load

# Default every figure in this notebook to a wide 16x9 canvas.
plt.rcParams['figure.figsize'] = (16, 9)

In [None]:
# Read in the raw per-dataset CSV paths.
# glob returns matches in arbitrary, filesystem-dependent order, and the
# following cell selects files by position, so sort the listing to make that
# selection stable across machines and runs.
# NOTE(review): re-check the positional indices used below against the sorted
# listing before trusting previously saved results.
filenames = sorted(glob.glob("SNP_data_raw/5_95_data/final_data/*.csv"))

In [None]:
# Display the discovered CSV paths; the next cell picks entries from this list by index.
filenames

In [None]:
# Restrict the run to two of the discovered CSVs: the third entry and the
# second-to-last entry of `filenames`.
files = [filenames[2], filenames[-2]]

In [None]:
# Display the subset of CSV paths that the evaluation loop below will process.
files

In [None]:
# Repeated hold-out evaluation of an XGBoost classifier on each selected file.
#
# For every CSV in `files`:
#   * features are every column except the first and the last; the label is
#     the last column;
#   * a fresh XGBClassifier(max_depth=6) is fitted on 99 different 70/30
#     train/test splits (random_state = 1..99);
#   * accuracy, precision, recall, ROC AUC and Cohen's kappa of each split are
#     collected and written as one row per split to
#     <results_dir>/<prefix>_SNP_xgboost_stats.csv, where <prefix> is the part
#     of the input file name before its first underscore.
#
# NOTE: fitted models are not persisted; only the per-split metrics are saved.
results_dir = 'SNP_data_raw/5_95_data/final_data/results'
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the (long-running) loop starts.
os.makedirs(results_dir, exist_ok=True)

for file in files:
    data = pd.read_csv(file)
    X = data.iloc[:, 1:-1]  # skips column 0 -- presumably an id/index column; TODO confirm
    y = data.iloc[:, -1]

    accuracy_list = []
    precision_score_list = []
    recall_score_list = []
    roc_auc_score_list = []
    cohenkappa_score_list = []
    for r in range(1, 100):
        # The split seed is the only thing that varies between repetitions.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=r)
        model = XGBClassifier(max_depth=6)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]

        accuracy_list.append(accuracy_score(y_test, predictions))
        precision_score_list.append(precision_score(y_test, predictions))
        recall_score_list.append(recall_score(y_test, predictions))
        try:
            # ROC AUC is a ranking metric: score it on the predicted
            # probability of the positive class. Feeding hard 0/1 labels
            # collapses the curve to a single operating point and just
            # reproduces balanced accuracy. precision/recall above already
            # require a binary target, so column 1 is the positive class.
            positive_proba = model.predict_proba(X_test)[:, 1]
            roc_auc_score_list.append(roc_auc_score(y_test, positive_proba))
        except ValueError:
            # Raised when the test split contains a single class; -1 is kept
            # as the "undefined" sentinel -- filter it out before averaging.
            roc_auc_score_list.append(-1)
        cohenkappa_score_list.append(cohen_kappa_score(y_test, predictions))

    stats = pd.DataFrame({'accuracy': accuracy_list,
                          'precision': precision_score_list,
                          'recall': recall_score_list,
                          'auc': roc_auc_score_list,
                          'kappa': cohenkappa_score_list})

    # Derive the prefix from the file's base name instead of a fixed position
    # in a '/'-split path, so it does not depend on directory depth or on the
    # platform's path separator.
    dataset_prefix = os.path.basename(file).split('_')[0]
    stats.to_csv(os.path.join(results_dir, dataset_prefix + '_SNP_xgboost_stats.csv'))