In [1]:
# read clean data with default info
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from scipy import stats

In [2]:
train_data = pd.read_csv("training_data.csv")
X = train_data.loc[:, ['at103s', 'bc03s', 'bc36s', 'br03s', 'br36s', 'g051s', 'g215a', 'g215b',
                        'g224c', 'g230s', 'g251b', 'g304s', 'g305s', 'g311s', 're03s', 're24s',
                        's062s', 's068b', 's071b', 's073b', 'trv01', 'cv13']]
Y = train_data.iloc[:, -1]
X = preprocessing.scale(X)
c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
logreg = LogisticRegressionCV(penalty='l2', solver='sag', Cs=c, refit=True, cv=10, max_iter=1000)
logreg.fit(X, Y)
print("The accuracy rate in training set is ", logreg.score(X, Y))
y_scores = logreg.predict(X)
y_true = Y
print("The AUC score is ", roc_auc_score(y_true, y_scores))



The accuracy rate in training set is  0.674220963173
The AUC score is  0.580346419782


In [3]:
# add MSCORE
train_data = pd.read_csv("training_data.csv")
X = train_data.loc[:, ['at103s', 'bc03s', 'bc36s', 'br03s', 'br36s', 'g051s', 'g215a', 'g215b',
                        'g224c', 'g230s', 'g251b', 'g304s', 'g305s', 'g311s', 're03s', 're24s',
                        's062s', 's068b', 's071b', 's073b', 'trv01', 'cv13', 'MSCORE']]
Y = train_data.iloc[:, -1]
X = preprocessing.scale(X)
c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
logreg = LogisticRegressionCV(penalty='l2', solver='sag', Cs=c, refit=True, cv=10, max_iter=1000)
logreg.fit(X, Y)
print("The accuracy rate in training set is ", logreg.score(X, Y))
y_scores = logreg.predict(X)
y_true = Y
print("The AUC score is ", roc_auc_score(y_true, y_scores))



The accuracy rate in training set is  0.690588605603
The AUC score is  0.612464942363


In [6]:
# add CV_Auto
train_data = pd.read_csv("training_data.csv")
X = train_data.loc[:, ['at103s', 'bc03s', 'bc36s', 'br03s', 'br36s', 'g051s', 'g215a', 'g215b',
                        'g224c', 'g230s', 'g251b', 'g304s', 'g305s', 'g311s', 're03s', 're24s',
                        's062s', 's068b', 's071b', 's073b', 'trv01', 'cv13', 'CV_Auto']]
Y = train_data.iloc[:, -1]
X = preprocessing.scale(X)
c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
logreg = LogisticRegressionCV(penalty='l2', solver='sag', Cs=c, refit=True, cv=10, max_iter=1000)
logreg.fit(X, Y)
print("The accuracy rate in training set is ", logreg.score(X, Y))
y_scores = logreg.predict(X)
y_true = Y
print("The AUC score is ", roc_auc_score(y_true, y_scores))



The accuracy rate in training set is  0.692162417375
The AUC score is  0.619360406391


In [7]:
# add FICO08
train_data = pd.read_csv("training_data.csv")
X = train_data.loc[:, ['at103s', 'bc03s', 'bc36s', 'br03s', 'br36s', 'g051s', 'g215a', 'g215b',
                        'g224c', 'g230s', 'g251b', 'g304s', 'g305s', 'g311s', 're03s', 're24s',
                        's062s', 's068b', 's071b', 's073b', 'trv01', 'cv13', 'FICO08']]
Y = train_data.iloc[:, -1]
X = preprocessing.scale(X)
c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
logreg = LogisticRegressionCV(penalty='l2', solver='sag', Cs=c, refit=True, cv=10, max_iter=1000)
logreg.fit(X, Y)
print("The accuracy rate in training set is ", logreg.score(X, Y))
y_scores = logreg.predict(X)
y_true = Y
print("The AUC score is ", roc_auc_score(y_true, y_scores))



The accuracy rate in training set is  0.680830972616
The AUC score is  0.589104537708


In [8]:
# add AADM10
train_data = pd.read_csv("training_data.csv")
X = train_data.loc[:, ['at103s', 'bc03s', 'bc36s', 'br03s', 'br36s', 'g051s', 'g215a', 'g215b',
                        'g224c', 'g230s', 'g251b', 'g304s', 'g305s', 'g311s', 're03s', 're24s',
                        's062s', 's068b', 's071b', 's073b', 'trv01', 'cv13', 'AADM10']]
Y = train_data.iloc[:, -1]
X = preprocessing.scale(X)
c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
logreg = LogisticRegressionCV(penalty='l2', solver='sag', Cs=c, refit=True, cv=10, max_iter=1000)
logreg.fit(X, Y)
print("The accuracy rate in training set is ", logreg.score(X, Y))
y_scores = logreg.predict(X)
y_true = Y
print("The AUC score is ", roc_auc_score(y_true, y_scores))



The accuracy rate in training set is  0.697828139754
The AUC score is  0.625766234983
