In [1]:
# read clean data with default info
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from scipy import stats

In [2]:
# select different feature size and run cross validation
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# read data and standardize
train_data = pd.read_csv("training_data.csv")
X = train_data.iloc[:, :-1]
Y = train_data.iloc[:, -1]
X = preprocessing.scale(X)
# selece feature based on f_classif
X_new = SelectKBest(f_classif, k=60)
X_new.fit_transform(X, Y)
X = X[:, X_new.get_support()]
c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
logreg = LogisticRegressionCV(penalty='l2', solver='sag', Cs=c, refit=True, cv=10, max_iter=1000)
logreg.fit(X, Y)
print("The accuracy rate in training set is ", logreg.score(X, Y))
y_scores = logreg.predict(X)
y_true = Y
print("The AUC score is ", roc_auc_score(y_true, y_scores))



The accuracy rate in training set is  0.710418633931
The AUC score is  0.647664241558


In [3]:
from sklearn.feature_selection import f_regression
p_value = f_regression(X, Y)
column = train_data.iloc[:, X_new.get_support()].columns
for i in range(len(column)):
    print(column[i], p_value[1][i])

LINKT001 3.2410100325e-25
LINKT004 4.87857127875e-33
AADM10 5.81749748148e-78
AADM12 3.66425040218e-55
AADM11 2.35021276792e-73
AADM13 5.37300643142e-44
at103s 1.03916798646e-17
bc02s 2.11451332036e-24
bc03s 4.60542441802e-19
bc12s 2.64908558873e-17
bc36s 9.46958559061e-20
br02s 1.95304079719e-24
br03s 5.49980621057e-18
br36s 3.92973850935e-21
g051s 4.07879044769e-28
g212s 5.23142107848e-23
g215a 7.88847320398e-21
g215b 1.16775003247e-23
g224b 1.28103295525e-18
g224c 7.82179130486e-23
g228s 9.45259113335e-18
g230s 3.64187094122e-19
g250b 1.03957213188e-19
g250c 1.50660803075e-22
g251b 2.1135115362e-19
g251c 3.03071687954e-23
g304s 2.97523302447e-18
g305s 8.35112409049e-22
g310s 8.55190145795e-19
g311s 1.52808510432e-17
re02s 2.43268666121e-22
re03s 1.2394902867e-20
re12s 3.55551706295e-19
re24s 1.16069453195e-18
re29s 1.78369675011e-17
s061s 5.92901765441e-18
s062s 4.64522297553e-18
s068a 5.44673534674e-17
s068b 1.09774274713e-22
s071a 4.43403564445e-27
s071b 3.89658164028e-27
s073a 3.

In [12]:
train_data = pd.read_csv("training_data.csv")
X = train_data.loc[:, ['at103s', 'bc03s', 'bc36s', 'br03s', 'br36s', 'g051s', 'g215a', 'g215b',
                        'g224c', 'g230s', 'g251b', 'g304s', 'g305s', 'g311s', 're03s', 're24s',
                        's062s', 's068b', 's071b', 's073b', 'trv01', 'cv13']]
Y = train_data.iloc[:, -1]
X = preprocessing.scale(X)
c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
logreg = LogisticRegressionCV(penalty='l2', solver='sag', Cs=c, refit=True, cv=10, max_iter=1000)
logreg.fit(X, Y)
print("The accuracy rate in training set is ", logreg.score(X, Y))
y_scores = logreg.predict(X)
y_true = Y
print("The AUC score is ", roc_auc_score(y_true, y_scores))



The accuracy rate in training set is  0.674220963173
The AUC score is  0.580346419782


In [3]:
test_data = pd.read_csv("testing_data.csv")
X = test_data.iloc[:, :-1]
Y = test_data.iloc[:, -1]
X = preprocessing.scale(X)
X = X[:, X_new.get_support()]
print("The accuracy rate in training set is ", logreg.score(X, Y))
y_scores = logreg.predict(X)
y_true = Y
print("The AUC score is ", roc_auc_score(y_true, y_scores))

The accuracy rate in training set is  0.665932452276
The AUC score is  0.601680950386


In [4]:
# list all features we use
import csv
feature_list = test_data.columns.tolist()
select_feature = pd.DataFrame(feature_list)
print(select_feature.loc[X_new.get_support()])
for item in len(X_new.get_support()):
    print(select_feature.loc[X_new.get_support()], )

                0
0        LINKT001
3        LINKT004
10         AADM10
11         AADM12
12         AADM11
13         AADM13
44         at103s
70          bc02s
71          bc03s
74          bc12s
88          bc36s
125         br02s
126         br03s
143         br36s
220         g051s
253         g212s
256         g215a
257         g215b
279         g224b
280         g224c
284         g228s
285         g230s
298         g250b
299         g250c
301         g251b
302         g251c
307         g304s
308         g305s
313         g310s
314         g311s
500         re02s
501         re03s
504         re12s
507         re24s
511         re29s
555         s061s
556         s062s
559         s068a
560         s068b
561         s071a
562         s071b
563         s073a
564         s073b
826         trv01
833         trv08
857        rvlr07
858        rvlr08
859        rvlr09
860        rvlr10
888      paymnt10
902          cv13
956      walshrs2
1037     bcpmtnum
1038      CV_Auto
1039      

In [None]:
# we need more data
# how to test ks score?
# 