In [36]:
# read clean data with default info
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

# Load the training table once for the cells below; they treat the last
# column as the label and every other column as a feature.
TRAINING_CSV = "training_data.csv"
train_data = pd.read_csv(TRAINING_CSV)

In [34]:
# Baseline: L2-regularized logistic regression on all features, with the
# regularization strength C chosen by 10-fold cross-validation.

# Every column except the last is a feature; the last column is the label.
X = train_data.iloc[:, :-1]
Y = train_data.iloc[:, -1]
# normalize data before regression (per-column standardization)
X = preprocessing.scale(X)
# Candidate values of C (inverse regularization strength) to search over.
c = [0.0001, 0.01, 1, 100]
# The 'sag' solver shuffles the samples, so pin random_state; without it the
# score displayed below changes from run to run.
logreg = LogisticRegressionCV(penalty='l2', solver='sag', Cs=c, refit=True, cv=10,
                              max_iter=100, random_state=0)
logreg.fit(X, Y)
# NOTE(review): this is accuracy on the same rows the model was fitted on,
# not a held-out estimate; the per-fold CV scores are in logreg.scores_.
logreg.score(X, Y)



0.76455775889203648

In [38]:
# try several feature-subset sizes (SelectKBest) and fit a cross-validated logistic regression for each
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Sweep over the number of features kept by univariate selection and record
# the accuracy of a cross-validated logistic regression for each size.
k_list = [10, 20, 40, 80, 160, 320, 640]
score_list = []

# Reading, splitting and standardizing do not depend on k, so do them once
# instead of re-reading the CSV on every iteration.
train_data = pd.read_csv("training_data.csv")
Y = train_data.iloc[:, -1]
X_scaled = preprocessing.scale(train_data.iloc[:, :-1])
# Candidate values of C (inverse regularization strength) searched by CV.
c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

# NOTE(review): scaling and feature selection are fitted on the full data set
# before the CV split inside LogisticRegressionCV, so the folds are not
# independent of the selection step; wrap the steps in a Pipeline if an
# unbiased CV estimate is needed.
for k_element in k_list:
    # select the k best features based on f_classif (ANOVA F-value)
    X_new = SelectKBest(f_classif, k=k_element)
    # fit_transform already returns the selected columns, so use its result
    # directly rather than discarding it and re-indexing with get_support().
    X = X_new.fit_transform(X_scaled, Y)
    # 'sag' shuffles the samples; pin random_state so the scores are reproducible.
    logreg = LogisticRegressionCV(penalty='l2', solver='sag', Cs=c, refit=True, cv=10,
                                  max_iter=100, random_state=0)
    logreg.fit(X, Y)
    # NOTE(review): accuracy on the fitting data, not the cross-validated score.
    score_list.append(logreg.score(X, Y))
    print(score_list[-1])



0.697198615046
0.6975133774
0.706011960969
0.71104815864
0.728674850488
0.728989612842
0.725212464589


In [39]:
from sklearn.decomposition import PCA

# Dimensionality reduction with PCA followed by cross-validated logistic
# regression on the principal components.
train_data = pd.read_csv("training_data.csv")
# Every column except the last is a feature; the last column is the label.
Y = train_data.iloc[:, -1]
X_scaled = preprocessing.scale(train_data.iloc[:, :-1])
# A float n_components with svd_solver='full' keeps as many components as are
# needed to explain that fraction (here 99%) of the variance.
pca = PCA(n_components=0.99, svd_solver='full')
# Fit and project in one call instead of a separate fit() and transform().
X = pca.fit_transform(X_scaled)
# Candidate values of C (inverse regularization strength) searched by CV.
c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
# 'sag' shuffles the samples; pin random_state so the score is reproducible.
logreg = LogisticRegressionCV(penalty='l2', solver='sag', Cs=c, refit=True, cv=10,
                              max_iter=1000, random_state=0)
logreg.fit(X, Y)
# NOTE(review): accuracy on the fitting data, not the cross-validated score.
logreg.score(X, Y)



0.73119294932326095