In [None]:
# Code for Module 10
# load data
import pandas as pd
# read the training and test splits from the local data directory
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [None]:
# separate the target column (Y) from the feature columns of the training set
label_column = 'label'
Y = train_data[label_column]
train_data = train_data.drop(columns=[label_column])

In [None]:
# display the training feature frame (the 'label' column now lives in Y)
train_data

In [None]:
# remove the features that are all zeros
# A feature is dropped only if it is zero in every row of train AND test combined,
# so both frames always end up with the same set of columns.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
total_data = pd.concat([train_data, test_data])
# boolean mask over columns: True where every value equals 0 (NaN never equals 0)
all_zero_cols = total_data.columns[(total_data == 0).all(axis=0)]
train_data = train_data.drop(columns=all_zero_cols)
test_data = test_data.drop(columns=all_zero_cols)

In [None]:
# min-max scaling
# The scaling statistics come from the training set only and are then applied to
# both splits, so a given raw value maps to the same scaled value in train and test.
feature_min = train_data.min()
feature_range = train_data.max() - feature_min
# A feature that is constant in the training set has range 0; mark it NaN so the
# division yields NaN (instead of inf) and the imputation cells below can fill it.
feature_range = feature_range.where(feature_range != 0)
train_data = (train_data - feature_min) / feature_range
test_data = (test_data - feature_min) / feature_range

In [None]:
def countMissing(data):
    """List the columns of ``data`` that contain at least one missing value.

    data: a pandas DataFrame.
    Returns a plain Python list of column labels, in the frame's column order.
    """
    null_flags = data.isna().any(axis=0)
    return [label for label, flagged in zip(data.columns, null_flags) if flagged]
# columns with missing values in each split, plus the union of the two lists
misTrain, misTest = countMissing(train_data), countMissing(test_data)
misTotal = list({*misTrain, *misTest})

In [None]:
def imputation(data, column, value):
    """Fill the missing entries of one column in place.

    Every null cell of ``data[column]`` is overwritten with ``value``.
    The frame is modified directly and nothing is returned.
    """
    null_rows = data[column].isnull()
    data.loc[null_rows, column] = value

In [None]:
# list the training columns that contain missing values, followed by their count
print(misTrain, len(misTrain))

In [None]:
# list the test columns that contain missing values, followed by their count
print(misTest, len(misTest))

In [None]:
# fill every incomplete training column with the constant 1
for missing_col in misTrain:
    imputation(train_data, missing_col, 1)

In [None]:
# fill every incomplete test column with the constant 1
for missing_col in misTest:
    imputation(test_data, missing_col, 1)

In [None]:
# One-Vs-Rest with regularized logistic regression models
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
# edit by Keith: solver='liblinear' (a solver that supports the l1 penalty)
ovr_base = linear_model.LogisticRegression(penalty='l1', solver='liblinear')
ovr_clf = OneVsRestClassifier(ovr_base)
ovr_clf.fit(train_data, Y)
res = ovr_clf.predict(test_data)





In [None]:
# inspect the One-Vs-Rest predictions for the test set
res

In [None]:
# One-Vs-One with regularized logistic regression models
from sklearn.multiclass import OneVsOneClassifier
from sklearn import linear_model
# edit by Keith: solver='liblinear' (a solver that supports the l1 penalty)
ovo_base = linear_model.LogisticRegression(penalty='l1', solver='liblinear')
ovo_clf = OneVsOneClassifier(ovo_base)
ovo_clf.fit(train_data, Y)
res = ovo_clf.predict(test_data)

In [None]:
# inspect the One-Vs-One predictions for the test set
res

In [None]:
# edit by Keith: cross_val_score
from sklearn.model_selection import cross_val_score
# grid search for One-Vs-Rest: every (C, penalty) pair is scored with 2-fold CV accuracy
alphas = [0.01, 0.1, 1.0, 10]  # candidate values passed as C
regs = ["l1", "l2"]
param = []
cv_means = []
for alpha in alphas:
    for reg in regs:
        base_lr = linear_model.LogisticRegression(penalty=reg, C=alpha, solver='liblinear')
        lm = OneVsRestClassifier(base_lr)
        fold_scores = cross_val_score(lm, train_data, Y, scoring="accuracy", cv=2)
        cv_means.append(fold_scores.mean())
        param.append([alpha, reg])
# one row per parameter combination, best mean accuracy first
scores = pd.DataFrame({'parameter': param, 'score': cv_means})
print(scores.sort_values(by='score', ascending=False))

In [None]:
from sklearn.model_selection import cross_val_score
# grid search for One-Vs-One: every (C, penalty) pair is scored with 2-fold CV accuracy
alphas = [0.01, 0.1, 1.0, 10]  # candidate values passed as C
regs = ["l1", "l2"]
param = []
cv_means = []
for alpha in alphas:
    for reg in regs:
        base_lr = linear_model.LogisticRegression(penalty=reg, C=alpha, solver='liblinear')
        lm = OneVsOneClassifier(base_lr)
        fold_scores = cross_val_score(lm, train_data, Y, scoring="accuracy", cv=2)
        cv_means.append(fold_scores.mean())
        param.append([alpha, reg])
# one row per parameter combination, best mean accuracy first
scores = pd.DataFrame({'parameter': param, 'score': cv_means})
print(scores.sort_values(by='score', ascending=False))

In [None]:
# One-Vs-Rest with regularized logistic regression models
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
# l1-penalized base model with C=1.0, wrapped in One-Vs-Rest
ovr_base = linear_model.LogisticRegression(penalty="l1", C=1.0, solver='liblinear')
ovr_clf = OneVsRestClassifier(ovr_base)
ovr_clf.fit(train_data, Y)
res = ovr_clf.predict(test_data)

In [None]:
# One-Vs-One with regularized logistic regression models
# (import name fixed: the previous `OneVsOneClassifierb` raised ImportError)
from sklearn.multiclass import OneVsOneClassifier
from sklearn import linear_model
# l2-penalized base model with C=1.0, wrapped in One-Vs-One; predictions go to `res`
res = OneVsOneClassifier(linear_model.LogisticRegression(penalty="l2", C=1.0, solver='liblinear')).fit(train_data, Y).predict(test_data)

In [None]:
# train regularized softmax regression model
from sklearn import linear_model
# multinomial logistic regression with the default C, optimized with L-BFGS
lm = linear_model.LogisticRegression(solver='lbfgs', multi_class='multinomial')
res = lm.fit(train_data, Y).predict(test_data)

In [None]:
# edit by Keith: cross_val_score
from sklearn.model_selection import cross_val_score
# sweep C for the softmax model; each value is scored with 10-fold CV accuracy
alphas = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10, 30, 100, 300, 1000, 3000]
param = []
cv_means = []
for alpha in alphas:
    lm = linear_model.LogisticRegression(C=alpha, multi_class='multinomial', solver='lbfgs')
    fold_scores = cross_val_score(lm, train_data, Y, scoring="accuracy", cv=10)
    cv_means.append(fold_scores.mean())
    param.append([alpha])
# one row per C value, best mean accuracy first
scores = pd.DataFrame({'parameter': param, 'score': cv_means})
print(scores.sort_values(by='score', ascending=False))

In [None]:
# train regularized softmax regression model
from sklearn import linear_model
# multinomial logistic regression with C=0.3, optimized with L-BFGS
lm = linear_model.LogisticRegression(solver='lbfgs', multi_class='multinomial', C=0.3)
res = lm.fit(train_data, Y).predict(test_data)

In [None]:
import numpy as np
import math
# RBF function
def rbf(data, centers, sigma):
    """Map each row of ``data`` to Gaussian RBF activations against ``centers``.

    Parameters
    ----------
    data : 2-D array-like, one sample per row.
    centers : 2-D array-like, one center per row, with as many columns as ``data``.
    sigma : float, width of the Gaussian kernel.

    Returns
    -------
    numpy.ndarray of shape ``(len(data), len(centers))`` whose entry ``(i, j)`` is
    ``exp(-||data[i] - centers[j]||^2 / (2 * sigma^2))``.
    """
    data = np.asarray(data, dtype=float)
    centers = np.asarray(centers, dtype=float)
    res = np.empty((len(data), len(centers)))
    if len(centers) == 0:
        # nothing to compare against: return the (n, 0) result as-is
        return res
    denom = 2 * sigma * sigma
    for i, row in enumerate(data):
        # squared distance from this row to every center in one vectorized step;
        # going row by row keeps the temporary at (n_centers, n_features)
        sq_dist = np.square(centers - row).sum(axis=1)
        res[i, :] = np.exp(-sq_dist / denom)
    return res
# select k centers from data
def selectCenters(data, k):
    """Pick ``k`` rows of ``data`` at random to serve as RBF centers.

    Rows are drawn without replacement so that no center is selected twice
    (duplicate centers would yield duplicate RBF features). Only when ``k``
    exceeds the number of rows does sampling fall back to drawing with
    replacement, which keeps such calls working as they did before.

    data: 2-D numpy array, one candidate center per row.
    k: number of centers to return.
    Returns a ``(k, data.shape[1])`` array of the selected rows.
    """
    n_rows = len(data)
    picked = np.random.choice(n_rows, k, replace=k > n_rows)
    return data[picked, :]

In [None]:
# cross validation on regularized softmax regression
# edit by Keith: cross_val_score
from sklearn.model_selection import cross_val_score
# grid: number of RBF centers (K), kernel width (sigma) and C (called alpha here)
alphas = [0.1, 1.0, 10]
sigmas = [1.0, 2.0, 4.0]
Ks = [100, 400, 800, 1600]
scores = []
param = []
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
# The conversion does not depend on the loop variables, so it is done once up front.
train_matrix = train_data.to_numpy()
for K in Ks:
    # one random set of centers per K, shared by all sigma / C values below
    centers = selectCenters(train_matrix, K)
    for sigma in sigmas:
        # RBF features depend only on (centers, sigma), so they are reused across C values
        rbfX = rbf(train_matrix, centers, sigma)
        for a in alphas:
            lm = linear_model.LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', C = a)
            scores.append(cross_val_score(lm, rbfX, Y, scoring="accuracy", cv = 10).mean())
            param.append([K, sigma, a])
# one row per (K, sigma, C) combination, best mean accuracy first
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

In [None]:
# final RBF + softmax model: 1600 centers, sigma = 4.0, C = 10
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
train_matrix = train_data.to_numpy()
centers = selectCenters(train_matrix, 1600)
rbfX = rbf(train_matrix, centers, 4.0)
# the test set is projected onto the same centers as the training set
rbfTest = rbf(test_data.to_numpy(), centers, 4.0)
lm = linear_model.LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', C = 10)
lm.fit(rbfX, Y)
res = lm.predict(rbfTest)

In [None]:
from sklearn.decomposition import PCA
# edit by Keith: cross_val_score
from sklearn.model_selection import cross_val_score
# grid: number of principal components (K) and C (called alpha here), 10-fold CV accuracy
alphas = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10, 30, 100, 300, 1000, 3000]
Ks = [10, 100, 500]
param = []
cv_means = []
for K in Ks:
    # the projection is fitted once per K on the full training set and reused for every C
    pca = PCA(n_components=K, svd_solver='arpack')
    pca_train = pca.fit(train_data).transform(train_data)
    for alpha in alphas:
        lm = linear_model.LogisticRegression(C=alpha, multi_class='multinomial', solver='lbfgs')
        fold_scores = cross_val_score(lm, pca_train, Y, scoring="accuracy", cv=10)
        cv_means.append(fold_scores.mean())
        param.append([K, alpha])
# one row per (K, C) combination, best mean accuracy first
scores = pd.DataFrame({'parameter': param, 'score': cv_means})
print(scores.sort_values(by='score', ascending=False))

In [None]:
# final PCA + softmax model: 500 components, C = 0.3
pca = PCA(svd_solver='arpack', n_components=500)
# fit the projection on the training set, then apply it to both splits
pca_train = pca.fit(train_data).transform(train_data)
pca_test = pca.transform(test_data)
lm = linear_model.LogisticRegression(solver='lbfgs', multi_class='multinomial', C=0.3)
res = lm.fit(pca_train, Y).predict(pca_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# edit by Keith: cross_val_score
from sklearn.model_selection import cross_val_score
# candidate neighbor counts for k-NN, each scored with 10-fold CV accuracy
nigs = [1, 10, 20]
param = []
cv_means = []
for n in nigs:
    neigh = KNeighborsClassifier(n_neighbors=n)
    fold_scores = cross_val_score(neigh, train_data, Y, scoring="accuracy", cv=10)
    cv_means.append(fold_scores.mean())
    param.append([n])
# one row per neighbor count, best mean accuracy first
scores = pd.DataFrame({'parameter': param, 'score': cv_means})
print(scores.sort_values(by='score', ascending=False))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# final k-NN model with a single neighbor; predictions for the test set go to `res`
neigh = KNeighborsClassifier(n_neighbors=1).fit(train_data, Y)
res = neigh.predict(test_data)

In [None]:
# inspect the predictions currently held in `res` before they are written to disk
res

In [None]:
# save predictions
# load the submission template, put the predictions in its 'Label' column,
# and write the result without the index column
sample_data = pd.read_csv("./data/sample_submission.csv").assign(Label=res)
sample_data.to_csv('./prediction.csv', index=False)