In [31]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC

import warnings
import pandas as pd
import numpy as np

# Load the training table: column 0 is used as the label, the remaining
# columns as features. Plain ndarrays are used instead of np.matrix because
# np.matrix is discouraged by NumPy and rejected as estimator input by recent
# scikit-learn releases.
train = pd.read_csv("dr_train.csv").values
x_train = train[:, 1:]
y_train = train[:, 0]  # 1-D label vector (what `.A1` produced for the matrix)

# The test table is used as features only (no column is split off).
x_test = pd.read_csv("dr_test.csv").values

def output_csv(filename, y):
    """Write predictions to ``filename`` as a two-column CSV.

    Parameters
    ----------
    filename : str
        Destination path passed straight to ``DataFrame.to_csv``.
    y : sequence
        Predicted labels; row ``i`` (0-based) is written with ImageId ``i + 1``.

    The file has the header ``ImageId,Label`` and no index column.
    """
    columns = {"ImageId": range(1, len(y) + 1), "Label": y}
    pd.DataFrame(columns).to_csv(filename, index=False)

In [2]:
# Baseline candidates: each entry is (title, estimator). The title doubles as
# the stem of the CSV file the predictions are written to.
models = [
    ("SGDC", SGDClassifier()),
    ("OvO(LogRegression)", OneVsOneClassifier(LogisticRegression())),
    ("OvR(LogRegression)", OneVsRestClassifier(LogisticRegression())),
    ("OvO(LDA)", OneVsOneClassifier(LDA())),
    ("OvR(LDA)", OneVsRestClassifier(LDA())),
    ("SVC", SVC(C=100, gamma=0.001)),
]

# All warnings raised while fitting/predicting are silenced for this cell.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for title, m in models:
        m.fit(x_train, y_train)
        # NOTE: this score is computed on the same data the model was fit on,
        # so it is a training score, not a held-out estimate.
        train_accuracy = m.score(x_train, y_train)
        print(title, train_accuracy)
        y_test = m.predict(x_test)
        output_csv("{}.csv".format(title), y_test)

SGDC 0.885095238095
OvO(LogRegression) 0.989761904762
OvR(LogRegression) 0.938571428571
OvO(LDA) 0.941666666667
OvR(LDA) 0.863428571429
SVC 1.0


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data for validation.
# The split is taken from the untouched `train` table (features = columns 1..,
# labels = column 0) rather than from x_train/y_train themselves: splitting a
# variable into itself makes the cell non-idempotent, because every re-run
# would split the already reduced training set again.
x_train, x_valid, y_train, y_valid = train_test_split(
    train[:, 1:],
    np.asarray(train[:, 0]).ravel(),  # 1-D labels, works for matrix or ndarray
    test_size=0.2,
    random_state=0,
)

# (title, estimator) pairs evaluated on the 80/20 split; commented entries are
# candidates that are currently switched off.
models = [
    #("SGDC", SGDClassifier()),
    #("OvO(LogRegression)", OneVsOneClassifier(LogisticRegression())),
    #("OvR(LogRegression)", OneVsRestClassifier(LogisticRegression())),
    #("OvO(Ridge)", OneVsOneClassifier(Ridge(alpha=1.0))),
    #("OvR(Ridge)", OneVsRestClassifier(Ridge(alpha=1.0))),
    ("OvO(LDA)", OneVsOneClassifier(LDA())),
    ("OvR(LDA)", OneVsRestClassifier(LDA())),
    ("OvO(Lasso)", OneVsOneClassifier(Lasso(0.1))),
    #("OvR(Lasso)", OneVsRestClassifier(Lasso(0.1)))
]

# All warnings raised while fitting/predicting are silenced for this cell.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for title, m in models:
        m.fit(x_train, y_train)
        print(
            title, "\ttraining score:", m.score(x_train, y_train),
            "validation score:", m.score(x_valid, y_valid))
        # Predictions for the unlabeled test set go to "80_<title>.csv".
        y_test = m.predict(x_test)
        output_csv("80_"+title+".csv", y_test)

OvO(LDA) 	training score: 0.947879464286 validation score: 0.916369047619
OvR(LDA) 	training score: 0.867113095238 validation score: 0.84568452381
OvO(Lasso) 	training score: 0.307589285714 validation score: 0.319494047619


In [18]:
def _fit_score_export(label, model):
    """Fit ``model`` on the training split, print its scores and export predictions.

    Prints ``label`` followed by the training and validation scores, then
    writes the test-set predictions to ``"<label>.csv"`` via ``output_csv``.
    Warnings raised while fitting/predicting are silenced.

    Returns the test-set predictions.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model.fit(x_train, y_train)
        print(
            label, "training score:", model.score(x_train, y_train),
            "validation score:", model.score(x_valid, y_valid))
        predictions = model.predict(x_test)
        output_csv(label + ".csv", predictions)
    return predictions


# Sweep the regularisation strength over 0.0, 0.1, ..., 2.0 for both linear
# models. The label is formatted with one decimal instead of str(alpha):
# linspace values such as 0.30000000000000004 would otherwise leak their full
# float repr into the printed label and the CSV file name.
for alpha in np.linspace(0, 2, 21):
    m_ridge = OneVsRestClassifier(Ridge(alpha=alpha))
    y_test = _fit_score_export("80_Ridge_{:.1f}".format(alpha), m_ridge)
print()
for alpha in np.linspace(0, 2, 21):
    m_lasso = OneVsRestClassifier(Lasso(alpha=alpha))
    y_test = _fit_score_export("80_Lasso_{:.1f}".format(alpha), m_lasso)

80_Ridge_0.0 training score: 0.860751488095 validation score: 0.839285714286
80_Ridge_0.1 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_0.2 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_0.3 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_0.4 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_0.5 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_0.6 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_0.7 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_0.8 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_0.9 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_1.0 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_1.1 training score: 0.863876488095 validation score: 0.844345238095
80_Ridge_1.2 training score: 0.863876488095 validation score: 0.844345238095

In [32]:
from sklearn.model_selection import KFold
from copy import deepcopy


# (title, estimator) pairs for the k-fold run. Commented entries are candidates
# that are currently switched off; only the two one-vs-rest linear models run.
models = [
    #("SGDC", SGDClassifier()),
    #("OvO(LogRegression)", OneVsOneClassifier(LogisticRegression())),
    #("OvR(LogRegression)", OneVsRestClassifier(LogisticRegression())),
    #("OvO(LDA)", OneVsOneClassifier(LDA())),
    #("OvR(LDA)", OneVsRestClassifier(LDA())),
    #("OvO(Ridge)", OneVsOneClassifier(Ridge(alpha=1.0))),
    ("OvR(Ridge)", OneVsRestClassifier(Ridge(alpha=0.1))),
    #("OvO(Lasso)", OneVsOneClassifier(Lasso(0.1))),
    ("OvR(Lasso)", OneVsRestClassifier(Lasso(alpha=0))),
]

def kfold_train(m, x_train, y_train, n_splits=10):
    """Fit one independent copy of ``m`` per K-fold split.

    Parameters
    ----------
    m : estimator
        Template model exposing ``fit`` and ``score``. It is copied for every
        fold and is not fitted itself.
    x_train, y_train : array-like
        Features and labels; both are indexed with the integer index arrays
        produced by ``KFold.split``.
    n_splits : int, default 10
        Number of folds.

    Returns
    -------
    trained_models : list
        One fitted model per fold, in fold order.
    mean_train_score : float
        Mean of ``score`` on each fold's training part.
    mean_valid_score : float
        Mean of ``score`` on each fold's held-out part.
    """
    kf = KFold(n_splits=n_splits)
    train_scores = []
    valid_scores = []
    trained_models = []
    for train_idx, valid_idx in kf.split(x_train):
        xx_train, yy_train = x_train[train_idx], y_train[train_idx]
        xx_valid, yy_valid = x_train[valid_idx], y_train[valid_idx]
        # Fit a fresh copy for every fold. Re-fitting and appending the same
        # object would leave the list holding n_splits references to the model
        # from the last fold, so a later vote over the list would be a vote
        # of one model with itself.
        fold_model = deepcopy(m)
        fold_model.fit(xx_train, yy_train)
        train_scores.append(fold_model.score(xx_train, yy_train))
        valid_scores.append(fold_model.score(xx_valid, yy_valid))
        trained_models.append(fold_model)
    return trained_models, np.mean(train_scores), np.mean(valid_scores)

def _choose(pred):
    """ Choose the most    """
    m = {}
    for n in pred:
        m[n] = m.get(n, 0) + 1
    return sorted(m.keys(), key=lambda n:m[n])[-1]

def bulk_predict(trained_models, x_test):
    """Predict with every model and combine the results by majority vote.

    Parameters
    ----------
    trained_models : list
        Fitted models exposing ``predict``.
    x_test : array-like
        Samples handed unchanged to each model's ``predict``.

    Returns
    -------
    numpy.ndarray
        One voted label per sample, chosen by ``_choose`` from the models'
        predictions for that sample. Empty when ``trained_models`` is empty.
    """
    if len(trained_models) == 0:
        return np.array([])
    # Rows = models, columns = samples. A plain ndarray is used instead of the
    # discouraged np.matrix; each prediction is flattened so that a column
    # vector returned by a model still contributes a single row.
    votes = np.array(
        [np.asarray(model.predict(x_test)).ravel() for model in trained_models])
    # Transposing makes each row the full set of votes for one sample.
    return np.array([_choose(sample_votes) for sample_votes in votes.T])


# Run the k-fold training for every configured model, vote the per-fold
# predictions on the test set and export them to "kfold_<title>.csv".
# All warnings raised along the way are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for title, m in models:
        print("\n", title, "Started:")
        # Train on a copy so the estimator stored in `models` is left as-is.
        fold_template = deepcopy(m)
        trained_models, s1, s2 = kfold_train(fold_template, x_train, y_train)
        y_test = bulk_predict(trained_models, x_test)
        print("\tTrainingScore:", s1, "\tValidationScore:", s2)
        output_csv("kfold_{}.csv".format(title), y_test)


 OvR(Ridge) Started:
	TrainingScore: 0.860576719577 	ValidationScore: 0.848547619048

 OvR(Lasso) Started:
	TrainingScore: 0.860574074074 	ValidationScore: 0.848547619048
