In [164]:
from sklearn.datasets import load_iris
import numpy as np


def bootstrap(length, seed=None):
    """
    Split ``range(length)`` into bootstrap train / out-of-bag test indexes.

    ``length`` integers are drawn uniformly, with replacement, from
    ``[0, length)``. The distinct values drawn become the train indexes;
    every index that was never drawn goes to the test set. Repeated draws
    are collapsed, so each index appears at most once in the train list.

    Args:
        length: number of samples in the dataset.
        seed: optional seed for a private ``RandomState`` so the split can
            be reproduced. When ``None`` (the default) the global
            ``np.random`` state is used.

    Return:
        (train, test): two disjoint, ascending lists of ints whose union
        is ``range(length)``. Both are empty when ``length <= 0``.
    """
    if length <= 0:
        return [], []

    rng = np.random if seed is None else np.random.RandomState(seed)
    # One vectorised draw instead of ``length`` scalar calls.
    drawn = set(rng.randint(0, length, size=length).tolist())

    # Sort so the result does not depend on set iteration order.
    train = sorted(drawn)
    test = sorted(set(range(length)) - drawn)
    return train, test

# Work with the first 100 iris rows only (presumably the first two classes,
# which keeps the task binary -- confirm against the dataset ordering).
iris = load_iris()
data_x, data_y = iris.data[:100], iris.target[:100]

# Draw one bootstrap split over the row positions and print it.
idx_train, idx_test = bootstrap(len(data_x))
print("*" * 10 + " 4.1 Demonstration of bootstrap " + "*" * 10)
print("Train: ", idx_train)
print("Test : ", idx_test)
print("*" * 50)

# Index the feature and label arrays with the two index lists.
train_x, test_x = data_x[idx_train], data_x[idx_test]
train_y, test_y = data_y[idx_train], data_y[idx_test]

********** 4.1 Demonstration of bootstrap **********
Train:  [1, 2, 3, 6, 7, 10, 11, 17, 18, 21, 23, 24, 26, 27, 28, 29, 30, 31, 37, 39, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 56, 60, 62, 65, 66, 67, 68, 69, 70, 72, 73, 75, 77, 79, 80, 81, 82, 83, 85, 88, 89, 90, 91, 92, 93, 98, 99]
Test :  [0, 4, 5, 8, 9, 12, 13, 14, 15, 16, 19, 20, 22, 25, 32, 33, 34, 35, 36, 38, 40, 41, 43, 53, 55, 57, 58, 59, 61, 63, 64, 71, 74, 76, 78, 84, 86, 87, 94, 95, 96, 97]
**************************************************


In [169]:
from sklearn import linear_model
from sklearn import metrics
from sklearn import svm


def score(model, tr_x, tr_y, test_x, test_y):
    """
    Fit ``model`` on the training split and print its test-split metrics.

    Prints the confusion matrix followed by accuracy, precision, recall
    and F1, each computed with the default arguments of the corresponding
    ``metrics`` function. Nothing is returned.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        tr_x, tr_y: training features and labels.
        test_x, test_y: held-out features and labels.
    """
    model.fit(tr_x, tr_y)
    predicted = model.predict(test_x)
    expected = test_y

    accuracy = metrics.accuracy_score(expected, predicted)
    cm = metrics.confusion_matrix(expected, predicted)
    precision = metrics.precision_score(expected, predicted)
    recall = metrics.recall_score(expected, predicted)
    f1 = metrics.f1_score(expected, predicted)

    print("Confusion Matrix: \n", cm)
    report = (
        ("Accuracy: %.2f", accuracy),
        ("Precision: %.2f", precision),
        ("Recall: %.2f", recall),
        ("F1: %.2f\n", f1),
    )
    for template, value in report:
        print(template % value)

# Score every classifier on the same iris bootstrap split. Each entry pairs
# the heading to print with a freshly constructed estimator; the first
# heading names the estimator as sklearn spells it (SGDClassifier).
for heading, estimator in [
    (" 4.2 Score of SGDClassifier ", linear_model.SGDClassifier()),
    (" 4.3 Score of SVC ", svm.SVC()),
    (" 4.4 Score of LogisticRegression ", linear_model.LogisticRegression()),
    (" 4.5 Score of SVC(gamma=0.001, C=100) ", svm.SVC(gamma=0.001, C=100)),
]:
    print("*" * 5 + heading + "*" * 5)
    score(estimator, train_x, train_y, test_x, test_y)

***** 4.2 Score of SDGClassifier *****
Confusion Matrix: 
 [[23  0]
 [ 0 19]]
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1: 1.00

***** 4.3 Score of SVC *****
Confusion Matrix: 
 [[23  0]
 [ 0 19]]
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1: 1.00

***** 4.4 Score of LogisticRegression *****
Confusion Matrix: 
 [[23  0]
 [ 0 19]]
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1: 1.00

***** 4.5 Score of SVC(gamma=0.001, C=100) *****
Confusion Matrix: 
 [[23  0]
 [ 0 19]]
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1: 1.00



In [156]:
# Placeholder cell: only announces the exercise; no ROC curve is drawn here.
print("Exercise 4.4 & 4.5", "Draw ROC curve.", sep="\n")

Exercise 4.4 & 4.5


In [173]:
import pandas
# Load the labelled digit data. The slicing below treats column 0 as the
# label and the remaining columns as features (presumably the Kaggle
# digit-recognizer layout -- confirm against dr_train.csv).
dr_train_df = pandas.read_csv("dr_train.csv")
dr_train_values = dr_train_df.values

# Bootstrap split over row positions; rows never drawn form the held-out set.
# Reads the frame loaded above, so the cell runs on a fresh kernel.
idx_train, idx_test = bootstrap(dr_train_df.shape[0])
dr_train_x = dr_train_values[idx_train, 1:]
dr_train_y = dr_train_values[idx_train, 0]
dr_test_x = dr_train_values[idx_test, 1:]
dr_test_y = dr_train_values[idx_test, 0]

# Rows to predict on; every column is used as a feature.
dr_pred_df = pandas.read_csv("dr_test.csv")
dr_pred_x = dr_pred_df.values

In [176]:
def timeit(func):
    """
    Decorator that prints how long each call to ``func`` takes.

    The wrapper forwards all positional and keyword arguments, prints
    ``>>> Cost <seconds> secs`` after ``func`` returns, and hands back
    ``func``'s return value unchanged. Nothing is printed if ``func``
    raises.

    Args:
        func: the callable to time.

    Returns:
        A wrapper that keeps ``func``'s name and docstring.
    """
    import functools
    import time

    @functools.wraps(func)  # keep __name__/__doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so system clock adjustments cannot
        # skew the measured interval.
        started = time.perf_counter()
        ret = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print(">>> Cost %.3f secs" % elapsed)
        return ret
    return wrapper

@timeit
def score_accuracy(model, tr_x, tr_y, test_x, test_y):
    """
    Fit ``model`` on the training split and print its test-split accuracy.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        tr_x, tr_y: training features and labels.
        test_x, test_y: held-out features and labels.
    """
    model.fit(tr_x, tr_y)
    predictions = model.predict(test_x)
    print(">>> Accuracy: %.5f" % metrics.accuracy_score(test_y, predictions))

# Candidate classifiers, keyed by the label printed above each result.
models = {}
models["SDG"] = linear_model.SGDClassifier()
models["SVC"] = svm.SVC()
models["LogisticRegression"] = linear_model.LogisticRegression()
models["SVC(gamma=0.001, C=100)"] = svm.SVC(gamma=0.001, C=100)

# Fit and score every candidate on the same digit train/test split.
for name, m in models.items():
    print(">>> " + name)
    score_accuracy(
        m,
        dr_train_x,
        dr_train_y,
        dr_test_x,
        dr_test_y,
    )
    print()

>>> SDG
>>> Accuracy: 0.85

>>> SVC
>>> Accuracy: 0.11

>>> LogisticRegression
>>> Accuracy: 0.89

>>> SVC(gamma=0.001, C=100)
>>> Accuracy: 0.11



In [179]:
# Fit a fresh logistic regression on the digit training rows, then label
# the rows loaded into dr_pred_x. fit() returns the estimator itself.
lr = linear_model.LogisticRegression().fit(dr_train_x, dr_train_y)
dr_pred_y = lr.predict(dr_pred_x)
print(dr_pred_y)

[2 0 9 ..., 3 9 2]


In [185]:
# Write the predictions as a two-column CSV; ImageId numbers the rows from 1.
df = pandas.DataFrame(
    {
        "ImageId": range(1, len(dr_pred_y) + 1),
        "Label": dr_pred_y,
    }
)
df.to_csv("dr_submission.csv", index=False)