In [92]:
from pathlib import Path
import sys
import os
import re
import warnings
import scipy 
from collections import Counter
from datetime import datetime
from dateutil import relativedelta

import pandas as pd
import numpy as np
import math
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from configparser import ConfigParser
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)


%load_ext autoreload
%autoreload 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
def get_dataset(filename):
    with open(filename) as f:
        x = f.readlines()
        splitted = [row[:-1].split() for row in x]
        converted = [list(map(lambda y: float(y), x)) for x in splitted]
        df  = pd.DataFrame(converted, columns = ['digit', 'intensity', 'symmetry'])
        df['digit'] = df['digit'].apply(lambda x: int(x))
        return df

In [30]:
train_df = get_dataset('hand_train.txt')
test_df = get_dataset('hand_test.txt')

# Polynomial kernels

In [44]:
def get_one_vs_all_dataset(num, df):
    target = df['digit'].apply(lambda x: 1 if x == num else -1)
    return df[['intensity', 'symmetry']], target

In [59]:
def train_and_get_e_in(train_df, test_df, number, C, degree):
    X_train, y_train = get_one_vs_all_dataset(number, train_df)
    X_test, y_test = get_one_vs_all_dataset(0, test_df)
    svc = SVC(kernel='poly', degree=degree, C=C)
    svc.fit(X_train, y_train)
    y_train_predict = svc.predict(X_train)
    y_test_predict = svc.predict(X_test)
    in_sample_error = 1 - accuracy_score(y_train, y_train_predict)
    out_sample_error = 1 - accuracy_score(y_test, y_test_predict)
    print(svc.n_support_)
    return in_sample_error, out_sample_error

In [56]:
C = 0.01
Q = 2

In [60]:
def compare_e_in(digits):
    for d in digits:
        e_in, e_out = train_and_get_e_in(train_df, test_df, d, C, Q)
        print(f"number {d}: E_in: {e_in}, E_out: {e_out}")

In [61]:
compare_e_in([0, 2, 4, 6, 8])

[1199 1194]
number 0: E_in: 0.16376354409546012, E_out: 0.17887394120577982
[732 731]
number 2: E_in: 0.10026059525442321, E_out: 0.17887394120577982
[660 652]
number 4: E_in: 0.08942531888629812, E_out: 0.17887394120577982
[665 664]
number 6: E_in: 0.09107118365107669, E_out: 0.17887394120577982
[550 542]
number 8: E_in: 0.074338225209162, E_out: 0.17887394120577982


Highest E_in for number 0 - [a]

In [62]:
compare_e_in([1, 3, 5, 7, 9])

[774 774]
number 1: E_in: 0.02825401179536413, E_out: 0.28948679621325357
[671 658]
number 3: E_in: 0.09024825126868741, E_out: 0.17887394120577982
[562 556]
number 5: E_in: 0.07625840076807022, E_out: 0.17887394120577982
[650 645]
number 7: E_in: 0.08846523110684401, E_out: 0.17887394120577982
[645 644]
number 9: E_in: 0.08832807570977919, E_out: 0.17887394120577982


Lowest E_in for number 1 - [a]

4 question - 1200 [b] !!! check, correct - c

In [67]:
def get_one_vs_one(first_num, second_num, df):
    df_to_check = df[(df.digit == first_num) | (df.digit == second_num)]
    target = df_to_check['digit'].apply(lambda x: 1 if x == first_num else -1)
    return df_to_check[['intensity', 'symmetry']], target

In [68]:
X_train, y_train = get_one_vs_one(1, 5, train_df)
X_test, y_test = get_one_vs_one(1, 5, test_df)

In [75]:
C = [0.0001, 0.001, 0.01, 0.1, 1]
Q = 2

In [72]:
def train_and_get_e_in(X_train, y_train, X_test, y_test, C, degree):
    svc = SVC(kernel='poly', degree=degree, C=C)
    svc.fit(X_train, y_train)
    y_train_predict = svc.predict(X_train)
    y_test_predict = svc.predict(X_test)
    in_sample_error = 1 - accuracy_score(y_train, y_train_predict)
    out_sample_error = 1 - accuracy_score(y_test, y_test_predict)
    return in_sample_error, out_sample_error, svc.n_support_

In [76]:
for c in C:
    e_in, e_out, num_of_support = train_and_get_e_in(X_train, y_train, X_test, y_test, c, Q)
    print(f"C: {c}, e_in: {e_in}, e_out: {e_out}, num of support_vectors: {num_of_support}")

C: 0.0001, e_in: 0.3452914798206278, e_out: 0.3514150943396226, num of support_vectors: [556 556]
C: 0.001, e_in: 0.026265214606021825, e_out: 0.037735849056603765, num of support_vectors: [284 284]
C: 0.01, e_in: 0.008968609865470878, e_out: 0.02358490566037741, num of support_vectors: [88 88]
C: 0.1, e_in: 0.005124919923126248, e_out: 0.018867924528301883, num of support_vectors: [37 37]
C: 1, e_in: 0.005124919923126248, e_out: 0.021226415094339646, num of support_vectors: [26 26]


5 - [a], [d]   what about a?

In [77]:
Q = 5
for c in C:
    e_in, e_out, num_of_support = train_and_get_e_in(X_train, y_train, X_test, y_test, c, Q)
    print(f"C: {c}, e_in: {e_in}, e_out: {e_out}, num of support_vectors: {num_of_support}")

C: 0.0001, e_in: 0.06085842408712361, e_out: 0.09198113207547165, num of support_vectors: [216 216]
C: 0.001, e_in: 0.034593209481101894, e_out: 0.05660377358490565, num of support_vectors: [131 131]
C: 0.01, e_in: 0.029468289557975647, e_out: 0.04245283018867929, num of support_vectors: [111 111]
C: 0.1, e_in: 0.03139013452914796, e_out: 0.05896226415094341, num of support_vectors: [107 107]
C: 1, e_in: 0.01665598975016014, e_out: 0.028301886792452824, num of support_vectors: [58 59]


6 - b, c  what about c?

# Cross validation

In [90]:
param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1],
              'kernel': ['poly'], 'degree': [2]}
c_selection = defaultdict(int)


for i in range(100): 
    kf = StratifiedKFold(n_splits=10)    
    grid = GridSearchCV(SVC(), param_grid, cv = kf)
    grid.fit(X_train, y_train)
    c_selection[grid.best_params_['C']] = c_selection[grid.best_params_['C']] + 1

In [91]:
c_selection

defaultdict(int, {1: 100})

7 - [e]   !!! b is correct

In [93]:
svc = SVC(kernel='poly', degree=2)
scores = []
for i in range(100):
    scores.append(cross_val_score(svc, X_train, y_train, cv=10))

In [96]:
1 - np.mean(scores)

0.00512820512820511

8 - [c]

# RBF

In [97]:
C = [0.01, 1, 100, 10**4, 10**6]

In [98]:
def train_and_get_e_in(X_train, y_train, X_test, y_test, C):
    svc = SVC(kernel='rbf',  C=C)
    svc.fit(X_train, y_train)
    y_train_predict = svc.predict(X_train)
    y_test_predict = svc.predict(X_test)
    in_sample_error = 1 - accuracy_score(y_train, y_train_predict)
    out_sample_error = 1 - accuracy_score(y_test, y_test_predict)
    return in_sample_error, out_sample_error

In [99]:
for c in C:
    e_in, e_out = train_and_get_e_in(X_train, y_train, X_test, y_test, c)
    print(f"C: {c}, e_in: {e_in}, e_out: {e_out}")

C: 0.01, e_in: 0.005765534913516945, e_out: 0.018867924528301883
C: 1, e_in: 0.004484304932735439, e_out: 0.02358490566037741
C: 100, e_in: 0.0032030749519538215, e_out: 0.021226415094339646
C: 10000, e_in: 0.002562459961563124, e_out: 0.021226415094339646
C: 1000000, e_in: 0.002562459961563124, e_out: 0.02594339622641506


9 - [e]

10 - [a] !!! correct is c