#### Getting the codes ready.

In [2]:
import pandas as pd
train_data = pd.read_csv("~/digit-recognizer/train.csv")
test_data = pd.read_csv("~/digit-recognizer/test.csv")

In [3]:
# take target out of training set
Y = train_data['label']
train_data = train_data.drop(['label'], axis=1)

In [4]:
train_data

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# remove the features that are all zeros
total_data = train_data
total_data = total_data.append(test_data)
for col in total_data.columns:
    if len(total_data[total_data[col] == 0]) == len(total_data[col]):
        train_data = train_data.drop([col], axis=1)
        test_data = test_data.drop([col], axis=1)

  total_data = total_data.append(test_data)


In [6]:
# min-max scaling
train_data = (train_data - train_data.min()) / (train_data.max() - train_data.min())
test_data = (test_data - test_data.min()) / (test_data.max() - test_data.min())

In [7]:
def countMissing(data):
    missing = data.columns[data.isnull().any()].tolist()
    return missing
misTrain = countMissing(train_data)
misTest = countMissing(test_data)
misTotal = list(set().union(misTrain, misTest))

In [8]:
def imputation(data, column, value):
    data.loc[data[column].isnull(), column] = value

In [9]:
print(misTrain, len(misTrain))

['pixel139', 'pixel141', 'pixel196', 'pixel392', 'pixel420', 'pixel421', 'pixel448', 'pixel532', 'pixel645', 'pixel731', 'pixel760'] 11


In [10]:
print(misTest, len(misTest))

['pixel12', 'pixel13', 'pixel14', 'pixel15', 'pixel32', 'pixel51', 'pixel58', 'pixel113', 'pixel364', 'pixel588', 'pixel615', 'pixel616', 'pixel643', 'pixel698', 'pixel702', 'pixel726', 'pixel753', 'pixel779'] 18


In [11]:
for ele in misTrain:
    imputation(train_data, ele, 1)
    
for ele in misTest:
    imputation(test_data, ele, 1)

In [12]:
# One-Vs-Rest with regularized logistic regression models
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
# edit by Keith: solver='liblinear'
res = OneVsRestClassifier(linear_model.LogisticRegression(penalty='l1', solver='liblinear')).fit(train_data, Y).predict(test_data)


In [13]:
res

array([2, 0, 9, ..., 3, 9, 2])

In [14]:
# One-Vs-One with regularized logistic regression models
from sklearn.multiclass import OneVsOneClassifier
from sklearn import linear_model
# edit by Keith: solver='liblinear'
res = OneVsOneClassifier(linear_model.LogisticRegression(penalty='l1', solver='liblinear')).fit(train_data, Y).predict(test_data)

In [15]:
res

array([2, 0, 8, ..., 3, 9, 2])

In [16]:
# edit by Keith: cross_val_score
from sklearn.model_selection import cross_val_score
alphas = [0.01, 0.1, 1.0, 10]
regs = ["l1", "l2"]
scores = []
param = []
for alpha in alphas:
    for reg in regs:
        lm = OneVsRestClassifier(linear_model.LogisticRegression(penalty=reg, C=alpha, solver='liblinear'))
        scores.append(cross_val_score(lm, train_data, Y, scoring="accuracy", cv=2).mean())
        param.append([alpha, reg])
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

    parameter     score
4   [1.0, l1]  0.912000
5   [1.0, l2]  0.910714
3   [0.1, l2]  0.910262
7    [10, l2]  0.904881
6    [10, l1]  0.902667
2   [0.1, l1]  0.902286
1  [0.01, l2]  0.898929
0  [0.01, l1]  0.852310


In [18]:
from sklearn.model_selection import cross_val_score
alphas = [0.01, 0.1, 1.0, 10]
regs = ["l1", "l2"]
scores = []
param = []
for alpha in alphas:
    for reg in regs:
        lm = OneVsOneClassifier(linear_model.LogisticRegression(penalty=reg, C=alpha, solver='liblinear'))
        scores.append(cross_val_score(lm, train_data, Y, scoring="accuracy", cv = 2).mean())
        param.append([alpha, reg])
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

    parameter     score
5   [1.0, l2]  0.934429
3   [0.1, l2]  0.933190
4   [1.0, l1]  0.932262
7    [10, l2]  0.927167
6    [10, l1]  0.923167
1  [0.01, l2]  0.918524
2   [0.1, l1]  0.914429
0  [0.01, l1]  0.837143


In [19]:
from sklearn.model_selection import cross_val_score

# Expanded range of parameters.
alphas = [0.001, 0.01, 0.1, 1.0, 10, 100]
#

regs = ["l1", "l2"]
scores = []
param = []
for alpha in alphas:
    for reg in regs:
        lm = OneVsRestClassifier(linear_model.LogisticRegression(penalty=reg, C=alpha, solver='liblinear'))
        scores.append(cross_val_score(lm, train_data, Y, scoring="accuracy", cv=2).mean())
        param.append([alpha, reg])
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

      parameter     score
6     [1.0, l1]  0.912119
7     [1.0, l2]  0.910714
5     [0.1, l2]  0.910262
9      [10, l2]  0.904881
8      [10, l1]  0.902762
4     [0.1, l1]  0.902190
3    [0.01, l2]  0.898929
11    [100, l2]  0.898714
10    [100, l1]  0.893310
1   [0.001, l2]  0.869548
2    [0.01, l1]  0.852619
0   [0.001, l1]  0.446976


From above we can say that the alpha = 1.0, l1 regulirization making the best classifications with accuracy of 0.912


-

In [20]:
from sklearn.model_selection import cross_val_score

# Expanded range of parameters.
alphas = [0.001, 0.01, 0.1, 1.0, 10, 100]
#
regs = ["l1", "l2"]
scores = []
param = []
for alpha in alphas:
    for reg in regs:
        lm = OneVsOneClassifier(linear_model.LogisticRegression(penalty=reg, C=alpha, solver='liblinear'))
        scores.append(cross_val_score(lm, train_data, Y, scoring="accuracy", cv = 2).mean())
        param.append([alpha, reg])
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

      parameter     score
7     [1.0, l2]  0.934429
5     [0.1, l2]  0.933190
6     [1.0, l1]  0.932095
9      [10, l2]  0.927167
8      [10, l1]  0.922548
10    [100, l1]  0.920310
11    [100, l2]  0.920214
3    [0.01, l2]  0.918524
4     [0.1, l1]  0.914310
1   [0.001, l2]  0.886119
2    [0.01, l1]  0.837071
0   [0.001, l1]  0.205810


From above we can say that the alpha = 1.0, l2 regulirization making the best classifications with accuracy of 0.934

In [21]:

# Expanded range of alpha values and regularization types
alphas = [0.001, 0.01, 0.1, 1.0, 10, 100]
regs = ["l1", "l2"]
scores1 = []
param1 = []
scores2 = []
param2 = []

for alpha in alphas:
    for reg in regs:
        lm1 = OneVsRestClassifier(linear_model.LogisticRegression(penalty=reg, C=alpha, solver='liblinear'))
        scores1.append(cross_val_score(lm1, train_data, Y, scoring="accuracy", cv = 2).mean())
        param1.append([alpha, reg])
        
        
    for reg in regs:
        lm2 = OneVsOneClassifier(linear_model.LogisticRegression(penalty=reg, C=alpha, solver='liblinear'))
        scores2.append(cross_val_score(lm2, train_data, Y, scoring="accuracy", cv = 2).mean())
        param2.append([alpha, reg])
        
        
    
##
scores1 = pd.DataFrame({'parameter': param1, 'score': scores1})
print('One vs Rest :')
print(scores1.sort_values(by = 'score', ascending = False))
##
scores2 = pd.DataFrame({'parameter': param2, 'score': scores2})
print('One vs One :')
print(scores2.sort_values(by = 'score', ascending = False))
##

One vs Rest :
      parameter     score
6     [1.0, l1]  0.911976
7     [1.0, l2]  0.910714
5     [0.1, l2]  0.910262
9      [10, l2]  0.904881
8      [10, l1]  0.902786
4     [0.1, l1]  0.902262
3    [0.01, l2]  0.898929
11    [100, l2]  0.898714
10    [100, l1]  0.893238
1   [0.001, l2]  0.869548
2    [0.01, l1]  0.852357
0   [0.001, l1]  0.446881
One vs One :
      parameter     score
7     [1.0, l2]  0.934429
5     [0.1, l2]  0.933190
6     [1.0, l1]  0.932571
9      [10, l2]  0.927167
8      [10, l1]  0.922571
11    [100, l2]  0.920214
10    [100, l1]  0.919595
3    [0.01, l2]  0.918524
4     [0.1, l1]  0.914429
1   [0.001, l2]  0.886119
2    [0.01, l1]  0.836905
0   [0.001, l1]  0.205810


### In Module 11, we only learned softmax regression models in RBF and PCA. Please adopt One-Vs-One or One-Vs-The-Rest instead in RBF and PCA. Post Python codes in your solution and report the best performance you’ve got on testing set for your codes of One-Vs-One and One-Vs-The-Rest separately. (4 points)

In [28]:
from sklearn.model_selection import cross_val_score

# Expanded range of parameters.
alphas = [0.001, 0.01, 0.1, 1.0, 10, 100]
#

regs = ["l1", "l2"]
scores = []
param = []
for alpha in alphas:
    for reg in regs:
        lm = OneVsRestClassifier(linear_model.LogisticRegression(penalty=reg, C=alpha, solver='liblinear'))
        scores.append(cross_val_score(lm, train_data, Y, scoring="accuracy", cv=2).mean())
        param.append([alpha, reg])
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

      parameter     score
6     [1.0, l1]  0.911952
7     [1.0, l2]  0.910714
5     [0.1, l2]  0.910262
9      [10, l2]  0.904881
8      [10, l1]  0.902619
4     [0.1, l1]  0.902286
3    [0.01, l2]  0.898929
11    [100, l2]  0.898714
10    [100, l1]  0.893310
1   [0.001, l2]  0.869548
2    [0.01, l1]  0.852238
0   [0.001, l1]  0.446429


In [29]:
res = OneVsRestClassifier(linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=1.0)).fit(train_data, Y).predict(test_data)
print(res)

[2 0 9 ... 3 9 2]


In [30]:
from sklearn.model_selection import cross_val_score

# Expanded range of parameters.
alphas = [0.001, 0.01, 0.1, 1.0, 10, 100]
#
regs = ["l1", "l2"]
scores = []
param = []
for alpha in alphas:
    for reg in regs:
        lm = OneVsOneClassifier(linear_model.LogisticRegression(penalty=reg, C=alpha, solver='liblinear'))
        scores.append(cross_val_score(lm, train_data, Y, scoring="accuracy", cv = 2).mean())
        param.append([alpha, reg])
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

      parameter     score
7     [1.0, l2]  0.934429
5     [0.1, l2]  0.933190
6     [1.0, l1]  0.932476
9      [10, l2]  0.927167
8      [10, l1]  0.923024
10    [100, l1]  0.920262
11    [100, l2]  0.920214
3    [0.01, l2]  0.918524
4     [0.1, l1]  0.914476
1   [0.001, l2]  0.886119
2    [0.01, l1]  0.837071
0   [0.001, l1]  0.205810


In [31]:
res = OneVsRestClassifier(linear_model.LogisticRegression(penalty='l2', solver='liblinear', C=1.0)).fit(train_data, Y).predict(test_data)
print(res)

[2 0 8 ... 3 9 2]


#### KNN with PCA

In [None]:
from sklearn.decomposition import PCA
# edit by Keith: cross_val_score
from sklearn.model_selection import cross_val_score
alphas = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10, 30, 100, 300, 1000, 3000]
Ks = [10, 100, 500]
scores = []
param = []
for K in Ks: 
    pca = PCA(n_components = K, svd_solver='arpack')
    pca.fit(train_data)
    pca_train = pca.transform(train_data)
    for alpha in alphas:
        lm = linear_model.LogisticRegression(C = alpha, multi_class = 'multinomial', solver = 'lbfgs')
        scores.append(cross_val_score(lm, pca_train, Y, scoring="accuracy", cv = 10).mean())
        param.append([K, alpha])
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

In [33]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 500, svd_solver='arpack')
pca.fit(train_data)
pca_train = pca.transform(train_data)
pca_test = pca.transform(test_data)

In [34]:
from sklearn.neighbors import KNeighborsClassifier
# edit by Keith: cross_val_score
from sklearn.model_selection import cross_val_score
nigs = [1, 10, 20]
scores = []
param = []
for n in nigs:
    neigh = KNeighborsClassifier(n_neighbors = n)
    scores.append(cross_val_score(neigh, pca_train, Y, scoring="accuracy", cv = 10).mean())
    param.append([n])
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

  parameter     score
0       [1]  0.967429
1      [10]  0.964548
2      [20]  0.958357


Best performance can be observed can be '1', But its overfitted, so we can consider 10 as the best choice of neighbours. This lands accuracy near 0.964

#### KNN with RBF

In [35]:
import numpy as np
import math
# RBF function
def rbf(data, centers, sigma):
    res = np.ndarray(shape = (len(data), len(centers)))
    i = 0
    for row in data:
        tmp = []
        for center in centers:
            tmp.append(math.exp(-1.0 * sum(np.square(row - center)) / (2 * sigma * sigma)))
        res[i, :] = tmp
        i = i + 1
    return res
# select k centers from data
def selectCenters(data, k):
    tmp = np.random.choice(len(data), k)
    return data[tmp,:]

In [37]:
alphas = [0.1, 0.5, 1.0, 10]
sigmas = [1.0, 2.0, 3.0, 4.0]
Ks = [10, 100, 500]
scores = []
param = []

for K in Ks:
    centers = selectCenters(train_data.to_numpy(), K)
    for sigma in sigmas:
        rbfX = rbf(train_data.to_numpy(), centers, sigma)
        
        for a in alphas:
            knn = KNeighborsClassifier(n_neighbors=5) 
            scores.append(cross_val_score(knn, rbfX, Y, scoring="accuracy", cv=10).mean())
            param.append([K, sigma, a])
            
scores_df = pd.DataFrame({'parameter': param, 'score': scores})