# Digits dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.linear_model import LassoCV
from boruta import BorutaPy

In [2]:
x_train_digits=pd.read_csv("data/digits_train.data",header=None, sep=" ")

x_train_digits=x_train_digits.drop(columns=[5000])

y_train_digits=pd.read_csv("data/digits_train.labels",header=None, sep=" ")

y_train_digits=y_train_digits.to_numpy().reshape(-1)

x_val_digits=pd.read_csv("data/digits_valid.data",header=None, sep=" ").iloc[:,:5000]

scaler = MinMaxScaler()
scaler.fit(x_train_digits)
x_train=scaler.transform(x_train_digits)
x_val=scaler.transform(x_val_digits)

In [3]:
kbest = SelectKBest(chi2, k=3500)
kbest.fit(x_train_digits, y_train_digits)
x_digits_kbest = kbest.transform(x_train_digits)
x_val = kbest.transform(x_train_digits)

In [None]:
rf_digits = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
feat_selector_digits = BorutaPy(rf_digits, n_estimators='auto', verbose=2, random_state=1, max_iter = 3000)
feat_selector_digits.fit(x_digits_kbest, y_train_digits)

Iteration: 	1 / 3000
Confirmed: 	0
Tentative: 	3500
Rejected: 	0
Iteration: 	2 / 3000
Confirmed: 	0
Tentative: 	3500
Rejected: 	0
Iteration: 	3 / 3000
Confirmed: 	0
Tentative: 	3500
Rejected: 	0
Iteration: 	4 / 3000
Confirmed: 	0
Tentative: 	3500
Rejected: 	0
Iteration: 	5 / 3000
Confirmed: 	0
Tentative: 	3500
Rejected: 	0
Iteration: 	6 / 3000
Confirmed: 	0
Tentative: 	3500
Rejected: 	0
Iteration: 	7 / 3000
Confirmed: 	0
Tentative: 	3500
Rejected: 	0
Iteration: 	8 / 3000
Confirmed: 	1073
Tentative: 	588
Rejected: 	1839
Iteration: 	9 / 3000
Confirmed: 	1073
Tentative: 	588
Rejected: 	1839
Iteration: 	10 / 3000
Confirmed: 	1073
Tentative: 	588
Rejected: 	1839
Iteration: 	11 / 3000
Confirmed: 	1073
Tentative: 	588
Rejected: 	1839
Iteration: 	12 / 3000
Confirmed: 	1125
Tentative: 	536
Rejected: 	1839


In [5]:
x_digits_boruta = feat_selector_digits.transform(x_digits_kbest)
x_val = feat_selector_digits.transform(x_val)

In [62]:
pca = PCA(n_components=40)

pca.fit(x_digits_boruta)

x_train_pca=pca.transform(x_digits_boruta)
x_val_pca=pca.transform(x_val)

In [63]:
forest=RandomForestClassifier().fit(x_train_pca, y_train_digits)

pred = forest.predict(x_train_pca)

balanced_accuracy_score(y_train_digits, pred)

1.0

In [64]:
acc_reg=[0 for x in range(5)]
acc_forest=[0 for x in range(5)]
for i in range(5):
    mask = np.zeros(x_train_pca.shape[0], dtype=bool)
    mask[i::5] = True

    x_test = x_train_pca[mask]
    x_fold = x_train_pca[~mask]
    
    y_test = y_train_digits[mask]
    y_fold = y_train_digits[~mask]
    
    reg = LogisticRegression().fit(x_fold, y_fold)
    pred = reg.predict(x_test)
    acc_reg[i]=balanced_accuracy_score(y_test, pred)
    
    forest=RandomForestClassifier().fit(x_fold, y_fold)
    pred = forest.predict(x_test)
    acc_forest[i]=balanced_accuracy_score(y_test, pred)
print('Forest: ', acc_forest)
print('Logistic regression: ', acc_reg)

Forest:  [0.9725453408380238, 0.9699986666074047, 0.964577376250292, 0.9650253489825682, 0.962230697176502]
Logistic regression:  [0.9572441109026475, 0.9583425930041335, 0.9511648994759621, 0.9574623237724842, 0.9523785628261743]


In [65]:
sum(acc_forest)/5

0.9668754859709582

In [66]:
posterior_probs = forest.predict_proba(x_val_pca)[:,1]
posterior_probs

array([0.94, 0.06, 0.92, ..., 0.02, 0.02, 0.11])

In [67]:
pd.DataFrame({'MATSZY': posterior_probs}).to_csv('MATSZY_digits_prediction.txt', index = None)

In [102]:
vars_kbest = np.where(kbest.get_support())[0]

In [111]:
vars_kbest

array([   2,    6,    7, ..., 4994, 4997, 4999], dtype=int64)

In [116]:
vars_indexes = vars_kbest[feat_selector_digits.support_]
pd.DataFrame({'MATSZY': vars_indexes + 1}).to_csv('MATSZY_digits_features.txt', index = None)