# Digits dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.linear_model import LassoCV
from boruta import BorutaPy

In [11]:
# Load the Digits training/validation data and min-max scale the features.
#
# Both feature files are cut to the same leading N_FEATURES columns so the two
# splits are guaranteed to have identical width. (The files apparently parse
# with one surplus column at position 5000 -- presumably caused by a trailing
# separator on each row; TODO confirm against the raw files.)
N_FEATURES = 5000

x_train_digits = pd.read_csv("data/digits_train.data", header=None, sep=" ").iloc[:, :N_FEATURES]
x_val_digits = pd.read_csv("data/digits_valid.data", header=None, sep=" ").iloc[:, :N_FEATURES]

# Flatten the label frame into the 1-D vector the estimators are fitted on.
y_train_digits = pd.read_csv("data/digits_train.labels", header=None, sep=" ")
y_train_digits = y_train_digits.to_numpy().reshape(-1)

# The scaler is fitted on the training split only and then applied to both
# splits, so no validation statistics leak into the preprocessing.
scaler = MinMaxScaler()
scaler.fit(x_train_digits)
x_train = scaler.transform(x_train_digits)
x_val = scaler.transform(x_val_digits)

In [13]:
# Read the previously exported feature list (column "MATSZY"). The file holds
# 1-based feature numbers, so subtract 1 to obtain 0-based array positions.
selected_features = pd.read_csv("MATSZY_digits_features.txt")
col = selected_features["MATSZY"].to_numpy() - 1

In [14]:
# Keep only the selected feature columns of the scaled validation matrix.
# The matrix is rebuilt from the raw frame rather than sliced in place, so
# re-running this cell cannot apply the selection to an already-reduced array.
x_val = scaler.transform(x_val_digits)[:, col]

In [15]:
# Keep only the selected feature columns of the scaled training matrix.
# The matrix is rebuilt from the raw frame rather than sliced in place, so
# re-running this cell cannot apply the selection to an already-reduced array.
x_train = scaler.transform(x_train_digits)[:, col]

In [17]:
# Reduce the selected features to a fixed number of principal components.
N_COMPONENTS = 40

# random_state pins the decomposition whenever PCA's automatic solver choice
# ends up using a randomized SVD, which keeps downstream scores reproducible.
pca = PCA(n_components=N_COMPONENTS, random_state=0)

# Components are learned from the training split only ...
pca.fit(x_train)

# ... and the same projection is then applied to both splits.
x_train_pca = pca.transform(x_train)
x_val_pca = pca.transform(x_val)

In [18]:
# Sanity check: fit a random forest on the full training set and score it on
# that same data. This is a resubstitution score (the model has already seen
# every sample), so it says nothing about generalisation.
# random_state fixes the bootstrap/feature sampling so the cell is repeatable.
forest = RandomForestClassifier(random_state=0).fit(x_train_pca, y_train_digits)

pred = forest.predict(x_train_pca)

balanced_accuracy_score(y_train_digits, pred)

1.0

In [19]:
# Manual k-fold cross-validation on the PCA features, comparing logistic
# regression with a random forest by balanced accuracy.
# Fold i holds out every N_FOLDS-th row starting at row i (interleaved folds),
# so the split itself is deterministic.
N_FOLDS = 5

acc_reg = []
acc_forest = []
for i in range(N_FOLDS):
    # Boolean mask marking the held-out rows of this fold.
    mask = np.zeros(x_train_pca.shape[0], dtype=bool)
    mask[i::N_FOLDS] = True

    x_test, x_fold = x_train_pca[mask], x_train_pca[~mask]
    y_test, y_fold = y_train_digits[mask], y_train_digits[~mask]

    reg = LogisticRegression().fit(x_fold, y_fold)
    pred = reg.predict(x_test)
    acc_reg.append(balanced_accuracy_score(y_test, pred))

    # Seeded so that the per-fold forest scores can be reproduced on a re-run.
    forest = RandomForestClassifier(random_state=0).fit(x_fold, y_fold)
    pred = forest.predict(x_test)
    acc_forest.append(balanced_accuracy_score(y_test, pred))
print('Forest: ', acc_forest)
print('Logistic regression: ', acc_reg)

Forest:  [0.9724619553887847, 0.9716765189563981, 0.9596318383604624, 0.9625182304326689, 0.9629164993978322]
Logistic regression:  [0.9572441109026475, 0.9583537046090937, 0.9485697437666196, 0.9574623237724842, 0.9531647263481868]


In [20]:
# Mean balanced accuracy of the forest across the cross-validation folds.
# Dividing by the list length keeps this correct if the fold count changes.
sum(acc_forest) / len(acc_forest)

0.9658410085072292

In [21]:
# Refit the forest on the whole training set; this model produces the
# validation predictions. Seeded so the exported predictions are reproducible.
forest = RandomForestClassifier(random_state=0).fit(x_train_pca, y_train_digits)

In [24]:
# Per-class probabilities for every validation sample; keep the second column
# (index 1) as the score that gets exported.
val_class_probs = forest.predict_proba(x_val_pca)
posterior_probs = val_class_probs[:, 1]
posterior_probs

array([0.56, 0.95, 0.12, 0.96, 0.92, 0.06, 0.02, 0.05, 0.01, 0.03, 0.98,
       0.82, 0.24, 0.94, 0.33, 0.82, 0.96, 0.9 , 0.95, 1.  , 0.1 , 0.95,
       0.77, 0.75, 0.87, 0.07, 0.97, 0.99, 0.37, 0.73, 0.8 , 0.57, 0.21,
       0.98, 0.53, 0.98, 0.49, 1.  , 0.22, 0.2 , 0.98, 0.06, 0.82, 0.97,
       0.17, 0.22, 0.9 , 0.3 , 0.84, 0.15, 0.42, 0.02, 0.02, 0.02, 0.87,
       0.98, 0.32, 0.04, 0.51, 0.94, 0.94, 0.02, 0.07, 0.9 , 0.31, 0.12,
       1.  , 0.97, 0.9 , 0.89, 0.34, 0.27, 0.41, 1.  , 0.89, 0.9 , 0.35,
       0.08, 0.98, 0.98, 0.25, 0.99, 0.58, 0.79, 0.4 , 0.91, 0.21, 0.98,
       0.37, 0.7 , 0.84, 0.44, 0.25, 0.04, 0.97, 0.12, 0.98, 0.14, 0.56,
       0.99, 0.05, 0.19, 0.65, 0.94, 0.99, 0.82, 0.04, 0.95, 0.94, 0.03,
       0.71, 0.16, 0.54, 0.2 , 0.82, 0.98, 0.03, 0.  , 0.31, 0.78, 0.03,
       0.99, 0.17, 0.64, 0.19, 0.15, 0.98, 0.11, 0.04, 0.22, 0.21, 0.21,
       0.1 , 0.46, 0.05, 0.89, 0.9 , 0.98, 0.06, 0.86, 0.89, 0.9 , 0.07,
       0.96, 0.82, 0.8 , 0.04, 0.95, 0.78, 0.02, 0.

In [25]:
# Write the validation scores as a single-column CSV with header "MATSZY".
# index=False is the documented way to omit the row index from the file.
predictions = pd.DataFrame({'MATSZY': posterior_probs})
predictions.to_csv('MATSZY_digits_prediction.txt', index=False)

In [102]:
# Turn the selector's boolean support mask into the positions of the kept
# columns.
# NOTE(review): `kbest` is not defined in any cell shown in this notebook --
# presumably a fitted SelectKBest from a removed cell; confirm before re-running.
kbest_mask = kbest.get_support()
vars_kbest = np.where(kbest_mask)[0]

In [111]:
# Display the 0-based positions of the columns kept by the first selection step.
vars_kbest

array([   2,    6,    7, ..., 4994, 4997, 4999], dtype=int64)

In [116]:
# Narrow the first-stage selection with the second selector's support mask and
# export the surviving features as 1-based numbers (hence the + 1).
# NOTE(review): `feat_selector_digits` is not defined in any cell shown in this
# notebook -- presumably a fitted BorutaPy selector; confirm before re-running.
# The file written here is the one read near the top of the notebook to build
# `col`, so this cell has to have been run at least once beforehand.
vars_indexes = vars_kbest[feat_selector_digits.support_]
selected = pd.DataFrame({'MATSZY': vars_indexes + 1})
# index=False is the documented way to omit the row index from the file.
selected.to_csv('MATSZY_digits_features.txt', index=False)