In [1]:
#MNIST pair of digits recognition using RandomForest model
#Yi Cong Li (20122756)
#this code is built using inspiration from:
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

#please make sure to run this code in the default Kaggle notebook environment of the competition

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import collections
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/classification-of-mnist-digits/train_result.csv
/kaggle/input/classification-of-mnist-digits/train.csv
/kaggle/input/classification-of-mnist-digits/test.csv


In [2]:
# ---- Features ---------------------------------------------------------------
# Both feature CSVs carry an extra 'Unnamed: 1568' column; it is dropped
# before the remaining values are converted to a float32 matrix.
train_data_path = '/kaggle/input/classification-of-mnist-digits/train.csv'
df = pd.read_csv(train_data_path).drop(columns=['Unnamed: 1568'])
X_data = df.to_numpy().astype(np.float32)

test_data_path = '/kaggle/input/classification-of-mnist-digits/test.csv'
df_test = pd.read_csv(test_data_path).drop(columns=['Unnamed: 1568'])
X_test = df_test.to_numpy().astype(np.float32)

# ---- Labels -----------------------------------------------------------------
# Only the 'Class' column of the label file is used.
train_labels_path = '/kaggle/input/classification-of-mnist-digits/train_result.csv'
df_train_labels = pd.read_csv(train_labels_path)
y_data = df_train_labels['Class'].to_numpy()

# ---- Train / validation split (80% / 20%) -----------------------------------
# Seed NumPy's global generator so the shuffled row order is repeatable.
np.random.seed(6390)
len_train = round(0.80 * len(X_data))
idx = np.arange(len(X_data))
np.random.shuffle(idx)

# The first len_train shuffled rows form the train set, the rest validation.
train_idx, val_idx = idx[:len_train], idx[len_train:]
X_train, y_train = X_data[train_idx], y_data[train_idx]
X_val, y_val = X_data[val_idx], y_data[val_idx]

In [3]:
# Rebalance the training split only (validation data is left untouched):
# 'not majority' asks the sampler to resample every class except the
# majority one. The resampled arrays replace X_train / y_train.
oversample = RandomOverSampler(sampling_strategy='not majority')
X_train, y_train = oversample.fit_resample(X=X_train, y=y_train)

In [4]:
# Random forest classifier; n_estimators and max_depth were chosen by
# exhaustive trials.
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=100,
)
# Fit on the (oversampled) training split; the fitted estimator is the
# cell's displayed value.
model.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=100, n_estimators=500)

In [5]:
# Evaluate the fitted forest on the held-out validation split.
pred_val = model.predict(X_val)

# confusion_matrix(y_true, y_pred): per scikit-learn's convention, rows are
# the true classes and columns are the predicted classes.
print("Confusion Matrix: ")
print(confusion_matrix(y_val, pred_val))

# Fraction of validation samples whose predicted class equals the true class.
# Uses the accuracy_score helper already imported at the top of the notebook
# instead of a hand-rolled computation; the printed label's spelling is fixed.
val_accuracy = accuracy_score(y_val, pred_val)
print("validation accuracy: " + str(val_accuracy))

Confusion Matrix: 
[[105   0   1   0   0   1   3   0   1   1   0   0   0   0   0   0   0   0
    0]
 [  1 177   2   1   0   2   0   4   1   0   2   0   0   0   0   0   0   0
    0]
 [  2   0 302   1   1   0   3   4   4   0   2   0   0   0   0   0   0   0
    0]
 [  0   4   3 398   5  11   1   8  12   8   1   3   1   0   0   0   0   0
    0]
 [  1   3   7  14 395   7  14   3   5  36   7   3   2   2   1   0   0   0
    0]
 [  5   6   1  17  24 481   7  11   6   8  25  21   1   0   0   0   0   0
    0]
 [  3  10   4   5  24  15 558   6  11   7  16  42  20   2   2   0   0   0
    0]
 [  5   3   8   7   3  41  31 561  18  22   6  16  42   8   3   2   0   0
    0]
 [  1   6  10  19  10  10  38  28 719  20  19   9  20  42   9   3   2   0
    0]
 [  2   1  13   9  15  13  12  29  19 721  21  34  23  24  23  13   5   1
    0]
 [  0   1   6  16  14  21  15  10  49  26 627  31  50   4   9  21  12   2
    0]
 [  0   0   2   3   6  11  24  11  14  40  25 539  25  21   6  12  21  11
    0]
 [  0   0

In [6]:
# Predict a class for every test row and write the submission file with two
# columns: a 0-based running 'Index' and the predicted 'Class'.
pred = model.predict(X_test)
output = pd.DataFrame(
    {
        'Index': np.arange(len(X_test)),
        'Class': pred,
    }
)
# index=False keeps the DataFrame's own index out of the CSV.
output.to_csv('submission.csv', index=False)