# TASK 1

In this task, you need to use a publicly available simple MNIST dataset and build 3 classification
models around it. It should be the following models:
1) Random Forest;
2) Feed-Forward Neural Network;
3) Convolutional Neural Network;

Each model should be a separate class that implements MnistClassifierInterface with 2
abstract methods - train and predict. Finally, each of your three models should be hidden under
another MnistClassifier class. MnistClassifer takes an algorithm as an input parameter.
Possible values for the algorithm are: cnn, rf, and nn for the three models described above.

The solution should contain:
1) Interface for models called MnistClassifierInterface.
2) 3 classes (1 for each model) that implement MnistClassifierInterface.
3) MnistClassifier, which takes as an input parameter the name of the algorithm and
provides predictions with exactly the same structure (inputs and outputs) not depending
on the selected algorithm.

In [1]:
from abc import ABC, abstractmethod

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import fetch_openml

import pandas as pd

import tensorflow as tf

import matplotlib.pyplot as plt
from PIL import Image

In [2]:
def clip_array(array) -> np.ndarray:
  array = array.astype(int)
  return np.clip(array, 0, 9).astype(int)

In [3]:
class DataProviderInterface(ABC):
  @classmethod
  @abstractmethod
  def provide_train() -> tuple:
    pass
  
  @classmethod
  @abstractmethod
  def provide_test() -> tuple:
    pass

In [4]:
class SklearnMnistProvider(DataProviderInterface):
  def __init__(self, test_size=0.3, random_state=42):
    self.mnist = fetch_openml('mnist_784')
    self.X = self.mnist.data
    self.y = self.mnist.target
  
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
      self.X, self.y, 
      test_size=test_size, 
      random_state=random_state
    ) 
    self.y_train = clip_array(self.y_train)
    self.y_test = clip_array(self.y_test)
    
  def provide_train(self) -> tuple:
    return self.X_train.to_numpy(), self.y_train
  
  def provide_test(self) -> tuple:
    return self.X_test.to_numpy(), self.y_test
  
  def provide_data_frame(self) -> pd.DataFrame:
    ds = pd.DataFrame(self.X)
    ds['y'] = self.y
    return ds

In [465]:
sklearn_provider = SklearnMnistProvider()

In [468]:
sklearn_provider.y_test.info()

<class 'pandas.core.series.Series'>
Index: 21000 entries, 46730 to 2571
Series name: class
Non-Null Count  Dtype
--------------  -----
21000 non-null  int32
dtypes: int32(1)
memory usage: 246.1 KB


In [436]:
class TensorflowMnistProvider(DataProviderInterface):
  def __init__(self):
    self.mnist = tf.keras.datasets.mnist
    (X_train, self.y_train), (X_test, self.y_test) = self.mnist.load_data()
    self.X_train = X_train.reshape(X_train.shape[0], -1)
    self.X_test = X_test.reshape(X_test.shape[0], -1)
    
  def provide_train(self) -> tuple:
    return self.X_train, self.y_train
  
  def provide_test(self) -> tuple:
    return self.X_test, self.y_test

In [437]:
tensorflow_provider = TensorflowMnistProvider()

In [None]:
class DataProviderManager:
  """
    Small research showed that there are two libs for MNIST -- at least two libs like tf and sklearn. I couldn't really
    choose one specific so I wrote this simple wrapper that gives you a dataset that
    you want. Now, RF implemented with sklearn can be trained on TF dataset. 
  """
  @staticmethod
  def request_sklearn():
    return SklearnMnistProvider()
  
  @staticmethod
  def request_tensorflow():
    return TensorflowMnistProvider()
  
  @staticmethod
  def request_img(src: str) -> np.ndarray:
    """
      turns an image into an ndarray with length = 784. Made it to test my own
      written digits. Keeping it fun or smth
    """
    img = Image.open(fp=src)
    img = img.resize((28, 28), Image.LANCZOS)  # best down-sizing filter
    img = img.convert('L')  # convert the image to *greyscale*
    img = np.array(img)
    img = img.reshape(1, 28 * 28)
    return img

In [439]:
class MnistClassifierInterface(ABC):
  @classmethod
  @abstractmethod
  def train() -> None:
    pass
  
  @classmethod
  @abstractmethod
  def predict() -> None:
    pass

### random forest (sklearn)

In [605]:
from sklearn.ensemble import RandomForestClassifier

class RandomForestMnist(MnistClassifierInterface):  
  def __init__(self, n_estimators = 100, criterion='entropy', max_depth=20):
    self.model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth)
  
  def train(self, X_train, y_train) -> None:
    self.model.fit(X_train, y_train)
    print("> training ended")
  
  def predict(self, X_test) -> np.ndarray:
    y_pred = self.model.predict(X_test)
    return y_pred
  
  def evaluate(self, X_test, y_test) -> None:
    y_pred = self.model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

In [441]:
# testing Random Forest here

# preparing data
sklearn_mnist = DataProviderManager.request_sklearn()
tensorflow_mnist = DataProviderManager.request_tensorflow()

X_train, y_train = tensorflow_mnist.provide_train()
X_test, y_test = tensorflow_mnist.provide_test() 

# setting-up the model
rf_model = RandomForestMnist()
rf_model.train(X_train, y_train)
rf_model.evaluate(X_test, y_test)

> training ended
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       980
           1       0.99      0.99      0.99      1135
           2       0.96      0.96      0.96      1032
           3       0.96      0.97      0.96      1010
           4       0.97      0.97      0.97       982
           5       0.96      0.96      0.96       892
           6       0.97      0.97      0.97       958
           7       0.97      0.96      0.97      1028
           8       0.96      0.95      0.96       974
           9       0.96      0.95      0.95      1009

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000



In [442]:
# what about digits written by me🤔
my_img = DataProviderManager.request_img(src='data/my_digit_4.png')
y_pred, y_true = rf_model.predict(my_img), ['4']
print(f"Expected: {y_true}\nGot: {y_pred}")

Expected: ['4']
Got: [4]


### feed-forward nn (tf)

In [606]:
class FeedForwardNNMnist(MnistClassifierInterface):
  def __init__(self):
    self.model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=64, activation='relu',
                              input_shape=[784]),
        tf.keras.layers.Dense(units=64, activation='relu'),
        tf.keras.layers.Dense(units=1)
    ])
    self.model.summary()
    self.model.compile(optimizer='adam', loss='mae')  
  
  def train(self, X_train, y_train) -> None:
    self.model.fit(
      X_train, y_train,
      # validation_data=(X_test, y_test),
      batch_size=256, 
      epochs=20,  
    )
    print("> training ended")
  
  def predict(self, X_test) -> np.ndarray:
    y_pred = self.model.predict(X_test)
    y_pred = clip_array(y_pred)
    return y_pred
  
  def evaluate(self, X_test, y_test) -> None:
    y_pred = self.predict(X_test)
    print(metrics.classification_report(y_test, y_pred, zero_division=np.nan))

### convolutional nn (tf)

In [607]:
class ConvolutionalNNMnist(MnistClassifierInterface):
  def __init__(self):
    self.model = tf.keras.models.Sequential([
      tf.keras.layers.Conv2D(32, kernel_size=(3, 3), input_shape=[28, 28, 1]),
      tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
      
      tf.keras.layers.Conv2D(48, kernel_size=(3, 3)),
      tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
      
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(units=1),
    ])
    self.model.summary()
    self.model.compile(
      loss="mae",
      optimizer="adam",
    )
  
  def train(self, X_train, y_train) -> None:
    X_train = X_train.reshape(X_train.shape[0], 28, 28, 1)
    self.model.fit(X_train, y_train, batch_size=256, epochs=3)
    print("> training ended")
  
  def predict(self, X_test) -> np.ndarray:
    X_test = X_test.reshape(X_test.shape[0], 28, 28, 1)
    y_pred = self.model.predict(X_test)
    y_pred = clip_array(y_pred)
    return y_pred
  
  def evaluate(self, X_test, y_test) -> None:
    X_test = X_test.reshape(X_test.shape[0], 28, 28, 1)
    y_pred = self.predict(X_test)
    print(metrics.classification_report(y_test, y_pred, zero_division=np.nan))

In [608]:
class MnistClassifier:
  model_map = {
    'cnn': ConvolutionalNNMnist,
    'nn': FeedForwardNNMnist,
    'rf': RandomForestMnist,
  }
  data_map = {
    'tensorflow': DataProviderManager.request_tensorflow,
    'sklearn': DataProviderManager.request_sklearn,
  }
  def __init__(self, algorithm: str, provider: str):
    self.model = self.model_map.get(algorithm)()
    self.data_provider = self.data_map.get(provider)()
    
  def train(self) -> None:
    X_train, y_train = self.data_provider.provide_train()
    self.model.train(X_train, y_train)
    
  def predict(self) -> None:
    X_test, y_test = self.data_provider.provide_test()
    self.model.evaluate(X_test, y_test)

## checking MnistClassifier

In [609]:
mnist_classifier = MnistClassifier(algorithm='cnn', provider='sklearn')

Model: "sequential_82"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_86 (Conv2D)          (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d_86 (MaxPoolin  (None, 13, 13, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_87 (Conv2D)          (None, 11, 11, 48)        13872     
                                                                 
 max_pooling2d_87 (MaxPoolin  (None, 5, 5, 48)         0         
 g2D)                                                            
                                                                 
 flatten_43 (Flatten)        (None, 1200)              0         
                                                                 
 dense_203 (Dense)           (None, 64)              

In [610]:
mnist_classifier.train()

Epoch 1/3
Epoch 2/3
Epoch 3/3
> training ended


In [611]:
mnist_classifier.predict()

              precision    recall  f1-score   support

           0       0.65      0.87      0.74      2058
           1       0.51      0.58      0.54      2364
           2       0.37      0.40      0.39      2133
           3       0.29      0.36      0.32      2176
           4       0.24      0.30      0.27      1936
           5       0.17      0.24      0.20      1915
           6       0.12      0.13      0.13      2088
           7       0.34      0.23      0.27      2248
           8       0.22      0.14      0.17      1992
           9       0.87      0.17      0.29      2090

    accuracy                           0.35     21000
   macro avg       0.38      0.34      0.33     21000
weighted avg       0.38      0.35      0.34     21000

