# Data augmentation on the MNIST dataset

The purpose of this exercise is to expand the original dataset by creating new artificial training data using data augmentation. Here, this entails shifting every training image by one pixel in each of four directions (up, down, left and right) and adding the shifted copies to the training set.

In [6]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Imports

In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from scipy.ndimage.interpolation import shift


sns.set_style('darkgrid')

In [7]:
def load_data():
    """Fetch the MNIST digits dataset (``mnist_784``, version 1) from OpenML.

    ``as_frame=False`` is passed explicitly so that ``data`` and ``target``
    come back as NumPy arrays. The cells below depend on that: rows are
    reshaped into 2-D images and labels are cast with ``astype``. Newer
    scikit-learn releases would otherwise return pandas objects for this
    dataset.

    Returns:
        The object returned by ``fetch_openml``; its ``data`` and ``target``
        fields are read in the following cells.
    """
    return fetch_openml('mnist_784', version=1, as_frame=False)

mnist = load_data()

In [8]:
# Show which fields the loaded dataset object exposes.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

## Looking at the data 

In [9]:
# Feature matrix of pixel values, one row per image.
X = mnist.data
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Digit labels; they are stored as strings here and cast to integers when the data is split.
y = mnist.target
y

array(['5', '0', '4', ..., '4', '5', '6'], dtype=object)

## Creating a training and test set

In [24]:
def split_set(df, size):
    """Cut ``df`` into two consecutive pieces at position ``size``.

    Args:
        df: Any object that supports slice indexing.
        size: Number of leading items that go into the first piece.

    Returns:
        A ``(head, tail)`` tuple made of ``df[:size]`` and ``df[size:]``.
    """
    head = df[:size]
    tail = df[size:]
    return head, tail

# The first 60,000 samples form the training set; the remainder is the test set.
X_train, X_test = split_set(X, 60000)
# Labels are strings, so cast them to integers before splitting. The builtin
# ``int`` is used because the ``np.int`` alias has been removed from NumPy.
y_train, y_test = split_set(y.astype(int), 60000)

In [25]:
# Sanity-check the shapes produced by the split.
X_train.shape, y_test.shape, X_test.shape

((60000, 784), (10000,), (10000, 784))

## Data augmentation

In [17]:
def augment(data, dy, dx, shape=(28, 28)):
    """Return a flattened image translated by ``(dy, dx)`` pixels.

    Args:
        data: Flat array-like of pixel values; it must hold exactly as many
            values as ``shape`` describes.
        dy: Shift along the first image axis (rows).
        dx: Shift along the second image axis (columns).
        shape: 2-D shape used to un-flatten ``data`` before shifting.
            Defaults to ``(28, 28)``, the size that was previously hard-coded.

    Returns:
        A 1-D array holding the shifted image. Positions uncovered by the
        shift are filled with ``0.0``.

    Raises:
        ValueError: If ``data`` cannot be reshaped to ``shape``.
    """
    # NOTE(review): ``shift`` is imported at the top of the notebook from
    # ``scipy.ndimage.interpolation``, a deprecated namespace; importing it
    # from ``scipy.ndimage`` directly is the supported spelling.
    # ``np.asarray`` lets plain sequences through as well as ndarrays.
    image = np.asarray(data).reshape(shape)
    shifted_image = shift(image, [dy, dx], cval=0.0)
    return shifted_image.reshape(-1)


In [27]:
# The four one-pixel (dy, dx) offsets applied to every training image.
pixel_offsets = ((1, 0), (-1, 0), (0, 1), (0, -1))

# Shifted copies, grouped by offset and kept in the original image order.
shifted_images = [
    augment(image, dy, dx)
    for dy, dx in pixel_offsets
    for image in X_train
]

# Originals come first, followed by the shifted copies.
X_train_augmented = np.array(list(X_train) + shifted_images)
# Labels are repeated once for the originals and once per offset, so they
# stay aligned with the images above.
y_train_augment = np.array(list(y_train) * (1 + len(pixel_offsets)))

In [38]:
# Shuffle images and labels together, in a single cell, using one shared
# permutation. Doing both here keeps every image paired with its label even
# if the cell is re-run; the fixed seed makes the ordering reproducible.
rng = np.random.default_rng(42)
shuffle_index = rng.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_index]
y_train_augment = y_train_augment[shuffle_index]

## Building a model using random forest

I won't build a model here, since it would be too demanding for my computer's resources.