# Principal Components Analysis

In this study, we are interested in finding out the best approach to implement a model for computer vision. In particular, the MNIST data set is used. PCA is applied to it to verify which would be the advantages of using it.

## Import packages

In [72]:
# Operating system basic packages
import os # operating system
import gc # garbage collector
import random # random seed generator
import datetime as dt

# basic daraframe, array and math
import pandas as pd  # data frame operations  
import math # math functions
import numpy as np  # arrays and math functions
import matplotlib.pyplot as plt  # static plotting
import seaborn as sns  # pretty plotting, including heat map

# Scikit Learn
import sklearn as sk # scikit learn
from sklearn.utils import resample # sampling
from sklearn.model_selection import train_test_split as tts # train test split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet # lieanr models import
from sklearn.metrics import mean_squared_error, r2_score, f1_score# metrics import
from math import sqrt  # for root mean-squared error calculation

# suppress warning messages
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

from sklearn import model_selection # needed models
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer # imputer
from sklearn.ensemble import BaggingRegressor as BR # Bagging regression
from sklearn.ensemble import RandomForestRegressor as RFR # Random Forest regression
from sklearn.ensemble import RandomForestClassifier as RFC # Random Forest Classifier
from sklearn.ensemble import ExtraTreesRegressor as ETR # Extra Tree  regression
from sklearn.ensemble import GradientBoostingRegressor as BR # Gradient Boosting regression
from sklearn.ensemble import BaggingRegressor as BR # Bagging regression
from sklearn.decomposition import PCA 
from pandas.plotting import scatter_matrix

RANDOM_SEED=1234




# Eploratory Data Anaysis

There are no missing values in the data set, the label column containing the digit between 0 and 9, the pixel columns containing values in the range 0 to 255 (as expected).

In [73]:
# read data from MINST
# creating data frame 
mnist_tr_df = pd.read_csv('train.csv')
mnist_ts_df = pd.read_csv('test.csv')

# check the pandas DataFrame object MNIST
print('\n training DataFrame (first five rows):')
print(mnist_tr_df.head())
print('\n test DataFrame (first five rows):')
print(mnist_ts_df.head())

# basic info of the datframe
print('\nGeneral description of the training MNIST DataFrame:')
print(mnist_tr_df.info())

print('\nGeneral description of the test MNIST DataFrame:')
print(mnist_ts_df.info())

# basic info of the datframe
print('\nGeneral description of the training MNIST DataFrame:')
print(mnist_tr_df.describe())

print('\nGeneral description of the test MNIST DataFrame:')
print(mnist_ts_df.describe())


 training DataFrame (first five rows):
   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \
0       0  ...         0         0         0         0         0         0   
1       0  ...         0         0         0         0         0         0   
2       0  ...         0         0         0         0         0         0   
3       0  ...         0         0         0         0         0         0   
4       0  ...         0         0         0         0         0         0   

   pixel780  pixel781  pixel782  p

## Data preparation for the model

The data is converted to arrays to be used by the model.

In [74]:
#y = mnist_tr_df.loc[:, 'label']
#x = mnist_tr_df.loc[:,'pixel0':'pixel783']

y_train = mnist_tr_df.loc[:, 'label']
x_train = mnist_tr_df.loc[:,'pixel0':'pixel783']

x_test = mnist_ts_df.loc[:,'pixel0':'pixel783']

In [75]:
x_train.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
y_train.head()

0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64

In [77]:
#x_train, x_test, y_train, y_test = tts(x, y, test_size = .20, random_state = RANDOM_SEED)

x_test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
x_test.info()
x_train.info()
y_train

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 167.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 251.2 MB


0        1
1        0
2        1
3        4
4        0
        ..
41995    0
41996    1
41997    7
41998    6
41999    9
Name: label, Length: 42000, dtype: int64

# Step 1

# Random Forest Classifier

A random forest classifier is used on the training set. the hyperparameters for the model are the following (see appendix):

max_features='sqrt'
n_estimators=100
bootstrap = True

The execution time is displayed. The resulting prediction is stored in a CSV file to be uploaded in Kaggle for testing (score: 0.96728, UserId: Vittorio Pepe).

In [79]:

#using sklearn RandomForestClassifier, create test model:


model_to_fit =  RFC(max_features='sqrt', n_estimators=100, bootstrap = True, random_state=RANDOM_SEED)

# Start timer
start=dt.datetime.now()

fitted_model = model_to_fit.fit(x_train, y_train)

y_pred = fitted_model.predict(x_test)

# Calcuate the f1 score
# score = f1_score(y_test, y_pred, average='macro')

# Record the clock time it takes
duration = dt.datetime.now() - start

#print('\nF1 score: {:.2f}'.format(score))
print('\nDuration: ', duration)





Duration:  0:00:27.322424


In [80]:
df = pd.DataFrame(y_pred, columns=['Label'])

df.index += 1 
df.to_csv('Subm_RFC.csv', index_label='ImageId')
df.info()
#pd.DataFrame(y_pred).to_csv("Subm_RFC.csv", columns=['ImageId','Label'])

#df = pd.DataFrame(myseries, columns=['values'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 1 to 28000
Data columns (total 1 columns):
Label    28000 non-null int64
dtypes: int64(1)
memory usage: 218.9 KB


# Step 2

# PCA

Principal components analysis (PCA) is executed on the combined training and test set data together, generating principal components that represent 95 percent of the variability in the explanatory variables. 
The number of explanatory variables is reduced from 784 to 154.
Execution time is displayed.
To be noted is that memory usage is substantially lower than before PCA. 


In [81]:
# Concatenate training and testing data
x = np.concatenate([x_train, x_test]).astype(float)

# Start timer
start=dt.datetime.now()

pca =  PCA(n_components=0.95, random_state=RANDOM_SEED)

# Define PCA to explain at least 95% of the variance of the data
x_pca = pca.fit_transform(x)

# splitting data in train and test
#x_train_pca, x_test_pca = tts(x_pca, test_size = .20, random_state = RANDOM_SEED)

x_train_pca = x_pca[0:42000]
x_test_pca = x_pca[-28000:]

# Record the clock time it takes
duration = dt.datetime.now() - start

#print('\nF1 score: {:.2f}'.format(score))

print('\nDuration: ', duration)
print("PCA component shape: {}".format(pca.components_.shape))


Duration:  0:00:08.589191
PCA component shape: (154, 784)


In [82]:
# Create variable names to use
names = []
for i in range(1, 154):
    names.append("var" + str(i).zfill(3))
names

pca_df = pd.DataFrame(data=x_pca[1:,1:],    # values
                 index=x_pca[1:,0],    # 1st column as index
                 columns=names)  # 1st row as the column names

pca_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 69999 entries, 1690.6251353849707 to 597.3131256751831
Columns: 153 entries, var001 to var153
dtypes: float64(153)
memory usage: 82.2 MB


# Step 3

# RFC classifier with PCA

The PCA transformed training set is used as input for the RFC with the same hyperparameters.

The execution time is displayed. The resulting prediction is stored in a CSV file to be uploaded in Kaggle for testing (score: 0.94214, UserId: Vittorio Pepe).

In [83]:
#model_to_fit =  RFC(max_features='sqrt', bootstrap = True, random_state=RANDOM_SEED)

# Start timer
start=dt.datetime.now()

fitted_model = model_to_fit.fit(x_train_pca, y_train)

y_pred_pca = fitted_model.predict(x_test_pca)

# Calcuate the f1 score
#score = f1_score(y_test, y_pred, average='macro')

# Record the clock time it takes
duration = dt.datetime.now() - start

#print('\nF1 score: {:.2f}'.format(score))
print('\nDuration: ', duration)


Duration:  0:00:59.685510


In [84]:
df = pd.DataFrame(y_pred_pca, columns=['Label'])

df.index += 1 
df.to_csv('Subm_RFC_pca.csv', index_label='ImageId')

# Conclusions

Using the MNIST dataset, the RFC with 10 as the number of estimators has much better performance without using the PCA preprocessing. Using PCA, F1 score decrease and the processing time is substantially higher, and the processing time of PCA is to be added too. The only advantage of using the PCA would be in reduced memory usage.

Using a larger number of estimators (100) improve the F1 scores of the RFC after applying PCA, at the expense of the processing time. In this case, using the PCA would allow reducing memory usage without reducing the F1 score too much.
 

# Appendix: choosing the hyperparamaters
    
To test different hyperparameters the following version of the code has been used using as dataset only the training set, splitting it into training and test set.
The F1 scores improve with the number of estimators, and the difference in performance is less substantial.


In [86]:
from sklearn.metrics import classification_report

RANDOM_SEED=1234

# read data from MINST
# creating data frame 
mnist_tr_df = pd.read_csv('train.csv')



y = mnist_tr_df.loc[:, 'label']
x = mnist_tr_df.loc[:,'pixel0':'pixel783']


x_train, x_test, y_train, y_test = tts(x, y, test_size = .30, random_state = RANDOM_SEED)


#######RFC

#using sklearn RandomForestClassifier, create test model:

model_to_fit =  RFC(max_features='sqrt', n_estimators=10, bootstrap = True, random_state=RANDOM_SEED).fit(x_train, y_train)

#model_to_fit =  RFC(max_features='sqrt', n_estimators=100, bootstrap = True, random_state=RANDOM_SEED)

# Start timer
start=dt.datetime.now()

fitted_model = model_to_fit.fit(x_train, y_train)

y_pred = fitted_model.predict(x_test)


# Record the clock time it takes
duration = dt.datetime.now() - start

print('\nRFC Duration: ', duration)

# Calcuate the f1 score
score = f1_score(y_test, y_pred, average='weighted')

print('\nF1 score: {:.4f}'.format(score))
print('\nF1 Scores, precision and recall: ')
print(classification_report(y_test, y_pred))


##### PCA

# Concatenate training and testing data
x = np.concatenate([x, x_ts]).astype(float)

# Start timer
start=dt.datetime.now()

pca =  PCA(n_components=0.95, random_state=RANDOM_SEED)

# Define PCA to explain at least 95% of the variance of the data
x_pca = pca.fit_transform(x)

# splitting data in train and test
#x_train_pca, x_test_pca = tts(x_pca, test_size = .20, random_state = RANDOM_SEED)

# Record the clock time it takes
duration = dt.datetime.now() - start

print('\nPCA Duration: ', duration)

####creating data frame for size comparision

# Create variable names to use
names = []
for i in range(1, 154):
    names.append("var" + str(i).zfill(3))
names

pca_df = pd.DataFrame(data=x_pca[1:,1:],    # values
                 index=x_pca[1:,0],    # 1st column as index
                 columns=names)  # 1st row as the column names

pca_df.info()

print("PCA component shape: {}".format(pca.components_.shape))

#print('\nExplained Variance ', pca.explained_variance_ratio_)

#model_to_fit =  RFC(max_features='sqrt', n_estimators=10, bootstrap = True, random_state=RANDOM_SEED)

x_train_pca = x_pca[0:42000]

x_trainp, x_testp, y_trainp, y_testp = tts(x_train_pca, y, test_size = .30, random_state = RANDOM_SEED)

#x_test_pca = x_pca[-28000:]

# Start timer
start=dt.datetime.now()

fitted_model = model_to_fit.fit(x_trainp, y_trainp)

y_pred_pca = fitted_model.predict(x_testp)

# Record the clock time it takes
duration = dt.datetime.now() - start

print('\nRFC with PCA Duration: ', duration)

# Calcuate the f1 score
score = f1_score(y_testp, y_pred_pca, average='weighted')

print('\nF1 score: {:.4f}'.format(score))
print('\nF1 Scores, precision and recall: ')
print(classification_report(y_testp, y_pred_pca))



RFC Duration:  0:00:01.878737

F1 score: 0.9346

F1 Scores, precision and recall: 
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1253
           1       0.97      0.98      0.97      1388
           2       0.92      0.95      0.93      1280
           3       0.90      0.91      0.91      1372
           4       0.92      0.93      0.93      1204
           5       0.92      0.91      0.92      1154
           6       0.96      0.96      0.96      1209
           7       0.96      0.95      0.95      1302
           8       0.92      0.87      0.90      1174
           9       0.92      0.90      0.91      1264

    accuracy                           0.93     12600
   macro avg       0.93      0.93      0.93     12600
weighted avg       0.93      0.93      0.93     12600


PCA Duration:  0:00:07.712001
<class 'pandas.core.frame.DataFrame'>
Float64Index: 69999 entries, 1690.6251353849707 to 597.3131256751831
Columns: 153 entrie