# Gisette Dataset
#####  GISETTE is a handwritten digit recognition problem.
#####  The problem is to separate the highly confusible digits '4' and '9'.

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import csv
import sklearn
import time
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Reducing the dataset using Correlation and Multicollinearity:

#### Info regarding the data:

In [2]:
print(f'Some info on the provided Gisette data:\n')
with open('gisette.param') as csvfile:
    for row in csv.reader(csvfile):
        print(row[0])

Some info on the provided Gisette data:

Data type: non-sparse
Number of features: 5000
Number of examples and check-sums:
     	Pos_ex	Neg_ex	Tot_ex	Check_sum
Train	 3000	 3000	 6000	3164568508.00
Valid	  500	  500	 1000	535016668.00
Test	 3250	 3250	 6500	3431572010.00
All  	 6750	 6750	13500	7131157186.00


###### The provided dataset has labels available only for Train and Valid datasets so we made the decision to use Train dataset with its labels to train our machine and then check our machine's performance on the Valid dataset and its labels

#### Reading the Training Dataset and cleaning as needed

In [3]:
gisette_train_data = pd.read_csv('gisette_train.data', delimiter=' ', header=None)
gisette_train_data = gisette_train_data.dropna(axis=1, how='all')

gisette_train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,550,0,495,0,0,0,0,976,0,0,...,0,0,0,991,991,0,0,0,0,983
1,0,0,0,0,0,0,0,976,0,0,...,0,475,0,991,0,0,991,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,742,0,0,0,0,684,0,956,...,0,0,0,0,0,0,674,0,0,838
4,0,0,0,0,0,0,0,608,0,979,...,991,0,0,828,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0,0,0,0,0,0,0,0,0,0,...,991,0,0,0,0,0,783,0,0,0
5996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,921,0,886,0
5997,0,0,0,0,0,758,0,0,0,522,...,0,901,0,0,0,0,980,0,0,0
5998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,690,0,0,0,0,0


#### Reading True Positive & Negative Values (Labels) for the Training Dataset

In [4]:
reduced_train_labels = pd.read_csv('gisette_train.labels', delimiter=' ', header=None)

reduced_train_labels

Unnamed: 0,0
0,1
1,-1
2,1
3,1
4,1
...,...
5995,-1
5996,1
5997,-1
5998,-1


#### Merging the training dataset with its provided labels

In [5]:
merged_train_set = reduced_train_labels.merge(gisette_train_data, left_index=True,
                                              right_index=True).rename(columns={'0_y':0,'0_x':'Label'})

#### Reducing features using Correlation

In [6]:
start = time.time()  #---start time---

#---calculating correlation of the features to the target value---
test_corr = merged_train_set.corr()['Label'].abs().sort_values(ascending=False)

end = time.time()  #---end time---

#---dropping NA values---
test_corr = test_corr.dropna()

print('Processing Time for calculating Correlation (in seconds):', end-start)

Processing Time for calculating Correlation (in seconds): 342.29425716400146


In [7]:
#---creating a dataframe of top 10 features with the highest correlation values with the target---
features_top10 = features = test_corr[test_corr > 0.5397].index.to_list()[1:]
corr_top10 = test_corr[test_corr > 0.5397].to_list()[1:]
df_top10 = pd.DataFrame({'Features':features_top10, 'Correlation Value':corr_top10})

df_top10.index = df_top10.index + 1
df_top10

Unnamed: 0,Features,Correlation Value
1,3656,0.671371
2,557,0.646177
3,3975,0.603175
4,4507,0.577779
5,511,0.577598
6,2742,0.569261
7,3002,0.56845
8,1228,0.554855
9,904,0.549525
10,4271,0.539893


In [8]:
#---getting all the features that have at least 15% correlation to the target---
features = test_corr[test_corr > 0.15].index.to_list()[1:]  #---without the 'Label' column---
features.sort()

In [9]:
reduced_train_data = gisette_train_data[features]
reduced_train_data

Unnamed: 0,2,12,16,34,60,66,70,83,95,101,...,4949,4963,4966,4967,4976,4979,4980,4981,4991,4999
0,495,983,983,0,0,991,0,0,0,987,...,987,513,811,0,983,0,0,983,0,983
1,0,0,0,0,0,442,0,0,0,0,...,0,932,764,828,0,0,0,0,475,0
2,0,983,976,0,0,987,0,0,0,991,...,0,991,991,0,0,0,0,0,0,0
3,742,983,983,991,0,0,0,614,0,0,...,0,748,729,0,0,874,0,859,0,838
4,0,0,0,0,0,780,0,0,0,0,...,805,0,0,0,0,780,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0,0,0,804,0,987,0,0,0,0,...,0,995,995,0,691,0,940,0,0,0
5996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,991,0,0,0,0
5997,0,991,968,0,0,0,0,601,0,0,...,0,572,0,892,0,0,0,0,901,0
5998,0,0,504,0,0,913,0,0,0,743,...,878,0,0,0,0,0,0,484,0,0


#### Reading the Testing Dataset and cleaning as needed

In [10]:
gisette_test_data = pd.read_csv('gisette_valid.data', delimiter=' ', header=None)
gisette_test_data = gisette_test_data.dropna(axis=1, how='all')

#---reducing the testing dataset based on the above criteria---
reduced_test_data = gisette_test_data[features]
reduced_test_data

Unnamed: 0,2,12,16,34,60,66,70,83,95,101,...,4949,4963,4966,4967,4976,4979,4980,4981,4991,4999
0,0,0,0,0,0,684,0,0,0,991,...,823,721,995,964,0,0,0,0,0,0
1,0,0,0,0,0,991,0,0,0,0,...,0,0,0,0,0,991,0,0,0,0
2,816,0,0,560,0,0,0,0,0,0,...,983,823,991,0,0,0,704,0,0,0
3,0,0,0,0,0,552,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,616,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,592,0,0,0,0,0,...,0,0,799,991,802,0,0,906,0,0
996,599,0,0,0,0,983,976,785,781,983,...,983,603,603,0,0,991,565,983,0,707
997,0,0,0,0,0,980,0,0,0,0,...,952,0,0,0,0,0,0,764,0,742
998,0,999,0,0,0,0,0,0,0,0,...,0,0,0,999,0,0,0,885,0,0


#### Reading True Positive & Negative Values (Labels) for the Testing Dataset

In [11]:
reduced_test_labels = pd.read_csv('gisette_valid.labels', delimiter=' ', header=None)

reduced_test_labels

Unnamed: 0,0
0,1
1,1
2,-1
3,1
4,1
...,...
995,-1
996,1
997,1
998,-1


#### Training a model using Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

start = time.time()  #---start time---

#---training the model using Logistic Regression---
log_reg = LogisticRegression(max_iter = 5000)
log_reg.fit(reduced_train_data, np.ravel(reduced_train_labels))

#---evaluating the model on the training and testing datasets---
train_pred = log_reg.predict(reduced_train_data)
test_pred = log_reg.predict(reduced_test_data)

end = time.time()  #---end time---

print('Percentage Accuracy on the Training Data:', accuracy_score(reduced_train_labels, train_pred)*100)
print('Percentage Accuracy on the Testing Data:', accuracy_score(reduced_test_labels, test_pred)*100)
print('Processing Time (in seconds):', end-start)

Percentage Accuracy on the Training Data: 100.0
Percentage Accuracy on the Testing Data: 95.8
Processing Time (in seconds): 3.864772081375122


#### Training a model using a linear kernel Support Vector Machine

In [13]:
from sklearn.svm import SVC

start = time.time()  #---start time---

#---training the model using Support Vector Machine with a linear kernel---
svm_linear = SVC(kernel='linear')
svm_linear.fit(reduced_train_data, np.ravel(reduced_train_labels))

#---evaluating the model on the training and testing datasets---
train_pred = svm_linear.predict(reduced_train_data)
test_pred = svm_linear.predict(reduced_test_data)

end = time.time()  #---end time---

print('Percentage Accuracy on the Training Data:', accuracy_score(reduced_train_labels, train_pred)*100)
print('Percentage Accuracy on the Testing Data:', accuracy_score(reduced_test_labels, test_pred)*100)
print('Processing Time (in seconds):', end-start)

Percentage Accuracy on the Training Data: 100.0
Percentage Accuracy on the Testing Data: 95.39999999999999
Processing Time (in seconds): 2.4256742000579834


#### Training a model using a degree 2 polynomial kernel Support Vector Machine

In [14]:
start = time.time()  #---start time---

#---training the model using Support Vector Machine with a degree 2 polynomial kernel---
svm_poly_2 = SVC(kernel='poly', degree=2)
svm_poly_2.fit(reduced_train_data, np.ravel(reduced_train_labels))

#---evaluating the model on the training and testing datasets---
train_pred = svm_poly_2.predict(reduced_train_data)
test_pred = svm_poly_2.predict(reduced_test_data)

end = time.time()  #---end time---

print('Percentage Accuracy on the Training Data:', accuracy_score(reduced_train_labels, train_pred)*100)
print('Percentage Accuracy on the Testing Data:', accuracy_score(reduced_test_labels, test_pred)*100)
print('Processing Time (in seconds):', end-start)

Percentage Accuracy on the Training Data: 98.63333333333333
Percentage Accuracy on the Testing Data: 96.8
Processing Time (in seconds): 3.425978183746338


#### Training a model using a degree 3 polynomial kernel Support Vector Machine

In [15]:
start = time.time()  #---start time---

#---training the model using Support Vector Machine with a degree 3 polynomial kernel---
svm_poly_3 = SVC(kernel='poly', degree=3)
svm_poly_3.fit(reduced_train_data, np.ravel(reduced_train_labels))

#---evaluating the model on the training and testing datasets---
train_pred = svm_poly_3.predict(reduced_train_data)
test_pred = svm_poly_3.predict(reduced_test_data)

end = time.time()  #---end time---

print('Percentage Accuracy on the Training Data:', accuracy_score(reduced_train_labels, train_pred)*100)
print('Percentage Accuracy on the Testing Data:', accuracy_score(reduced_test_labels, test_pred)*100)
print('Processing Time (in seconds):', end-start)

Percentage Accuracy on the Training Data: 97.3
Percentage Accuracy on the Testing Data: 95.1
Processing Time (in seconds): 6.376935958862305


#### Training a model using a degree 4 polynomial kernel Support Vector Machine

In [16]:
start = time.time()  #---start time---

#---training the model using Support Vector Machine with a degree 4 polynomial kernel---
svm_poly_4 = SVC(kernel='poly', degree=4)
svm_poly_4.fit(reduced_train_data, np.ravel(reduced_train_labels))

#---evaluating the model on the training and testing datasets---
train_pred = svm_poly_4.predict(reduced_train_data)
test_pred = svm_poly_4.predict(reduced_test_data)

end = time.time()  #---end time---

print('Percentage Accuracy on the Training Data:', accuracy_score(reduced_train_labels, train_pred)*100)
print('Percentage Accuracy on the Testing Data:', accuracy_score(reduced_test_labels, test_pred)*100)
print('Processing Time (in seconds):', end-start)

Percentage Accuracy on the Training Data: 95.71666666666667
Percentage Accuracy on the Testing Data: 93.4
Processing Time (in seconds): 7.337788820266724


#### Reducing features further using Multicollinearity

In [17]:
from sklearn.linear_model import LinearRegression

#---writing a function to calculate the Variance Inflation Factor (VIF)---
def calculate_vif(df, features):    
    vif, tolerance = {}, {}
    #---all the features that we want to examine---
    for feature in features:
        #---extracting all the other features that we regress against---
        X = [f for f in features if f != feature]        
        X, y = df[X], df[feature]
        #---extracting r-squared from the fit---
        r2 = LinearRegression().fit(X, y).score(X, y)                
        
        #---calculating tolerance---
        tolerance[feature] = 1 - r2
        #---assigning a very small number to tolerance if it is zero to avoid division by zero---
        if tolerance[feature] == 0:
            tolerance[feature] = 0.00001
        #---calculating VIF---
        vif[feature] = 1/(tolerance[feature])
    #---returning VIF DataFrame---
    return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance})

start = time.time()  #---start time---

#---calculating VIF for our dataset against the reduced features with atleast 15% Correlation---
VIF = calculate_vif(merged_train_set,features)

end = time.time()  #---end time---

print('Processing Time for calculating Variance Inflation Factor (in seconds):', end-start)

Processing Time for calculating Variance Inflation Factor (in seconds): 182.10516810417175


In [18]:
#---reducing features further by restricting VIF to be less than 10---
new_features = [ i for i in features if i not in VIF[VIF['VIF']>10].index.to_list() ]

#---calculating VIF for our dataset against the further reduced features---
new_VIF = calculate_vif(merged_train_set,new_features)
new_VIF

Unnamed: 0,VIF,Tolerance
2,7.425761,0.134666
16,2.986956,0.334789
34,5.647502,0.177069
60,1.623279,0.616037
66,3.401201,0.294014
...,...,...
4976,4.758777,0.210138
4979,4.626300,0.216155
4980,4.839445,0.206635
4991,3.875394,0.258038


In [19]:
#---finding the ten features with the lowest VIF---
low_10_VIF = new_VIF.sort_values(by='VIF')[:10]
low_10_VIF

Unnamed: 0,VIF,Tolerance
60,1.623279,0.616037
1125,1.691646,0.59114
593,1.716827,0.58247
391,1.761319,0.567756
4835,1.887546,0.529789
1375,1.887829,0.529709
1565,1.90492,0.524956
458,2.024286,0.494001
4197,2.06442,0.484398
2615,2.09211,0.477986


In [20]:
#---making sure that all of our VIF values are indeed bounded by 10---
new_VIF['VIF'].max()

8.822960675991833

In [21]:
reduced_train_data = gisette_train_data[new_VIF.index]
reduced_train_data

Unnamed: 0,2,16,34,60,66,70,95,101,106,112,...,4924,4925,4941,4949,4967,4976,4979,4980,4991,4999
0,495,983,0,0,991,0,0,987,956,983,...,0,983,0,987,0,983,0,0,0,983
1,0,0,0,0,442,0,0,0,0,0,...,987,0,0,0,828,0,0,0,475,0
2,0,976,0,0,987,0,0,991,0,0,...,582,0,0,0,0,0,0,0,0,0
3,742,983,991,0,0,0,0,0,987,0,...,0,0,0,0,0,0,874,0,0,838
4,0,0,0,0,780,0,0,0,0,0,...,0,0,0,805,0,0,780,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0,0,804,0,987,0,0,0,0,0,...,457,0,0,0,0,691,0,940,0,0
5996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,991,0,0,0
5997,0,968,0,0,0,0,0,0,0,0,...,741,0,0,0,892,0,0,0,901,0
5998,0,504,0,0,913,0,0,743,0,0,...,0,0,0,878,0,0,0,0,0,0


#### Reading the Testing Dataset and cleaning as needed

In [22]:
gisette_test_data = pd.read_csv('gisette_valid.data', delimiter=' ', header=None)
gisette_test_data = gisette_test_data.dropna(axis=1, how='all')

#---reducing the testing dataset based on the above criteria---
reduced_test_data = gisette_test_data[new_VIF.index]
reduced_test_data

Unnamed: 0,2,16,34,60,66,70,95,101,106,112,...,4924,4925,4941,4949,4967,4976,4979,4980,4991,4999
0,0,0,0,0,684,0,0,991,0,0,...,0,0,0,823,964,0,0,0,0,0
1,0,0,0,0,991,0,0,0,0,0,...,0,0,795,0,0,0,991,0,0,0
2,816,0,560,0,0,0,0,0,836,0,...,0,579,0,983,0,0,0,704,0,0
3,0,0,0,0,552,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,616,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,592,0,0,0,0,0,0,...,0,0,0,0,991,802,0,0,0,0
996,599,0,0,0,983,976,781,983,983,0,...,0,0,746,983,0,0,991,565,0,707
997,0,0,0,0,980,0,0,0,976,758,...,0,786,0,952,0,0,0,0,0,742
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,999,0,0,0,0,0


#### Reading True Positive & Negative Values (Labels) for the Testing Dataset

In [23]:
reduced_test_labels = pd.read_csv('gisette_valid.labels', delimiter=' ', header=None)

reduced_test_labels

Unnamed: 0,0
0,1
1,1
2,-1
3,1
4,1
...,...
995,-1
996,1
997,1
998,-1


#### Training a model using Logistic Regression

In [24]:
start = time.time()  #---start time---

#---training the model using Logistic Regression---
log_reg.fit(reduced_train_data, np.ravel(reduced_train_labels))

#---evaluating the model on the training and testing datasets---
train_pred = log_reg.predict(reduced_train_data)
test_pred = log_reg.predict(reduced_test_data)

end = time.time()  #---end time---

print('Percentage Accuracy on the Training Data:', accuracy_score(reduced_train_labels, train_pred)*100)
print('Percentage Accuracy on the Testing Data:', accuracy_score(reduced_test_labels, test_pred)*100)
print('Processing Time (in seconds):', end-start)

Percentage Accuracy on the Training Data: 98.11666666666666
Percentage Accuracy on the Testing Data: 95.89999999999999
Processing Time (in seconds): 6.876144886016846


###### The following linear kernel SVM took longer than 20 minutes of runtime to fit our training data so we decided to keep the code but remove it from our runs

#### Training a model using a linear kernel Support Vector Machine

In [25]:
# start = time.time()  #---start time---

# #---training the model using Support Vector Machine with a linear kernel---
# svm_linear.fit(reduced_train_data, np.ravel(reduced_train_labels))

# #---evaluating the model on the training and testing datasets---
# train_pred = svm_linear.predict(reduced_train_data)
# test_pred = svm_linear.predict(reduced_test_data)

# end = time.time()  #---end time---

# print('Percentage Accuracy on the Training Data:', accuracy_score(reduced_train_labels, train_pred)*100)
# print('Percentage Accuracy on the Testing Data:', accuracy_score(reduced_test_labels, test_pred)*100)
# print('Processing Time (in seconds):', end-start)

#### Training a model using a degree 2 polynomial kernel Support Vector Machine

In [26]:
start = time.time()  #---start time---

#---training the model using Support Vector Machine with a degree 2 polynomial kernel---
svm_poly_2.fit(reduced_train_data, np.ravel(reduced_train_labels))

#---evaluating the model on the training and testing datasets---
train_pred = svm_poly_2.predict(reduced_train_data)
test_pred = svm_poly_2.predict(reduced_test_data)

end = time.time()  #---end time---

print('Percentage Accuracy on the Training Data:', accuracy_score(reduced_train_labels, train_pred)*100)
print('Percentage Accuracy on the Testing Data:', accuracy_score(reduced_test_labels, test_pred)*100)
print('Processing Time (in seconds):', end-start)

Percentage Accuracy on the Training Data: 98.55000000000001
Percentage Accuracy on the Testing Data: 97.39999999999999
Processing Time (in seconds): 2.515845775604248


#### Training a model using a degree 3 polynomial kernel Support Vector Machine

In [27]:
start = time.time()  #---start time---

#---training the model using Support Vector Machine with a degree 3 polynomial kernel---
svm_poly_3.fit(reduced_train_data, np.ravel(reduced_train_labels))

#---evaluating the model on the training and testing datasets---
train_pred = svm_poly_3.predict(reduced_train_data)
test_pred = svm_poly_3.predict(reduced_test_data)

end = time.time()  #---end time---

print('Percentage Accuracy on the Training Data:', accuracy_score(reduced_train_labels, train_pred)*100)
print('Percentage Accuracy on the Testing Data:', accuracy_score(reduced_test_labels, test_pred)*100)
print('Processing Time (in seconds):', end-start)

Percentage Accuracy on the Training Data: 96.81666666666666
Percentage Accuracy on the Testing Data: 95.19999999999999
Processing Time (in seconds): 4.111234426498413


#### Training a model using a degree 4 polynomial kernel Support Vector Machine

In [28]:
start = time.time()  #---start time---

#---training the model using Support Vector Machine with a degree 4 polynomial kernel---
svm_poly_4.fit(reduced_train_data, np.ravel(reduced_train_labels))

#---evaluating the model on the training and testing datasets---
train_pred = svm_poly_4.predict(reduced_train_data)
test_pred = svm_poly_4.predict(reduced_test_data)

end = time.time()  #---end time---

print('Percentage Accuracy on the Training Data:', accuracy_score(reduced_train_labels, train_pred)*100)
print('Percentage Accuracy on the Testing Data:', accuracy_score(reduced_test_labels, test_pred)*100)
print('Processing Time (in seconds):', end-start)

Percentage Accuracy on the Training Data: 93.85
Percentage Accuracy on the Testing Data: 90.9
Processing Time (in seconds): 5.261126756668091
