# This notebook covers machine learning techniques for solving the computer vision problem as given in the problem statement. It is done on Google Colab jupyter notebook, since Google Colab offers great GPU and helps solve computationally expensive tasks easily

Setting up Google Colab

In [1]:
!pip install PyDrive

Collecting PyDrive
[?25l  Downloading https://files.pythonhosted.org/packages/52/e0/0e64788e5dd58ce2d6934549676243dc69d982f198524be9b99e9c2a4fd5/PyDrive-1.3.1.tar.gz (987kB)
[K    1% |▎                               | 10kB 20.5MB/s eta 0:00:01[K    2% |▋                               | 20kB 4.9MB/s eta 0:00:01[K    3% |█                               | 30kB 6.9MB/s eta 0:00:01[K    4% |█▎                              | 40kB 4.3MB/s eta 0:00:01[K    5% |█▋                              | 51kB 5.3MB/s eta 0:00:01[K    6% |██                              | 61kB 6.3MB/s eta 0:00:01[K    7% |██▎                             | 71kB 7.0MB/s eta 0:00:01[K    8% |██▋                             | 81kB 7.9MB/s eta 0:00:01[K    9% |███                             | 92kB 8.7MB/s eta 0:00:01[K    10% |███▎                            | 102kB 7.0MB/s eta 0:00:01[K    11% |███▋                            | 112kB 7.1MB/s eta 0:00:01[K    12% |████                            | 122

In [0]:
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

Importing the necessary libraries

In [0]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

 Importing all classifier libraries

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm 
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

Downloading of training images from Google Drive. ID refers to the id of the file in google drive

In [0]:
download = drive.CreateFile({'id': '1rlaREE5rBT0lEpH0NOXqZ7ZJRrqGIs1Q'})
download.GetContentFile('train_image.pkl')
with open('train_image.pkl', 'rb') as f:
    train_images = pickle.load(f)

Downloading of training labels from Google Drive. ID refers to the id of the file in google drive

In [0]:
download = drive.CreateFile({'id': '1XEFDs7HTb9UHOezRZrf9NIU_c-M1fkXN'})
download.GetContentFile('train_label.pkl')
with open('train_label.pkl', 'rb') as f:
    train_labels = pickle.load(f)

Exploration of training data

In [11]:
len(train_images[0])
 

784

In [8]:
type(train_images)

list

In [0]:
# getting a glimpse of the training data
data = pd.DataFrame({'label':train_labels, 
            'index': range(0, 8000)})

In [11]:
data.head()

Unnamed: 0,index,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


Assigning the label values to a variable y. y refers to the labels that should be assigned to the images

In [0]:
y=data['label'].values


In [13]:
len(y)

8000

In [14]:
type(y)

numpy.ndarray

converting training images into a numpy array to check the shape of training images

In [0]:
X = np.array(train_images)

In [24]:
X.shape

(8000, 784)

Dividing the data into training and cross validation set. This is very important since we can test out the performance of the algorithm on the cross validation set and experiment with the algorithms to get a better performance. When we get our best performance we use that algorithm for the test set. Evaluation on the test set is done only once.

In [0]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

## Model 1 -->* K-Nearest Neighbour*. This can be used for both classification and regression problems. Since our problem involves multicalss classification, we can start with this classifier

In [0]:
model = KNeighborsClassifier(n_neighbors=3)

In [28]:
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [0]:
y_predicted = 

Testing the accuracy on training set and cross validation set. Checking on  training set is also very important to let us know of the overfirring, underfitting and a good fit

In [30]:
# predict function in this case is used to predict the classes for the inputed data
print("Accuracy on Training Set {}".format(accuracy_score(y_train, model.predict(X_train))))
print("Accuracy on Validation Set {}".format(accuracy_score(y_val, model.predict(X_val))))

Accuracy on Training Set 0.88546875
Accuracy on Validation Set 0.794375


In [0]:
# try with different K values

### Model 2 --> * Support Vector Machines*. This can also be used for both classification and regression problems. These are used heavily in classification problems because of its various advantages such as specifying custom kernels, and its effectiveness in high dimentional space

####  First variant of svm invoves using C-Support Vector Classification. SVC is so called because of its use of C parameter which is just like the regularization parameter. Usually if C is large the algorithm can make well separable decision boundary taking into account the outliers as well

In [0]:
# by default the kernel is gaussian kernel and gamma is the coefficient of the kernel, gamma should'nt be too large as it will lead to 
# overfitting, it should be set to 'scale', setting it to 'auto' leads to overfitting 
# C is by default 1.0
model = svm.SVC(gamma='scale')

In [32]:
# training the model
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [33]:
print("Accuracy on Training Set {}".format(accuracy_score(y_train, model.predict(X_train))))
print("Accuracy on Validation Set {}".format(accuracy_score(y_val, model.predict(X_val))))

Accuracy on Training Set 0.87875
Accuracy on Validation Set 0.8225


The above model provides good accuracy, but let's see whether it can be better by tuning the parameters

### Now, setting C to a larger value. Generally setting C to a larger value makes the algorithm define well separable decision boundary taking into account the outliers as well, however setting C too large(relative term, have to experiment with the values) may cause overfitting

In [0]:
model = svm.SVC(C=10, gamma='scale')

In [41]:
# training the model
model.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [42]:
print("Accuracy on Training Set {}".format(accuracy_score(y_train, model.predict(X_train))))
print("Accuracy on Validation Set {}".format(accuracy_score(y_val, model.predict(X_val))))

Accuracy on Training Set 0.9803125
Accuracy on Validation Set 0.84125


C=10 is close to overfitting the training set, but it provides better accuracy as well, so let's test on one more value of C=5

In [43]:
model = svm.SVC(C=5, gamma='scale')
# training the model
model.fit(X_train, y_train)

SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [44]:
print("Accuracy on Training Set {}".format(accuracy_score(y_train, model.predict(X_train))))
print("Accuracy on Validation Set {}".format(accuracy_score(y_val, model.predict(X_val))))

Accuracy on Training Set 0.95171875
Accuracy on Validation Set 0.84


In [0]:
model = svm.LinearSVC()

In [31]:
model.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [32]:
print("Accuracy on Validation Set {}".format(accuracy_score(y_test, model.predict(X_test))))

Accuracy on Validation Set 0.7425


In [0]:
model = svm.SVC(C=10000, kernel='poly', gamma='scale')

In [34]:
model.fit(X_train, y_train)

SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [35]:
print("Accuracy on Validation Set {}".format(accuracy_score(y_test, model.predict(X_test))))

Accuracy on Validation Set 0.810625


In [0]:
# Naive Bayes
model = GaussianNB()

In [37]:
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [38]:
print("Accuracy on Validation Set {}".format(accuracy_score(y_test, model.predict(X_test))))

Accuracy on Validation Set 0.664375


In [0]:
model = RandomForestClassifier(n_estimators=10, n_jobs=-1)

In [40]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [41]:
print("Accuracy on Validation Set {}".format(accuracy_score(y_test, model.predict(X_test))))

Accuracy on Validation Set 0.811875


In [0]:
model = RandomForestClassifier(n_estimators=20, n_jobs=-1)

In [43]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [44]:
print("Accuracy on Validation Set {}".format(accuracy_score(y_test, model.predict(X_test))))

Accuracy on Validation Set 0.816875


In [0]:
model = RandomForestClassifier(n_estimators=100, n_jobs=-1)

In [46]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [47]:
print("Accuracy on Validation Set {}".format(accuracy_score(y_test, model.predict(X_test))))

Accuracy on Validation Set 0.824375
