# MLDS Spring 2021 Image Classification Competition

Created by Irvin Shen

### Import Statements

In [1]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Scikit-learn imports
from sklearn.metrics import plot_confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Other third-party models
from xgboost import XGBRFClassifier

### Import file and do some simple EDA

In [2]:
# Read the labelled training data from the working directory into a DataFrame.
training_csv = "./training_data.csv"
dfChar = pd.read_csv(training_csv)

In [3]:
# Preview the first five rows to sanity-check the column layout.
dfChar.head(n=5)

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,391,392,393,394,395,396,397,398,399,Y
0,0,255,255,233,183,194,255,223,253,251,...,245,251,249,255,255,255,251,195,221,5
1,1,255,255,255,253,255,255,255,255,255,...,255,205,222,222,255,255,255,255,253,18
2,2,255,249,235,255,255,255,253,231,194,...,255,248,208,233,187,196,189,147,255,0
3,3,231,255,244,243,255,255,255,252,162,...,193,167,191,198,255,202,190,209,171,3
4,4,255,201,101,205,223,192,189,143,202,...,241,252,255,231,210,216,255,220,195,2


In [4]:
# Split df into target and features

# Pull the label column out on its own and keep every other column as a feature.
y = dfChar["Y"]
X = dfChar.drop(columns="Y")

In [5]:
# Derive the feature matrix directly from dfChar rather than from X itself, so
# re-running this cell does not fail with KeyError once "index" has already
# been dropped. The result is the same frame: every column except the label "Y"
# and the "index" column.
# NOTE(review): "index" looks like a row identifier in the head() preview — confirm.
X = dfChar.drop(columns=["Y", "index"])

### Split data and create model, along with the hyperparameter tuner

In [6]:
# Keep 80% of the rows for training and hold the rest out for evaluation;
# the fixed seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

In [7]:
# Standardise every feature, then pass the scaled values to a support vector classifier.
# verbose=True makes the pipeline report the time spent in each step while fitting.
model = make_pipeline(
    StandardScaler(),
    SVC(shrinking=True),
    verbose=True,
)

In [10]:
# List the tunable parameter names; nested step parameters appear as "<step>__<param>".
model.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'svc', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'svc__C', 'svc__break_ties', 'svc__cache_size', 'svc__class_weight', 'svc__coef0', 'svc__decision_function_shape', 'svc__degree', 'svc__gamma', 'svc__kernel', 'svc__max_iter', 'svc__probability', 'svc__random_state', 'svc__shrinking', 'svc__tol', 'svc__verbose'])

Previous testing showed that the model performed better with `gamma` and `kernel` left at their defaults, so only `C` is tuned below.

In [8]:
# Candidate values for the SVC's C parameter; the key uses the pipeline's
# "<step>__<param>" naming so the grid search can route it to the SVC step.
params = dict(svc__C=[1, 5, 10, 100])

In [9]:
# 3-fold cross-validated grid search over `params`, run in parallel (n_jobs=-1)
# with progress messages enabled.
search = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=True,
)

### Fit the model and tune the hyperparameters

In [11]:
# Run the grid search on the training split only; the held-out rows are not used here.
search.fit(X=train_X, y=train_y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed:  5.6min remaining: 28.1min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  6.5min finished


[Pipeline] .... (step 1 of 2) Processing standardscaler, total=   0.1s
[Pipeline] ............... (step 2 of 2) Processing svc, total= 2.9min


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('svc', SVC())],
                                verbose=True),
             n_jobs=-1, param_grid={'svc__C': [1, 5, 10, 100]}, verbose=True)

In [12]:
# Predict labels for the held-out rows with the fitted search object.
preds = search.predict(X=test_X)

In [13]:
# Fraction of held-out rows labelled correctly. scikit-learn metrics take
# (y_true, y_pred) in that order; accuracy happens to be symmetric, but keeping
# the documented order avoids a silent mistake if this call is later swapped
# for an asymmetric metric such as precision or recall.
accuracy_score(test_y, preds)

0.5723331709693132

### Our model isn't very accurate (about 57% on the held-out split), but I'm not sure I can get much more out of it using SVC alone

In [14]:
# Load the file whose rows will be labelled by the fitted model below.
predictions_csv = "./predictions.csv"
dfFinal = pd.read_csv(predictions_csv)

In [15]:
# Drop the "index" column, as was done for the training features, and keep the
# result under a new name so dfFinal itself stays untouched.
dfFinal_noindex = dfFinal.drop(columns="index")

In [16]:
# Start from an empty frame; the prediction column is attached in a later cell.
final_preds = pd.DataFrame(data=None)

In [19]:
# Predict a label for every row of dfFinal_noindex and store it as the "Category" column.
submission_labels = search.predict(dfFinal_noindex)
final_preds["Category"] = submission_labels

In [20]:
# Name the row index "Id" so it is written out under that header.
final_preds.index = final_preds.index.rename("Id")

In [22]:
# Write the predictions to disk; index=True (the default) keeps the row index
# as the first column of the file.
final_preds.to_csv("final_predictions.csv", index=True)

In [28]:
# Comparison model: the same scaling step followed by a multi-layer perceptron
# instead of the SVC, fitted and scored on the same train/held-out split.
# MLPClassifier is imported with the other scikit-learn imports in the first cell.

# The MLP's weight initialisation and batch shuffling are random; pinning
# random_state makes the reported accuracy repeatable across kernel restarts.
mlpModel = make_pipeline(StandardScaler(), MLPClassifier(random_state=1))
mlpModel.fit(train_X, train_y)
mlpPreds = mlpModel.predict(test_X)

# scikit-learn metrics take (y_true, y_pred) in that order.
accuracy_score(test_y, mlpPreds)

0.49488553336580615