<h1>Machine Learning Classifiers on distance vectors</h1>

<h2>Data Preparation</h2>

In [1]:
import pandas as pd

DATASET_FILEPATH = "../_data/MATCHING_DISTANCE_VECTORS.csv"
MODELS_EXPORT_PATH = "../_models/"

df = pd.read_csv(DATASET_FILEPATH, header=0, sep=",")
df.head()

Unnamed: 0,BRAND1,BRAND2,BRAND3,MODEL1,MODEL2,MODEL3,MODEL4,GB_RAM,COLOR,EUR_PRICE,MATCH
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1000000.0,1.0,0.0,MATCH
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1000000.0,1.0,0.0,MATCH
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1000000.0,0.0,0.0,MATCH
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1000000.0,1.0,0.0,MATCH
4,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1000000.0,0.0,0.0,MATCH


In [2]:
df.dtypes

BRAND1       float64
BRAND2       float64
BRAND3       float64
MODEL1       float64
MODEL2       float64
MODEL3       float64
MODEL4       float64
GB_RAM       float64
COLOR        float64
EUR_PRICE    float64
MATCH         object
dtype: object

In [3]:
df.describe()

Unnamed: 0,BRAND1,BRAND2,BRAND3,MODEL1,MODEL2,MODEL3,MODEL4,GB_RAM,COLOR,EUR_PRICE
count,536067.0,536067.0,536067.0,536067.0,536067.0,536067.0,536067.0,536067.0,536067.0,536067.0
mean,0.35087,0.0,0.0,0.436837,0.19405,0.029125,0.000386,999128.862349,0.127336,75993.726939
std,0.449487,0.0,0.0,0.473811,0.38126,0.166073,0.01952,29501.835379,0.295093,264658.371884
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000000.0,0.0,30.9
50%,0.0,0.0,0.0,0.17,0.0,0.0,0.0,1000000.0,0.0,103.88
75%,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1000000.0,0.0,337.0
max,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1000000.0,1.0,1000000.0


In [4]:
df["MATCH"].describe()

count      536067
unique          2
top       UNMATCH
freq       295184
Name: MATCH, dtype: object

In [5]:
# Class variable
df["MATCH"].value_counts()

UNMATCH    295184
MATCH      240883
Name: MATCH, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["MATCH"])
y = df["MATCH"].values

# Train and test sets are stratified, i.e. they both contain the same proportion of classes than the original set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, shuffle=True)

print("X_train: {}".format(X_train.shape))
print("y_train: {}".format(y_train.shape))
print("X_test: {}".format(X_test.shape))
print("y_test: {}".format(y_test.shape))

X_train: (375246, 10)
y_train: (375246,)
X_test: (160821, 10)
y_test: (160821,)


In [7]:
# Minimize length of training set to make hyperparameter tuning take less time
X_train = X_train[:20000]
y_train = y_train[:20000]

<h2>Hyperparameter tuning</h2>

<h3>K Nearest Neighbors</h3>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import time

knn = KNeighborsClassifier()

# Perform a Grid Search to find the best values for the hyperparameters (k and weights)
# using Cross Validation with 4 stratified partitions
param_grid = {
    "n_neighbors": range(1, 5),
    "weights": ["uniform", "distance"]
}

grid_search = GridSearchCV(knn, param_grid=param_grid, cv=4)

start_time = time.time()
grid_search.fit(X_train, y_train)
finish_time = time.time()

print("Grid Search for k-NN took {} seconds.".format(round(finish_time - start_time, 3)))

mean_scores = grid_search.cv_results_["mean_test_score"]
best_params = grid_search.best_params_

best_params

<h3>Support Vector Machines</h3>

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn import svm
from scipy.stats import uniform as sp_rand
from time import time

svm_clf = svm.SVC()

# Perform a Random Search to find the best values for the hyperparameters (C and gamma)
# using Cross Validation with 4 stratified partitions.
# n_iter_search random parameter settings are chosen to perform CV
param_rand = {
    "C": sp_rand(loc=1, scale=500), 
    "gamma": sp_rand(loc=1e-9, scale=1e-5)
}

n_iter_search = 10
random_search = RandomizedSearchCV(svm_clf, param_distributions=param_rand, n_iter=n_iter_search, cv=4)

start_time = time()
random_search.fit(X_train, y_train)
finish_time = time()

print("Random Search for SVM took {} seconds.".format(round(finish_time - start_time, 3)))

mean_scores = random_search.cv_results_["mean_test_score"]
best_params = random_search.best_params_

best_params

Random Search for SVM took 365.079 seconds.


{'C': 450.07963080130486, 'gamma': 9.711499749455284e-06}

<h3>Logistic Regression</h3>

<h3>Random Forest</h3>

In [1]:
import gc

# Force Garbage Collector to realease unreferenced memory
gc.collect()

38