<h2>Data Preparation</h2>

In [1]:
import gc

# Force Garbage Collector to realease unreferenced memory
gc.collect()

57

In [2]:
import pandas as pd

DATASET_FILEPATH = "../_data/StructuredSmartPhonesDataset.csv"

df = pd.read_csv(DATASET_FILEPATH, header=0, sep=",")
df.head()

Unnamed: 0,Title,Brand,Model,Price,Currency,Color,MatchingCode
0,leifheit kleidersack lang farbe schwarz,,,14.99,EUR,schwarz,295639
1,leifheit kleidersack lang farbe schwarz,,,14.99,EUR,schwarz,295639
2,leifheit kleidersack lang farbe schwarz,,,14.99,EUR,schwarz,295639
3,leifheit kleidersack lang farbe schwarz,,,14.99,EUR,schwarz,295639
4,leifheit kleidersack lang farbe schwarz,,,14.99,EUR,schwarz,295639


In [3]:
print("Initial shape of dataset: {}".format(df.shape))

# Drop non-relevant columns for classification
df = df.drop(columns=["Title"])

# Remove rows with NULL values
df = df.dropna()

# Limit number of rows
df = df.head(10000)

print("Final shape of dataset: {}".format(df.shape))

n_unique_classes = len(set(df["MatchingCode"].values))
n_unique_rows = df.drop_duplicates().shape[0]

print("\nNumber of unique classes (MatchingCode): {}".format(n_unique_classes))
print("Number of unique rows: {}".format(n_unique_rows))

# Export processed dataset to file
df.to_csv("../_data/StructuredSmartPhonesDatasetCleaned.csv", index=False)

Initial shape of dataset: (50000, 7)
Final shape of dataset: (10000, 6)

Number of unique classes (MatchingCode): 214
Number of unique rows: 1517


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X = df.drop(columns=["MatchingCode"])
y = df["MatchingCode"].values

# Codify string variables with OneHot encoding
enc = OneHotEncoder()
enc.fit(X)
X_encoded = enc.transform(X)

print("Shape of One-Hot-Encoded dataset: {}".format(X_encoded.shape))

# Train and test sets are stratified, i.e. they both contain the same proportion of classes than the original set
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=1, shuffle=True)

Shape of One-Hot-Encoded dataset: (10000, 1210)


<h2>K Nearest Neighbours</h2>

In [72]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import time

knn = KNeighborsClassifier()

# Perform a Grid Search to find the best values for the hyperparameters (k and weights)
# using Cross Validation with 4 stratified partitions
param_grid = {
    "n_neighbors": range(1, 11),
    "weights": ["uniform", "distance"]
}

grid_search = GridSearchCV(knn, param_grid=param_grid, cv=4)

start_time = time.time()
grid_search.fit(X_train, y_train)
finish_time = time.time()

print("Grid Search for k-NN took {} seconds.".format(round(finish_time - start_time, 3)))

mean_scores = grid_search.cv_results_["mean_test_score"]
best_params = grid_search.best_params_

best_params



Grid Search for k-NN took 135.514 seconds.


{'n_neighbors': 4, 'weights': 'distance'}

In [76]:
best_k = best_params["n_neighbors"]
best_weights = best_params["weights"]

print("Best values for the k-NN hyperparameters: k = {}, weights = {}".format(best_k, best_weights))

knn = KNeighborsClassifier(n_neighbors=best_k, weights=best_weights)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

Best values for the k-NN hyperparameters: k = 4, weights = distance


In [77]:
import numpy as np

total_ins = len(y_pred)
n_pred_ok = np.sum(y_test == y_pred)
n_pred_ko = total_ins - n_pred_ok
acc = n_pred_ok / total_ins * 100

print("Accuracy: {}".format(acc))
print("Number of instances correctly classified: {}/{}".format(n_pred_ok, total_ins))
print("Number of instances incorrectly classified: {}/{}".format(n_pred_ko, total_ins))

# inspect instances incorrecly classified (some "head", for example)


Accuracy: 92.95
Number of instances correctly classified: 1859/2000
Number of instances incorrectly classified: 141/2000


<h2>Support Vector Machines</h2>

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn import svm
from scipy.stats import uniform as sp_rand
from time import time

svm_clf = svm.SVC()

# Perform a Random Search to find the best values for the hyperparameters (C and gamma)
# using Cross Validation with 4 stratified partitions.
# n_iter_search random parameter settings are chosen to perform CV
param_rand = {
    "C": sp_rand(loc=1, scale=500), 
    "gamma": sp_rand(loc=1e-9, scale=1e-5)
}

n_iter_search = 20
random_search = RandomizedSearchCV(svm_clf, param_distributions=param_rand, n_iter=n_iter_search, cv=4)

start_time = time()
random_search.fit(X_train, y_train)
finish_time = time()

print("Random Search for SVM took {} seconds.".format(round(finish_time - start_time, 3)))

mean_scores = random_search.cv_results_["mean_test_score"]
best_params = random_search.best_params_

best_params



Random Search for SVM took 1443.44 seconds.


{'C': 470.79411276428084, 'gamma': 8.890128936232237e-06}

In [17]:
best_C = best_params["C"]
best_gamma = best_params["gamma"]

print("Best values for the SVM hyperparameters: C = {}, gamma = {}".format(best_C, best_gamma))

svm_clf = svm.SVC(C=best_C, gamma=best_gamma, probability=True)
svm_clf.fit(X_train, y_train)

y_pred = svm_clf.predict(X_test)

Best values for the SVM hyperparameters: C = 470.79411276428084, gamma = 8.890128936232237e-06


In [19]:
import numpy as np

total_ins = len(y_pred)
n_pred_ok = np.sum(y_test == y_pred)
n_pred_ko = total_ins - n_pred_ok
acc = n_pred_ok / total_ins * 100

print("Accuracy: {}".format(acc))
print("Number of instances correctly classified: {}/{}".format(n_pred_ok, total_ins))
print("Number of instances incorrectly classified: {}/{}".format(n_pred_ko, total_ins))

Accuracy: 34.849999999999994
Number of instances correctly classified: 697/2000
Number of instances incorrectly classified: 1303/2000


<h2>Logistic regression</h2>

In [21]:
import numpy as np
import time
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

lr_clf = linear_model.LogisticRegression()

# Perform a Grid Search to find the best values for the hyperparameters (penalty and C)
# using Cross Validation with 4 stratified partitions.
param_grid = {
    "penalty": ['l1', 'l2'],
    "C": np.logspace(0, 4, 10)
}

grid_search = GridSearchCV(lr_clf, param_grid, cv=4)

start_time = time.time()
grid_search.fit(X_train, y_train)
finish_time = time.time()

print("Grid Search for Logistic Regression took {} seconds.".format(round(finish_time - start_time, 3)))

mean_scores = grid_search.cv_results_["mean_test_score"]
best_params = grid_search.best_params_
best_params



Grid Search for Logistic Regression took 464.122 seconds.


{'C': 1291.5496650148827, 'penalty': 'l2'}

In [22]:
best_penalty = best_params["penalty"]
best_C = best_params["C"]

print("Best values for the Logistic Regression hyperparameters: penalty = {}, C = {}".format(best_penalty, best_C))

lr_clf = linear_model.LogisticRegression(penalty=best_penalty, C=best_C)
lr_clf.fit(X_train, y_train)

y_pred = lr_clf.predict(X_test)

Best values for the Logistic Regression hyperparameters: penalty = l2, C = 1291.5496650148827


In [23]:
import numpy as np

total_ins = len(y_pred)
n_pred_ok = np.sum(y_test == y_pred)
n_pred_ko = total_ins - n_pred_ok
acc = n_pred_ok / total_ins * 100

print("Accuracy: {}".format(acc))
print("Number of instances correctly classified: {}/{}".format(n_pred_ok, total_ins))
print("Number of instances incorrectly classified: {}/{}".format(n_pred_ko, total_ins))

Accuracy: 93.0
Number of instances correctly classified: 1860/2000
Number of instances incorrectly classified: 140/2000


<h2>Random Forest</h2>

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

# Perform Grid Search on number of trees (n_estimators)
param_grid = {
    'n_estimators': range(1, 10)
}

grid_search = GridSearchCV(rf, param_grid=param_grid, cv=4)

start_time = time.time()
grid_search.fit(X_train, y_train)
finish_time = time.time()

print("Grid Search for Random Forest took {} seconds.".format(round(finish_time - start_time, 3)))

mean_scores = grid_search.cv_results_["mean_test_score"]
best_params = grid_search.best_params_
best_params

Grid Search for Random Forest took 9.565 seconds.


{'n_estimators': 7}

In [27]:
best_n_estimators = best_params["n_estimators"]

rf = RandomForestRegressor(n_estimators=best_n_estimators)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

In [28]:
import numpy as np

total_ins = len(y_pred)
n_pred_ok = np.sum(y_test == y_pred)
n_pred_ko = total_ins - n_pred_ok
acc = n_pred_ok / total_ins * 100

print("Accuracy: {}".format(acc))
print("Number of instances correctly classified: {}/{}".format(n_pred_ok, total_ins))
print("Number of instances incorrectly classified: {}/{}".format(n_pred_ko, total_ins))

Accuracy: 81.75
Number of instances correctly classified: 1635/2000
Number of instances incorrectly classified: 365/2000
