In [32]:
import pandas as pd

DATASET_FILEPATH = "../_data/StructuredSmartPhonesDataset.csv"

df = pd.read_csv(DATASET_FILEPATH, header=0, sep=",")
df.head()

Unnamed: 0,Title,Brand,Model,Price,Currency,Color,MatchingCode
0,leifheit kleidersack lang farbe schwarz,,,14.99,EUR,schwarz,295639
1,leifheit kleidersack lang farbe schwarz,,,14.99,EUR,schwarz,295639
2,leifheit kleidersack lang farbe schwarz,,,14.99,EUR,schwarz,295639
3,leifheit kleidersack lang farbe schwarz,,,14.99,EUR,schwarz,295639
4,leifheit kleidersack lang farbe schwarz,,,14.99,EUR,schwarz,295639


In [33]:
print("Initial shape of dataset: {}".format(df.shape))

# Drop non-relevant columns
df = df.drop(columns=["Title"])

# Remove rows with NULL values
df = df.dropna()

# Limit number of rows
df = df.head(10000)

print("Final shape of dataset: {}".format(df.shape))

Initial shape of dataset: (50000, 7)
Final shape of dataset: (10000, 6)


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X = df.drop(columns=["MatchingCode"])
y = df["MatchingCode"].values

# Codify string variables with OneHot encoding
enc = OneHotEncoder()
enc.fit(X)
X_encoded = enc.transform(X)

print("Shape of One-Hot-Encoded dataset: {}".format(X_encoded.shape))

# Train and test sets are stratified, i.e. they both contain the same proportion of classes than the original set
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=1, shuffle=True)

Shape of One-Hot-Encoded dataset: (10000, 1210)


In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()

# Perform a Grid Search to find the best values for the hyperparameters (k and weights)
# using Cross Validation with 4 stratified partitions
param_grid = {
    "n_neighbors": range(1, 11),
    "weights": ["uniform", "distance"]
}

grid_search = GridSearchCV(knn, param_grid=param_grid, cv=4)
grid_search.fit(X_train, y_train)

# Results on test set
mean_scores = grid_search.cv_results_["mean_test_score"]
best_params = grid_search.best_params_

best_params



{'n_neighbors': 4, 'weights': 'distance'}

In [38]:
import numpy as np

best_k = best_params["n_neighbors"]
best_weights = best_params["weights"]

print("Best values for the k-NN hyperparameters: k = {}, weights = {}".format(best_k, best_weights))

knn = KNeighborsClassifier(n_neighbors=best_k, weights=best_weights)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
acc = np.sum(y_test == y_pred) / len(y_pred) * 100

print("Accuracy: {}".format(acc))

Best values for the k-NN hyperparameters: k = 4, weights = distance
Accuracy: 92.95
