In [1]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os

In [2]:
import zipfile
import glob

file = glob.glob('./clothing-dataset-small-master.zip')

with zipfile.ZipFile(file[0], 'r') as zip_ref:
    zip_ref.extractall('data/')

In [3]:
for data_type in ["train", "validation", "test"]:
    folder_path = "./data/clothing-dataset-small-master/" + data_type
    
    # combine validation and train
    new_folder_path = "./data/clothing-dataset/" + ("test" if data_type == "test" else "train")

    for folder in os.listdir(folder_path):
        new_path = os.path.join(new_folder_path, folder)
        if not os.path.exists(new_path):
            os.makedirs(new_path)

        for filename in os.listdir(os.path.join(folder_path, folder)):
            if filename.endswith(".jpg") or filename.endswith(".png"):
                image_path = os.path.join(folder_path, folder, filename)
                img = Image.open(image_path)
                new_image_path = os.path.join(new_folder_path, folder, filename)
                img.save(new_image_path)
                                              
                # flip all images besides t-shirts to balance the data
                if folder != "t-shirt":
                    img = img.transpose(Image.FLIP_LEFT_RIGHT)
                    new_image_path = os.path.join(new_folder_path, folder, "flipped_" + filename)
                    img.save(new_image_path)

In [6]:
# Feature: Grayscale
X_train = []
y_train = []
X_test = []
y_test = []

for data_type in ["train", "test"]:
    folder_path = "./data/clothing-dataset/" + data_type

    for folder in os.listdir(folder_path):
        for filename in os.listdir(os.path.join(folder_path, folder)):
            # Open the image
            if filename.endswith(".jpg") or filename.endswith(".png"):
                image_path = os.path.join(folder_path, folder, filename)
                img = Image.open(image_path)
                # resize the image
                img = img.resize((64, 64))
                # convert to grayscale
                img = img.convert('L')
                # flatten to 1D array
                array = np.array(img).ravel()
                
                if data_type == "test":
                    X_test.append(array)
                    y_test.append(folder)
                else:
                    X_train.append(array)
                    y_train.append(folder)

In [4]:
from skimage.io import imread, imshow
from skimage import transform
from skimage.filters import prewitt

def preprocess_image_edge_only(image):
    resized_image = transform.resize(image, (64, 64), anti_aliasing=True)
    edges_prewitt = prewitt(resized_image)
    edges_prewitt_array = edges_prewitt.reshape(1, 64 * 64)
    return edges_prewitt_array[0]

def preprocess_image(image):
    resized_image = transform.resize(image, (64, 64), anti_aliasing=True)
    edges_prewitt = prewitt(resized_image)
    edges_prewitt_array = edges_prewitt.reshape(1, 64 * 64)
    image_array = resized_image.reshape(1, 64 * 64)
    return np.concatenate((edges_prewitt_array[0], image_array[0]))

In [11]:
# Feature: Edges
X_train = []
y_train = []
X_test = []
y_test = []

data_types = ["train", "test"]

image = None
for data_type in data_types:
    folder_path = "./data/clothing-dataset/" + data_type

    for folder in os.listdir(folder_path):
        for filename in os.listdir(os.path.join(folder_path, folder)):
            # Open the image
            if filename.endswith(".jpg") or filename.endswith(".png"):
                image_path = os.path.join(folder_path, folder, filename)
                image = imread(image_path,as_gray=True)
                result = preprocess_image_edge_only(image)
                if data_type == "test":
                    X_test.append(result)
                    y_test.append(folder)
                else:
                    X_train.append(result)
                    y_train.append(folder)

In [5]:
# Feature: Grayscale + Edges
X_train = []
y_train = []
X_test = []
y_test = []

data_types = ["train", "test"]

image = None
count = 0
for data_type in data_types:
    folder_path = "./data/clothing-dataset/" + data_type

    for folder in os.listdir(folder_path):
        for filename in os.listdir(os.path.join(folder_path, folder)):
            # Open the image
            if filename.endswith(".jpg") or filename.endswith(".png"):
                image_path = os.path.join(folder_path, folder, filename)
                image = imread(image_path,as_gray=True)
                result = preprocess_image(image)
                if data_type == "test":
                    X_test.append(result)
                    y_test.append(folder)
                else:
                    X_train.append(result)
                    y_train.append(folder)

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# define the number of folds for cross validation
NUM_FOLDS = 5

# define param search space for knn and dt
knn_param_grid = {'n_neighbors': list(range(2, 100))}
dt_param_grid = {'max_depth': list(range(2, 100))}

# define models
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(max_features=512)

# define scoring metric
f1_scorer = make_scorer(f1_score, average='micro')

# KNN
knn_model = RandomizedSearchCV(knn, knn_param_grid, cv=NUM_FOLDS, scoring=f1_scorer, n_jobs=-1)
knn_model.fit(X_train, y_train)

print("KNN:")
print("Best k: ", knn_model.best_params_)
print("Best F1 score: ", knn_model.best_score_)

knn_final = KNeighborsClassifier(n_neighbors=int(knn_model.best_params_['n_neighbors']))
knn_final.fit(X_train, y_train)
y_pred = knn_final.predict(X_test)
print("Test F1 Score: ", f1_score(y_test, y_pred, average='micro'))

# DT
dt_model = RandomizedSearchCV(dt, dt_param_grid, cv=NUM_FOLDS, scoring=f1_scorer, n_jobs=-1)
dt_model.fit(X_train, y_train)

print("DT:")
print("Best depth: ", dt_model.best_params_)
print("Best F1 score: ", dt_model.best_score_)

dt_final = DecisionTreeClassifier(max_depth=int(dt_model.best_params_['max_depth']), max_features=512)
dt_final.fit(X_train, y_train)
y_pred = dt_final.predict(X_test)
print("Test F1 Score: ", f1_score(y_test, y_pred, average='micro'))

KNN:
Best k:  {'n_neighbors': 13}
Best F1 score:  0.48636646815788953
Test F1 Score:  0.32947976878612717
DT:
Best depth:  {'max_depth': 65}
Best F1 score:  0.3658659768415866
Test F1 Score:  0.29335260115606937


In [7]:
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

# SVM
svc = make_pipeline(StandardScaler(), SGDClassifier(max_iter=3000))
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print("Test F1 Score: ", f1_score(y_test, y_pred, average='micro'))

# svm_model = RandomizedSearchCV(svc, param_distributions=param_dist, cv=2, scoring=f1_scorer, n_jobs=-1, n_iter=2)
# svm_model.fit(X_train, y_train)

# print("SVM")
# print("Best params: ", svm_model.best_params_)
# print("Best F1 score: ", svm_model.best_score_)

Test F1 Score:  0.30346820809248554




In [16]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer

# n_components is the number of features we would like to get
pca = PCA(n_components=130)

# Fit PCA to the training data (greyscale + edges)
X_train_pca = pca.fit_transform(X_train)

X_test_pca = pca.transform(X_test)

# print("Transformed training data:", X_train_pca)
# print("Transformed test data:", X_test_pca)

# define models
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(max_features=512)

# define param search space for knn and dt
knn_param_grid = {'n_neighbors': list(range(2, 100))}
dt_param_grid = {'max_depth': list(range(2, 100))}

NUM_FOLDS = 5
f1_scorer = make_scorer(f1_score, average='micro')

# PCA + KNN
knn_final = RandomizedSearchCV(knn, knn_param_grid, cv=NUM_FOLDS, scoring=f1_scorer, n_jobs=-1, n_iter=10)
knn_final.fit(X_train_pca, y_train)
y_pred = knn_final.predict(X_test_pca)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("\n The best estimator across ALL searched params:\n", knn_final.best_estimator_)
print("\n The best score across ALL searched params:\n", knn_final.best_score_)
print("\n The best parameters across ALL searched params:\n", knn_final.best_params_)

# PCA + DT
dt_final = RandomizedSearchCV(dt, dt_param_grid, cv=NUM_FOLDS, scoring=f1_scorer, n_jobs=-1, n_iter=10)
dt_final.fit(X_train_pca, y_train)
y_pred = dt_final.predict(X_test_pca)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("\n The best estimator across ALL searched params:\n", dt_final.best_estimator_)
print("\n The best score across ALL searched params:\n", dt_final.best_score_)
print("\n The best parameters across ALL searched params:\n", dt_final.best_params_)

# Outcome
# Based on random trial and error, somewhere within the range of 100 - 150 features will return the best results.
# Did not do for loop because at n_components = 1000 it takes quite long already.

Accuracy Score:  0.3786127167630058

 The best estimator across ALL searched params:
 KNeighborsClassifier(n_neighbors=2)

 The best score across ALL searched params:
 0.5829718547969179

 The best parameters across ALL searched params:
 {'n_neighbors': 2}
Accuracy Score:  0.29335260115606937

 The best estimator across ALL searched params:
 DecisionTreeClassifier(max_depth=47, max_features=512)

 The best score across ALL searched params:
 0.49663710273466366

 The best parameters across ALL searched params:
 {'max_depth': 47}


In [None]:
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer

lle = LocallyLinearEmbedding(n_components=50, n_neighbors=100, method="modified")

# Fit PCA to the training data (greyscale + edges)
X_train_lle = lle.fit_transform(X_train)

X_test_lle = lle.transform(X_test)

# print("Transformed training data:", X_train_lle)
# print("Transformed test data:", X_test_lle)

# define models
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(max_features=512)

# define param search space for knn and dt
knn_param_grid = {'n_neighbors': list(range(2, 100))}
dt_param_grid = {'max_depth': list(range(2, 100))}

NUM_FOLDS = 5
f1_scorer = make_scorer(f1_score, average='micro')

# define models
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(max_features=512)

# PCA + KNN
knn_final = RandomizedSearchCV(knn, knn_param_grid, cv=NUM_FOLDS, scoring=f1_scorer, n_jobs=-1, n_iter=3)
knn_final.fit(X_train_lle, y_train)
y_pred = knn_final.predict(X_test_lle)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("\n The best estimator across ALL searched params:\n", knn_final.best_estimator_)
print("\n The best score across ALL searched params:\n", knn_final.best_score_)
print("\n The best parameters across ALL searched params:\n", knn_final.best_params_)

# PCA + DT
dt_final = RandomizedSearchCV(dt, dt_param_grid, cv=NUM_FOLDS, scoring=f1_scorer, n_jobs=-1, n_iter=3)
dt_final.fit(X_train_lle, y_train)
y_pred = dt_final.predict(X_test_lle)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("\n The best estimator across ALL searched params:\n", dt_final.best_estimator_)
print("\n The best score across ALL searched params:\n", dt_final.best_score_)
print("\n The best parameters across ALL searched params:\n", dt_final.best_params_)

# Outcome
# This is much slower than PCA, did not do loop
# This took around 1-2 mins

Accuracy Score:  0.3554913294797688

 The best estimator across ALL searched params:
 KNeighborsClassifier(n_neighbors=53)

 The best score across ALL searched params:
 0.5148128325588376

 The best parameters across ALL searched params:
 {'n_neighbors': 53}


In [34]:
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

lle = LocallyLinearEmbedding(n_components=90, n_neighbors=100, method="modified")

# Fit PCA to the training data (greyscale + edges)
X_train_lle = lle.fit_transform(X_train)

X_test_lle = lle.transform(X_test)

# print("Transformed training data:", X_train_lle)
# print("Transformed test data:", X_test_lle)

# PCA + KNN
knn_final = KNeighborsClassifier(n_neighbors=13)
knn_final.fit(X_train_lle, y_train)
y_pred = knn_final.predict(X_test_lle)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

# PCA + DT
dt_final = DecisionTreeClassifier(max_depth=65)
dt_final.fit(X_train_lle, y_train)
y_pred = dt_final.predict(X_test_lle)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

# Outcome
# This is much slower than PCA, did not do loop
# This took around 1-2 mins

# Comments
# Maybe we need to do normalization of features first. Since this is highly dependent on neighbours.

Accuracy Score:  0.3930635838150289
Accuracy Score:  0.3092485549132948
