In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import pickle
from tqdm import tqdm

# Experiment setup

In [2]:
mlflow.set_tracking_uri('../mlflow')

if mlflow.get_experiment_by_name('image-based recsys') is None:
    mlflow.set_experiment('image-based recsys')

# Data

In [3]:
with open('../data/styles-44c73c23.pkl', 'rb') as f:
    styles = pickle.load(f)

In [4]:
embeddings = pd.read_parquet('../data/embeddings-44c73c23.parquet')

In [5]:
styles.shape, embeddings.shape

((44412, 40), (44412, 1281))

In [6]:
embeddings = embeddings.set_index('image')
styles = styles.set_index('id')

  return Index(sequences[0], name=names)


In [7]:
styles.index.isin(embeddings.index).sum(), embeddings.index.isin(styles.index).sum()

(44412, 44412)

In [8]:
holdout = pd.read_csv('../data/holdout_ids.csv')
holdout.head()

Unnamed: 0,id
0,23591
1,39604
2,49462
3,26809
4,56261


In [9]:
styles = styles[~styles.index.isin(holdout['id'])]
embeddings = embeddings[~embeddings.index.isin(holdout['id'])]

In [10]:
styles = styles.sort_index()
embeddings = embeddings.sort_index()

# Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(embeddings)
embeddings = pd.DataFrame(scaler.fit_transform(embeddings.values), columns=embeddings.columns, index=embeddings.index)

In [13]:
with open('../src/backend/ML-models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [14]:
embeddings.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1163,-0.559129,0.064624,-0.541653,0.494476,-0.732576,-0.364267,0.217243,-0.063711,-0.646313,-0.179434,...,-0.627197,0.06526,0.076759,0.114998,1.375037,2.669481,-0.539308,1.23573,-0.264875,-0.551288
1164,-0.766388,0.337273,-0.494332,0.815344,-0.737816,-0.310305,0.393708,0.110811,-0.465037,0.744875,...,-0.638863,-0.071575,0.001753,0.326641,1.232199,2.403879,-0.61521,1.178555,-0.179312,-0.542332
1165,0.001387,-0.152014,-0.213541,0.618311,-0.020924,-0.840245,-1.184028,-0.156743,-0.092964,-1.446525,...,0.190283,-0.624775,1.240647,-0.261998,-0.666391,-0.848931,-0.06692,1.264973,-0.255418,-0.544567
1525,1.538574,0.237531,-0.865232,-0.432264,-0.699365,-0.021174,0.358182,0.011606,-0.859797,1.771345,...,-0.424415,-0.100269,-1.089163,0.107877,-0.042046,-0.429806,0.405103,-0.404875,1.59821,-0.669227
1526,0.98972,0.304483,-0.677383,-0.325525,-0.689746,-0.702989,0.339682,-0.302013,-0.595768,0.823138,...,-0.56331,-1.299171,-0.895376,0.419114,0.449609,-0.27048,1.217311,-0.35899,-0.562421,-0.664526


# Nearest Neighbors

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_score, recall_score

def get_recall(y_true, y_pred, k=5):
    y_true = np.array(y_true.values)
    y_pred = np.array(y_pred.values)
    y_true = np.repeat(y_true, k)

    return recall_score(y_true.reshape(-1), y_pred.reshape(-1), average='weighted')

def get_precision(y_true, y_pred, k=5):
    y_true = np.array(y_true.values)
    y_pred = np.array(y_pred.values)
    y_true = np.repeat(y_true, k)

    return precision_score(y_true.reshape(-1), y_pred.reshape(-1), average='weighted')

def get_knn_score(X, y, k=5, metric='cosine', category='sub'):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    prec = []
    recall = []
    # mlflow.set_experiment('image-based recsys')
    with mlflow.start_run(experiment_id=mlflow.get_experiment_by_name('image-based recsys').experiment_id):
        
        mlflow.log_param('k', k)
        mlflow.log_param('category', category)
        mlflow.log_param('embeddings', 'EfficientNet_V2_S')
        mlflow.log_param('dataset_hash', '44c73c23')
        mlflow.log_param('metric', metric)
        
        for train_index, test_index in (progress := tqdm(skf.split(X, y), total=5, miniters=1)):
            progress.set_description(f'{metric=}')
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            knn = NearestNeighbors(n_neighbors=k, metric=metric)
            knn.fit(X_train)

            _, indices = knn.kneighbors(X_test)

            y_pred = y_train.iloc[indices.reshape(-1)]
            prec.append(get_precision(y_test, y_pred, k))
            recall.append(get_recall(y_test, y_pred, k))

        mlflow.log_metric(f'precision-at-{k}', np.mean(prec))
        mlflow.log_metric(f'recall-at-{k}', np.mean(recall))
        mlflow.sklearn.log_model(knn, 'knn-ranker')

    return {'precision': np.mean(prec), 'recall': np.mean(recall)}

In [14]:
K = 5
METRICS = ['cosine', 'euclidean']

scores = []

for metric in METRICS:
    score = get_knn_score(embeddings, styles['masterCategory'].cat.codes, k=K, metric=metric, category='master')
    scores.append(score)

scores = pd.DataFrame({'metric': METRICS, 'score': scores})
scores

metric='cosine': 100%|████████████████████████████████████████████████████████████████████| 5/5 [00:24<00:00,  4.84s/it]
metric='euclidean': 100%|█████████████████████████████████████████████████████████████████| 5/5 [00:43<00:00,  8.72s/it]


Unnamed: 0,metric,score
0,cosine,"{'precision': 0.9908139683533242, 'recall': 0...."
1,euclidean,"{'precision': 0.9902476837910734, 'recall': 0...."


In [15]:
K = 5
METRICS = ['cosine', 'euclidean']

scores = []

for metric in METRICS:
    score = get_knn_score(embeddings, styles['subCategory'].cat.codes, k=K, metric=metric, category='sub')
    scores.append(score)

scores = pd.DataFrame({'metric': METRICS, 'score': scores})
scores

metric='cosine': 100%|████████████████████████████████████████████████████████████████████| 5/5 [00:23<00:00,  4.74s/it]
metric='euclidean': 100%|█████████████████████████████████████████████████████████████████| 5/5 [00:39<00:00,  7.94s/it]


Unnamed: 0,metric,score
0,cosine,"{'precision': 0.9334580175735366, 'recall': 0...."
1,euclidean,"{'precision': 0.9310988692484546, 'recall': 0...."


In [16]:
knn = NearestNeighbors(n_neighbors=K, metric='euclidean')
knn.fit(embeddings)

NearestNeighbors(metric='euclidean')

In [17]:
s = styles.sample(1).index

sample_emb = embeddings.loc[s]

In [18]:
s

Int64Index([3843], dtype='int64', name='id')

In [19]:
pd.set_option('display.max_columns', 100)

In [20]:
styles.loc[s]

Unnamed: 0_level_0,price,discountedPrice,styleType,productTypeId,articleNumber,visualTag,productDisplayName,variantName,myntraRating,catalogAddDate,brandName,ageGroup,gender,baseColour,colour1,colour2,fashionType,season,year,usage,vat,displayCategories,weight,navigationId,landingPageUrl,articleAttributes,brandUserProfile,codEnabled,styleImages,lookGoodAlbum,style360Images,masterCategory,subCategory,articleType,isEMIEnabled,productDescriptors,styleOptions,colours,discountData
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
3843,1195,1195,P,323,10823-710-420,,Crocs Women Lady Gold Sandal,Lady WomenGold,1,1458193333,Crocs,Adults-Women,Women,Gold,,,Fashion,Summer,2011,Casual,14.5,"Footwear,Sale",0,0,Flats/Crocs/Crocs-Women-Lady-Gold-Sandal/3843/buy,{},{'uidx': '89d00bb2.93eb.4b42.a4d4.21fed5dcc7a8...,True,{'default': {'imageURL': 'http://assets.myntas...,{},{},Footwear,Shoes,Flats,True,"<p><p style=""text-align: justify;""><strong>Com...","[{'id': 13752, 'name': 'Size', 'value': 'W5', ...",{'colors': {'3840': {'dre_landing_page_url': '...,


In [21]:
print(styles.loc[s, 'styleImages'].iloc[0]['default']['imageURL'])

http://assets.myntassets.com/v1/images/style/properties/504a27acee8e6d89d7eec2fae5b5ef01_images.jpg


In [22]:
_, indices = knn.kneighbors(sample_emb)
indices

array([[ 1405,  1404,  1403, 23951, 24968]])

In [23]:
for i in indices[0]:
    print(styles.iloc[i]['styleImages']['default']['imageURL'])

http://assets.myntassets.com/v1/images/style/properties/504a27acee8e6d89d7eec2fae5b5ef01_images.jpg
http://assets.myntassets.com/v1/images/style/properties/19b8f384b805fe6a77f21b203703e0c5_images.jpg
http://assets.myntassets.com/v1/images/style/properties/38a2d7faa1b64859acdcd8d30bb03d07_images.jpg
http://assets.myntassets.com/v1/images/style/properties/46693e48cbce8e6c469ee94853441437_images.jpg
http://assets.myntassets.com/v1/images/style/properties/bc36a8b2565c3758ba7659ed46bb00a7_images.jpg
