In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import pickle
from tqdm import tqdm

# Experiment setup

In [2]:
# Point MLflow at the project-local tracking directory.
mlflow.set_tracking_uri('../mlflow')

# set_experiment() creates the experiment when it does not exist yet and makes
# it the active experiment in either case, so no existence check is needed.
# Calling it unconditionally also keeps re-runs of this notebook logging to
# the same experiment by default.
mlflow.set_experiment('image-based recsys')

# Data

In [3]:
# Load the pickled styles table.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open('../data/styles-44c73c23.pkl', 'rb') as styles_file:
    styles = pickle.load(styles_file)

In [4]:
# Columns removed from the styles table before any further processing.
to_drop = [
    'styleType',
    'productTypeId',
    'articleNumber',
    'visualTag',
    'myntraRating',
    'catalogAddDate',
    'colour1',
    'colour2',
    'vat',
    'weight',
    'navigationId',
    'landingPageUrl',
    'brandUserProfile',
    'codEnabled',
    'lookGoodAlbum',
    'style360Images',
    'isEMIEnabled',
    'styleOptions',
    'colours',
    'discountData',
    'articleAttributes',
]

styles = styles.drop(columns=to_drop)

In [5]:
def get_360x480(row):
    """Map every image slot of ``row`` to its ``'imageURL'`` entry.

    ``row`` is a mapping whose values are themselves mappings. The
    ``'size_representation'`` key is skipped; for every other key the value's
    ``'imageURL'`` field is kept, or ``None`` when that field is absent.

    Returns a new dict with the same key order as ``row``.
    """
    return {
        slot: details.get('imageURL', None)
        for slot, details in row.items()
        if slot != 'size_representation'
    }

In [6]:
# Reduce each styleImages entry to a {slot: imageURL} dict.
styles['images'] = styles['styleImages'].apply(get_360x480)

In [7]:
# The raw image metadata is no longer needed once 'images' has been derived.
styles = styles.drop(columns='styleImages')

In [8]:
# Load the precomputed embedding table from parquet.
embeddings = pd.read_parquet('../data/embeddings-44c73c23.parquet')

In [9]:
# Compare (rows, columns) of both tables; the row counts are expected to match.
styles.shape, embeddings.shape

((44412, 19), (44412, 1281))

In [10]:
# Key each table by its identifier column so the two can be compared by index.
styles = styles.set_index('id')
embeddings = embeddings.set_index('image')

  return Index(sequences[0], name=names)


In [11]:
# Ids reserved as a hold-out set; they are removed from both tables below.
holdout = pd.read_csv('../data/holdout_ids.csv')
holdout.head()

Unnamed: 0,id
0,23591
1,39604
2,49462
3,26809
4,56261


In [12]:
# Keep only the rows whose index is not listed in the hold-out ids.
styles = styles.loc[~styles.index.isin(holdout['id'])]
embeddings = embeddings.loc[~embeddings.index.isin(holdout['id'])]

In [13]:
# Fraction of each table's index that also appears in the other table.
(
    styles.index.isin(embeddings.index).mean(),
    embeddings.index.isin(styles.index).mean(),
)

(1.0, 1.0)

In [15]:
# Sort both tables by index so that their rows can be compared position by position.
embeddings = embeddings.sort_index()
styles = styles.sort_index()
styles.head()

Unnamed: 0_level_0,price,discountedPrice,productDisplayName,variantName,brandName,ageGroup,gender,baseColour,fashionType,season,year,usage,displayCategories,masterCategory,subCategory,articleType,productDescriptors,images
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1163,895,895,Nike Sahara Team India Fanwear Round Neck Jersey,Roundneck Jersey,NIKE,Adults-Men,Men,Blue,Fashion,Summer,2011,Sports,"Sports Wear,Sale",Apparel,Topwear,Tshirts,"<p>Blue round neck Sahara Team India jersey, h...",{'default': 'http://assets.myntassets.com/v1/i...
1164,1595,1595,Nike Men Blue T20 Indian Cricket Jersey,Collared Jersey,NIKE,Adults-Men,Men,Blue,Fashion,Winter,2015,Sports,Sports Wear,Apparel,Topwear,Tshirts,<p><strong>Composition</strong><br />Blue cott...,{'default': 'http://assets.myntassets.com/v1/i...
1165,2495,2495,Nike Mean Team India Cricket Jersey,Authentic Jersey,NIKE,Adults-Men,Men,Blue,Fashion,Summer,2013,Sports,Sports Wear,Apparel,Topwear,Tshirts,"<p>Blue jersey with a tipped collar, concealed...",{'default': 'http://assets.myntassets.com/asse...
1525,1299,1299,Puma Deck Navy Blue Backpack,Deck Backpack,PUMA,Adults-Unisex,Unisex,Navy Blue,Fashion,Fall,2010,Casual,Accessories,Accessories,Bags,Backpacks,<p>asfafaf<br> kasjhdkashd</p>,{'default': 'http://assets.myntassets.com/v1/i...
1526,1299,1299,Puma Big Cat Backpack Black,Big Cat Backpack,PUMA,Adults-Unisex,Unisex,Black,Fashion,Fall,2010,Sports,Accessories,Accessories,Bags,Backpacks,"<p style=""text-align: justify;""><br />1. Polye...",{'default': 'http://assets.myntassets.com/v1/i...


In [16]:
# Element-wise index comparison: True only when both tables hold the same
# ids in the same order.
all(styles.index == embeddings.index)

True

# Scaling

In [17]:
from sklearn.preprocessing import StandardScaler

# Standardise every embedding column. fit_transform() both fits the scaler and
# transforms the data, so a separate fit() beforehand would only be
# overwritten (and would scan the full matrix a second time).
# The scaler is fitted on the raw array, so it carries no feature names.
scaler = StandardScaler()
embeddings = pd.DataFrame(
    scaler.fit_transform(embeddings.values),
    columns=embeddings.columns,
    index=embeddings.index,
)

In [18]:
# Persist the fitted scaler next to the backend's other model artefacts.
with open('../src/backend/ML-models/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [19]:
# Preview the standardised embeddings (still indexed by image id).
embeddings.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1163,-0.559129,0.064624,-0.541653,0.494476,-0.732576,-0.364267,0.217243,-0.063711,-0.646313,-0.179434,...,-0.627197,0.06526,0.076759,0.114998,1.375037,2.669481,-0.539308,1.23573,-0.264875,-0.551288
1164,-0.766388,0.337273,-0.494332,0.815344,-0.737816,-0.310305,0.393708,0.110811,-0.465037,0.744875,...,-0.638863,-0.071575,0.001753,0.326641,1.232199,2.403879,-0.61521,1.178555,-0.179312,-0.542332
1165,0.001387,-0.152014,-0.213541,0.618311,-0.020924,-0.840245,-1.184028,-0.156743,-0.092964,-1.446525,...,0.190283,-0.624775,1.240647,-0.261998,-0.666391,-0.848931,-0.06692,1.264973,-0.255418,-0.544567
1525,1.538574,0.237531,-0.865232,-0.432264,-0.699365,-0.021174,0.358182,0.011606,-0.859797,1.771345,...,-0.424415,-0.100269,-1.089163,0.107877,-0.042046,-0.429806,0.405103,-0.404875,1.59821,-0.669227
1526,0.98972,0.304483,-0.677383,-0.325525,-0.689746,-0.702989,0.339682,-0.302013,-0.595768,0.823138,...,-0.56331,-1.299171,-0.895376,0.419114,0.449609,-0.27048,1.217311,-0.35899,-0.562421,-0.664526


# Reset Index to Match KNN

In [20]:
# Replace the image-id index with a 0..n-1 positional index (the ids are
# discarded). Presumably this is so that row labels coincide with the row
# positions NearestNeighbors.kneighbors returns — confirm.
embeddings = embeddings.reset_index(drop=True)
embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
0,-0.559129,0.064624,-0.541653,0.494476,-0.732576,-0.364267,0.217243,-0.063711,-0.646313,-0.179434,...,-0.627197,0.06526,0.076759,0.114998,1.375037,2.669481,-0.539308,1.23573,-0.264875,-0.551288
1,-0.766388,0.337273,-0.494332,0.815344,-0.737816,-0.310305,0.393708,0.110811,-0.465037,0.744875,...,-0.638863,-0.071575,0.001753,0.326641,1.232199,2.403879,-0.61521,1.178555,-0.179312,-0.542332
2,0.001387,-0.152014,-0.213541,0.618311,-0.020924,-0.840245,-1.184028,-0.156743,-0.092964,-1.446525,...,0.190283,-0.624775,1.240647,-0.261998,-0.666391,-0.848931,-0.06692,1.264973,-0.255418,-0.544567
3,1.538574,0.237531,-0.865232,-0.432264,-0.699365,-0.021174,0.358182,0.011606,-0.859797,1.771345,...,-0.424415,-0.100269,-1.089163,0.107877,-0.042046,-0.429806,0.405103,-0.404875,1.59821,-0.669227
4,0.98972,0.304483,-0.677383,-0.325525,-0.689746,-0.702989,0.339682,-0.302013,-0.595768,0.823138,...,-0.56331,-1.299171,-0.895376,0.419114,0.449609,-0.27048,1.217311,-0.35899,-0.562421,-0.664526


In [21]:
# Same positional re-indexing for the styles table; the original 'id' index
# is dropped, so rows are identified by position from here on.
styles = styles.reset_index(drop=True)
styles.head()

Unnamed: 0,price,discountedPrice,productDisplayName,variantName,brandName,ageGroup,gender,baseColour,fashionType,season,year,usage,displayCategories,masterCategory,subCategory,articleType,productDescriptors,images
0,895,895,Nike Sahara Team India Fanwear Round Neck Jersey,Roundneck Jersey,NIKE,Adults-Men,Men,Blue,Fashion,Summer,2011,Sports,"Sports Wear,Sale",Apparel,Topwear,Tshirts,"<p>Blue round neck Sahara Team India jersey, h...",{'default': 'http://assets.myntassets.com/v1/i...
1,1595,1595,Nike Men Blue T20 Indian Cricket Jersey,Collared Jersey,NIKE,Adults-Men,Men,Blue,Fashion,Winter,2015,Sports,Sports Wear,Apparel,Topwear,Tshirts,<p><strong>Composition</strong><br />Blue cott...,{'default': 'http://assets.myntassets.com/v1/i...
2,2495,2495,Nike Mean Team India Cricket Jersey,Authentic Jersey,NIKE,Adults-Men,Men,Blue,Fashion,Summer,2013,Sports,Sports Wear,Apparel,Topwear,Tshirts,"<p>Blue jersey with a tipped collar, concealed...",{'default': 'http://assets.myntassets.com/asse...
3,1299,1299,Puma Deck Navy Blue Backpack,Deck Backpack,PUMA,Adults-Unisex,Unisex,Navy Blue,Fashion,Fall,2010,Casual,Accessories,Accessories,Bags,Backpacks,<p>asfafaf<br> kasjhdkashd</p>,{'default': 'http://assets.myntassets.com/v1/i...
4,1299,1299,Puma Big Cat Backpack Black,Big Cat Backpack,PUMA,Adults-Unisex,Unisex,Black,Fashion,Fall,2010,Sports,Accessories,Accessories,Bags,Backpacks,"<p style=""text-align: justify;""><br />1. Polye...",{'default': 'http://assets.myntassets.com/v1/i...


In [22]:
# After the reset both indexes are positional, so this only confirms that the
# two tables have matching positional labels.
all(styles.index == embeddings.index)

True

In [23]:
# Persist the processed styles table and the scaled embeddings.
with open('../data/styles-76a25060.pkl', 'wb') as styles_out:
    pickle.dump(styles, styles_out)

embeddings.to_parquet('../data/embeddings-76a25060.parquet')

# Nearest Neighbors

In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_score, recall_score

def get_recall(y_true, y_pred, k=5):
    """Weighted recall of the retrieved neighbours' labels.

    Each entry of ``y_true`` is repeated ``k`` times so that it lines up with
    the ``k`` consecutive entries of ``y_pred`` retrieved for that query.
    Both arguments must expose ``.values``.
    """
    expected = np.repeat(np.array(y_true.values), k).reshape(-1)
    retrieved = np.array(y_pred.values).reshape(-1)
    return recall_score(expected, retrieved, average='weighted')

def get_precision(y_true, y_pred, k=5):
    """Weighted precision of the retrieved neighbours' labels.

    Each entry of ``y_true`` is repeated ``k`` times so that it lines up with
    the ``k`` consecutive entries of ``y_pred`` retrieved for that query.
    Both arguments must expose ``.values``.
    """
    expected = np.repeat(np.array(y_true.values), k).reshape(-1)
    retrieved = np.array(y_pred.values).reshape(-1)
    return precision_score(expected, retrieved, average='weighted')

def get_knn_score(X, y, k=5, metric='cosine', category='sub', n_splits=5):
    """Cross-validate a nearest-neighbour retriever and log the run to MLflow.

    For every stratified fold a ``NearestNeighbors`` index is fitted on the
    training rows, the ``k`` nearest training rows are retrieved for each test
    row, and the labels of those neighbours are scored against the test row's
    own label with ``get_precision`` / ``get_recall``.

    Parameters
    ----------
    X : DataFrame of feature vectors, addressed positionally via ``.iloc``.
    y : Series of labels aligned with ``X``; also used for stratification.
    k : number of neighbours retrieved per query.
    metric : distance metric passed to ``NearestNeighbors``.
    category : free-form tag logged as the ``category`` run parameter.
    n_splits : number of stratified folds (default 5, the previously
        hard-coded value).

    Returns
    -------
    dict with the fold-averaged ``'precision'`` and ``'recall'``.

    Side effects: starts an MLflow run in the 'image-based recsys' experiment
    (which must already exist), logs the parameters and the two averaged
    metrics, and logs the model fitted on the last fold as 'knn-ranker'.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    prec = []
    recall = []

    experiment = mlflow.get_experiment_by_name('image-based recsys')
    with mlflow.start_run(experiment_id=experiment.experiment_id):

        mlflow.log_param('k', k)
        mlflow.log_param('category', category)
        mlflow.log_param('embeddings', 'EfficientNet_V2_S')
        # NOTE(review): hard-coded value; confirm it identifies the dataset
        # that is actually passed in as X.
        mlflow.log_param('dataset_hash', '57d051f0')
        mlflow.log_param('metric', metric)

        # total mirrors n_splits so the progress bar always matches the
        # number of folds actually produced.
        for train_index, test_index in (progress := tqdm(skf.split(X, y), total=n_splits, miniters=1)):
            progress.set_description(f'{metric=}')
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            knn = NearestNeighbors(n_neighbors=k, metric=metric)
            knn.fit(X_train.values)

            # indices are row positions inside X_train, one row of k
            # neighbours per test sample.
            _, indices = knn.kneighbors(X_test.values)

            # Row-major flattening keeps the k neighbours of each test sample
            # adjacent, matching the np.repeat(y_true, k) layout used by the
            # metric helpers.
            y_pred = y_train.iloc[indices.reshape(-1)]
            prec.append(get_precision(y_test, y_pred, k))
            recall.append(get_recall(y_test, y_pred, k))

        mean_precision = np.mean(prec)
        mean_recall = np.mean(recall)

        mlflow.log_metric(f'precision-at-{k}', mean_precision)
        mlflow.log_metric(f'recall-at-{k}', mean_recall)
        # The logged model is the one fitted on the last fold's training split.
        mlflow.sklearn.log_model(knn, 'knn-ranker')

    return {'precision': mean_precision, 'recall': mean_recall}

In [25]:
K = 5
METRICS = ['cosine', 'euclidean']

# Integer codes of the categorical masterCategory column serve as labels.
master_codes = styles['masterCategory'].cat.codes

# One cross-validated MLflow run per distance metric.
scores = pd.DataFrame({
    'metric': METRICS,
    'score': [
        get_knn_score(embeddings, master_codes, k=K, metric=metric, category='master')
        for metric in METRICS
    ],
})
scores

metric='cosine': 100%|████████████████████████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.66s/it]
metric='euclidean': 100%|█████████████████████████████████████████████████████████████████| 5/5 [00:48<00:00,  9.71s/it]


Unnamed: 0,metric,score
0,cosine,"{'precision': 0.9908139683533242, 'recall': 0...."
1,euclidean,"{'precision': 0.9902476837910734, 'recall': 0...."


In [26]:
# Fit the final index on the full embeddings table.
# NOTE(review): the metric is hard-coded to 'euclidean' although both cosine
# and euclidean were cross-validated above — confirm this choice is intended.
knn = NearestNeighbors(n_neighbors=K, metric='euclidean')
knn.fit(embeddings)

NearestNeighbors(metric='euclidean')

In [27]:
# Pick one row label at random (no random_state is set, so the pick changes
# between runs) and fetch the embedding stored under the same label.
s = styles.sample(1).index

sample_emb = embeddings.loc[s]

In [28]:
# Show which row label was sampled.
s

Int64Index([34333], dtype='int64')

In [29]:
# Allow up to 100 columns in rendered frames so wide rows are shown in full.
pd.set_option('display.max_columns', 100)

In [30]:
# Metadata of the sampled query item.
styles.loc[s]

Unnamed: 0,price,discountedPrice,productDisplayName,variantName,brandName,ageGroup,gender,baseColour,fashionType,season,year,usage,displayCategories,masterCategory,subCategory,articleType,productDescriptors,images
34333,899,899,Prafful Women Black & Grren Sari,Prafful Sari,UNK,Adults-Women,Women,Black,Fashion,Fall,2012,Ethnic,,Apparel,Saree,Sarees,<p>Black sari with green accents</p>,{'default': 'http://assets.myntassets.com/v1/i...


In [31]:
# URL stored under the 'default' key of the query item's images dict.
print(styles.loc[s, 'images'].iloc[0]['default'])

http://assets.myntassets.com/v1/images/style/properties/Prafful-Multi-Coloured-Sari_4b6bb3490078cfde92530a70f36cdcb4_images.jpg


In [33]:
# Retrieve the K nearest rows for the sampled embedding; distances are
# discarded. The query row is itself part of the fitted data, so it is
# expected to appear among the results.
_, indices = knn.kneighbors(sample_emb)
indices

array([[34333, 35315, 31279, 31383, 30197]])

In [34]:
# Print the 'default' image URL of every retrieved neighbour, in rank order.
for neighbour_images in styles['images'].iloc[indices[0]]:
    print(neighbour_images['default'])

http://assets.myntassets.com/v1/images/style/properties/Prafful-Multi-Coloured-Sari_4b6bb3490078cfde92530a70f36cdcb4_images.jpg
http://assets.myntassets.com/v1/images/style/properties/FNF-Purple---Blue-Printed-Sari_c1800bc551ddac6e9c116402a11448cd_images.jpg
http://assets.myntassets.com/v1/images/style/properties/Fashion-N-Fabrics-Orange---Purple-Sari_06451f612d368907a8bc23eb65701e9c_images.jpg
http://assets.myntassets.com/v1/images/style/properties/Fashion-N-Fabrics-Multi-Coloured-Sari_43f8ff5e48908d59103bbe0827a40727_images.jpg
http://assets.myntassets.com/v1/images/style/properties/bb960aa2fd903f3cd8c458ba1147a2a9_images.jpg


In [37]:
# Persist the fitted nearest-neighbour index for the backend.
with open('../src/backend/ML-models/ranker.pkl', 'wb') as ranker_file:
    pickle.dump(knn, ranker_file)