In [1]:
import json
import numpy as np
import random
import pandas as pd
from collections import Counter
import itertools
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from numpy import mean
from numpy import std
import heapq
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 10)

In [2]:
def DataGenerator(n=100, seed=None):
    """Generate ``n`` synthetic orders pairing a first item with a co-purchased item.

    The inventory is every colour/type combination, labelled ``"<colour>_<type>"``.
    Each inventory item gets its own sparse Dirichlet distribution over the
    inventory, which imposes an artificial "bought together" trend.

    Parameters
    ----------
    n : int
        Number of orders to generate.
    seed : int or None
        When given, sampling uses private generators seeded with this value, so
        the result is reproducible and the global ``random`` / ``np.random``
        state is left untouched. When ``None`` (default) the module-level
        generators are used, exactly as before.

    Returns
    -------
    (order_list, plotting_list) : tuple of dict
        Both are keyed by ``"order<i>_ID"`` (1-based). ``order_list`` values are
        ``{"type", "colour", "together_with"}``; ``plotting_list`` values are
        ``{"item", "together_with"}`` with the first item kept as one label.
    """
    if seed is None:
        # Module-level generators: same calls and stream as the unseeded original.
        py_rng, np_rng = random, np.random
    else:
        py_rng, np_rng = random.Random(seed), np.random.RandomState(seed)

    order_list = {}
    plotting_list = {}
    type_list = ["t-shirt","trousers","jeans","jacket","trainers","boots","sweater","tracksuit","shirt"]
    colour_list = ["red", "yellow", "orange", "green", "blue", "brown", "beige", "black", "white", "dark-gray", "light-gray"]
    # Labels are "<colour>_<type>"; colours and types only use hyphens, so the
    # underscore is an unambiguous separator for the split below.
    inventory = ["_".join(pair[::-1]) for pair in itertools.product(type_list,colour_list)]
    # One row of co-purchase probabilities per item; the small concentration
    # (1/25) makes each row sparse, i.e. imposes an artificial trend.
    p = np_rng.dirichlet(np.ones(len(inventory))/25,size=len(inventory))
    probability_table = dict(zip(inventory, p))
    for i in range(n):
        orderID = f"order{i+1}_ID"
        # Kept as sample(..., 1)[0] so the unseeded path consumes the global
        # `random` stream exactly as it did before.
        item1 = py_rng.sample(inventory,1)[0]  # "<colour>_<type>"
        col,tp = item1.split("_")
        # Cast to a plain str (np.random returns a NumPy string scalar).
        item2 = str(np_rng.choice(inventory,p=probability_table[item1]))  # "<colour>_<type>"
        order_list[orderID]={"type": tp,"colour":col, "together_with":item2}
        plotting_list[orderID] = {"item":item1,"together_with":item2}
    return order_list, plotting_list

In [3]:
# Generate 30,000 synthetic orders, then serialise the order dict to JSON text.
# NOTE: after this cell `order_list` is a str (JSON), no longer a dict;
# `plotting_list` is still a dict at this point.
order_list, plotting_list = DataGenerator(n=30000)
order_list = json.dumps(order_list)

In [4]:
def jsonParser(item_list):
    """Decode JSON text and return the resulting Python object."""
    return json.loads(item_list)

In [5]:
def jsonToDataFrame(parsed_json):
    """Build a DataFrame from a dict of records, one row per outer key.

    ``DataFrame.from_dict`` puts the outer keys on the columns by default, so
    the frame is transposed to get outer keys as the index and inner keys as
    the columns.
    """
    keyed_by_column = pd.DataFrame.from_dict(parsed_json)
    return keyed_by_column.transpose()

In [6]:
def DFTrainPrep(df):
    """One-hot encode the ``type`` and ``colour`` columns of ``df``.

    A fresh ``OneHotEncoder`` is fitted on every call and is not returned;
    only the dense encoded array is.
    """
    encoder = OneHotEncoder()
    categorical_columns = df[["type","colour"]]
    encoded = encoder.fit_transform(categorical_columns)
    return encoded.toarray()

In [7]:
# Decode the JSON text back into a dict and build a DataFrame with one row per
# order (index = order id; columns = type, colour, together_with).
parsed_inp = jsonParser(order_list) 
df_inp= jsonToDataFrame(parsed_inp)
df_inp.head()

Unnamed: 0,type,colour,together_with
order1_ID,jeans,blue,yellow_trousers
order2_ID,shirt,orange,blue_t-shirt
order3_ID,boots,red,beige_shirt
order4_ID,t-shirt,white,brown_sweater
order5_ID,jeans,light-gray,light-gray_boots


In [8]:
# Count how often each (type, colour, together_with) combination occurs;
# the size column produced by reset_index() is renamed from 0 to 'count'.
grp = df_inp.groupby(["type","colour","together_with"]).size().reset_index().rename(columns={0:'count'})

In [9]:
# Display the grouped counts.
grp

Unnamed: 0,type,colour,together_with,count
0,boots,beige,beige_jeans,19
1,boots,beige,beige_shirt,8
2,boots,beige,black_jacket,1
3,boots,beige,black_t-shirt,13
4,boots,beige,blue_trainers,14
...,...,...,...,...
1638,trousers,yellow,red_jacket,11
1639,trousers,yellow,white_trousers,114
1640,trousers,yellow,yellow_boots,1
1641,trousers,yellow,yellow_t-shirt,1


In [10]:
# Same JSON round-trip for the plotting data, which keeps the first item as a
# single "<colour>_<type>" label instead of separate type/colour fields.
# NOTE: `plotting_list` is rebound from a dict to JSON text here, so re-running
# this cell on its own would JSON-encode the already-encoded string.
plotting_list = json.dumps(plotting_list)
parsed_plot = jsonParser(plotting_list) 
df_plot= jsonToDataFrame(parsed_plot)
df_plot.head()

Unnamed: 0,item,together_with
order1_ID,blue_jeans,yellow_trousers
order2_ID,orange_shirt,blue_t-shirt
order3_ID,red_boots,beige_shirt
order4_ID,white_t-shirt,brown_sweater
order5_ID,light-gray_jeans,light-gray_boots


In [11]:
# Count occurrences of each (item, together_with) pair; the size column is
# renamed from 0 to 'count'.
grp_plot = df_plot.groupby(["item","together_with"]).size().reset_index().rename(columns={0:'count'})

In [12]:
# df_heatmap = grp_plot.pivot( "together_with","item", "count")
# plt.figure(figsize=(30,30))
# sns.heatmap(data=df_heatmap,annot=True)

In [13]:
# Target: the co-purchased item label encoded as an integer class id.
# Features: one-hot encoding of the first item's type and colour.
# NOTE(review): in the cells visible here, X and y are only consumed by the
# model cells that are currently commented out.
le = LabelEncoder()
y = le.fit_transform(df_inp["together_with"])
X = DFTrainPrep(df_inp)

## LOGREG

In [14]:
# model = LogisticRegression(multi_class='multinomial', solver='saga', penalty='none')
# model.fit(X,y)

In [15]:
# import operator
# for i in range(5,10):
#     example = i
#     pred = model.predict(X[example].reshape(1,-1))
#     prob = model.predict_proba(X[example].reshape(1,-1))
#     print(pred, "Prediction: ", le.inverse_transform(pred),"OG",y[example],le.inverse_transform([y[example]]))
#     print(list(zip(*heapq.nlargest(5, enumerate(prob[0]), key=operator.itemgetter(1)))))

In [16]:
# def evaluate_model(model, X, y):
# 	# define the evaluation procedure
# 	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# 	# evaluate the model
# 	scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# 	return scores

# models = LogisticRegression(multi_class='multinomial', solver='saga', penalty='none')
# scores = evaluate_model(models, X, y)
# print(f"mean (std) acc : {mean(scores)} ({std(scores)})")

## NB

In [17]:
# def evaluate_model(model, X, y):
# 	# define the evaluation procedure
# 	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# 	# evaluate the model
# 	scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# 	return scores

# models = CategoricalNB()
# #evaluate the model and collect the scores
# scores = evaluate_model(models, X, y)
# scores

## NN

In [18]:
# import tensorflow as tf
# from tensorflow import keras
# from keras.utils import np_utils
# from keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold
# tf.keras.backend.clear_session()

# X_tensor = tf.convert_to_tensor(X)
# y_tensor = tf.convert_to_tensor(y)
# y_cat = np_utils.to_categorical(y)

In [19]:
# def baseline_model():
#     model_NN = keras.models.Sequential()
#     model_NN.add(keras.layers.Dense(10,input_dim=20,activation="relu"))
#     model_NN.add(keras.layers.Dense(256,activation="relu"))
#     model_NN.add(keras.layers.Dropout(0.1))
#     model_NN.add(keras.layers.Dense(256,activation="relu"))
#     model_NN.add(keras.layers.Dropout(0.1))
#     model_NN.add(keras.layers.Dense(99,activation="sigmoid"))
#     model_NN.summary()
#     model_NN.compile(optimizer='adam', 
#               loss="categorical_crossentropy",
#               metrics=['accuracy'])
#     return model_NN                                 

In [20]:
# estimator = KerasClassifier(build_fn=baseline_model, epochs=50, batch_size=5, verbose=1)


In [21]:
# estimator.fit(X,y_cat)

In [22]:
# pred = estimator.predict(X[:20])
# print(pred,
# y[:20])

## Estimate

In [14]:
# Item x together_with count matrix; pairs that never occurred become 0.
# Keyword arguments are required: pandas 2.x no longer accepts positional
# arguments to DataFrame.pivot.
df_heatmap = grp_plot.pivot(index="item", columns="together_with", values="count").fillna(0)
# Add-one smoothing so every pair keeps a non-zero probability.
df_heatmap += 1 # non-zero probability

In [15]:
# Display the smoothed item x together_with count matrix.
df_heatmap

together_with,beige_boots,beige_jacket,beige_jeans,beige_shirt,beige_sweater,beige_t-shirt,beige_tracksuit,beige_trainers,beige_trousers,black_boots,...,white_trousers,yellow_boots,yellow_jacket,yellow_jeans,yellow_shirt,yellow_sweater,yellow_t-shirt,yellow_tracksuit,yellow_trainers,yellow_trousers
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
beige_boots,1.0,1.0,20.0,9.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,120.0,1.0,1.0,1.0,1.0,1.0
beige_jacket,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,3.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,4.0
beige_jeans,1.0,1.0,16.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
beige_shirt,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,100.0,102.0,1.0,1.0,1.0,1.0,1.0,7.0,1.0,1.0
beige_sweater,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,...,1.0,1.0,1.0,2.0,1.0,1.0,1.0,4.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow_sweater,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,4.0,1.0,1.0,4.0,31.0,1.0,6.0
yellow_t-shirt,1.0,1.0,15.0,1.0,1.0,1.0,1.0,1.0,2.0,40.0,...,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
yellow_tracksuit,4.0,5.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
yellow_trainers,1.0,1.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,18.0,1.0,1.0,1.0,3.0


In [16]:
def GetCount(df,item_name,together):
    """Return the ``together`` column restricted to rows labelled ``item_name``.

    The result is a Series (empty when no index label matches), not a scalar.
    """
    row_mask = df.index == item_name
    return df.loc[row_mask, f"{together}"]
def GetItemRow(df,item_name):
    """Return the rows of ``df`` whose index label equals ``item_name``.

    The result is always a DataFrame (empty when no label matches).
    """
    row_mask = df.index == item_name
    return df.loc[row_mask]

In [17]:
# Item labels in the row order of the count matrix.
inventory = list(df_heatmap.index)

In [20]:
# Divide every row of smoothed counts by its row total so each row sums to 1:
# an estimate of P(together_with | item).
probability_table_estimate = df_heatmap.div(df_heatmap.sum(axis=1), axis=0)
probability_table_estimate

together_with,beige_boots,beige_jacket,beige_jeans,beige_shirt,beige_sweater,beige_t-shirt,beige_tracksuit,beige_trainers,beige_trousers,black_boots,...,white_trousers,yellow_boots,yellow_jacket,yellow_jeans,yellow_shirt,yellow_sweater,yellow_t-shirt,yellow_tracksuit,yellow_trainers,yellow_trousers
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
beige_boots,0.002519,0.002519,0.050378,0.022670,0.002519,0.002519,0.002519,0.002519,0.002519,0.002519,...,0.002519,0.002519,0.002519,0.002519,0.302267,0.002519,0.002519,0.002519,0.002519,0.002519
beige_jacket,0.002558,0.002558,0.002558,0.002558,0.002558,0.002558,0.005115,0.002558,0.002558,0.002558,...,0.007673,0.002558,0.002558,0.002558,0.007673,0.002558,0.002558,0.002558,0.002558,0.010230
beige_jeans,0.002457,0.002457,0.039312,0.002457,0.002457,0.002457,0.004914,0.002457,0.002457,0.002457,...,0.004914,0.002457,0.004914,0.002457,0.002457,0.002457,0.002457,0.002457,0.002457,0.002457
beige_shirt,0.002451,0.002451,0.046569,0.002451,0.002451,0.002451,0.002451,0.002451,0.002451,0.002451,...,0.245098,0.250000,0.002451,0.002451,0.002451,0.002451,0.002451,0.017157,0.002451,0.002451
beige_sweater,0.002451,0.002451,0.002451,0.002451,0.002451,0.002451,0.002451,0.002451,0.004902,0.002451,...,0.002451,0.002451,0.002451,0.004902,0.002451,0.002451,0.002451,0.009804,0.002451,0.002451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow_sweater,0.002353,0.002353,0.002353,0.002353,0.002353,0.002353,0.002353,0.002353,0.002353,0.002353,...,0.002353,0.002353,0.002353,0.009412,0.002353,0.002353,0.009412,0.072941,0.002353,0.014118
yellow_t-shirt,0.002481,0.002481,0.037221,0.002481,0.002481,0.002481,0.002481,0.002481,0.004963,0.099256,...,0.002481,0.014888,0.002481,0.002481,0.002481,0.002481,0.002481,0.002481,0.002481,0.002481
yellow_tracksuit,0.009368,0.011710,0.002342,0.002342,0.002342,0.002342,0.002342,0.004684,0.002342,0.002342,...,0.004684,0.002342,0.002342,0.002342,0.002342,0.002342,0.002342,0.002342,0.002342,0.002342
yellow_trainers,0.002475,0.002475,0.002475,0.014851,0.002475,0.002475,0.002475,0.002475,0.002475,0.002475,...,0.002475,0.002475,0.002475,0.002475,0.002475,0.044554,0.002475,0.002475,0.002475,0.007426


In [29]:
# Inspect the estimated distribution for one item as a flat NumPy array
# ([0] takes the single matching row).
GetItemRow(probability_table_estimate, "beige_boots").to_numpy()[0]

array([0.00251889, 0.00251889, 0.05037783, 0.02267003, 0.00251889,
       0.00251889, 0.00251889, 0.00251889, 0.00251889, 0.00251889,
       0.00503778, 0.00251889, 0.00251889, 0.00251889, 0.03526448,
       0.00251889, 0.00251889, 0.00251889, 0.00251889, 0.00251889,
       0.00251889, 0.00251889, 0.00251889, 0.00251889, 0.00251889,
       0.03778338, 0.00251889, 0.00251889, 0.00251889, 0.00251889,
       0.00251889, 0.00251889, 0.00251889, 0.00251889, 0.00251889,
       0.00251889, 0.00251889, 0.00503778, 0.00251889, 0.00251889,
       0.00251889, 0.00251889, 0.00251889, 0.00503778, 0.00251889,
       0.01007557, 0.00503778, 0.00251889, 0.00251889, 0.18891688,
       0.00251889, 0.00251889, 0.00503778, 0.00251889, 0.00251889,
       0.00251889, 0.00251889, 0.00251889, 0.00251889, 0.00251889,
       0.00251889, 0.00251889, 0.00251889, 0.00251889, 0.03778338,
       0.0302267 , 0.00251889, 0.00251889, 0.03274559, 0.00251889,
       0.00251889, 0.00251889, 0.00251889, 0.00251889, 0.00251

In [37]:
def GetRecommendations(prob_table,currently_viewed_item,n=5):
    """Sample up to ``n`` distinct recommendations for ``currently_viewed_item``.

    Parameters
    ----------
    prob_table : pandas.DataFrame
        Rows are items, columns are candidate co-purchase items, values are
        non-negative weights. Rows need not sum to 1; the row is renormalised
        after the viewed item is excluded.
    currently_viewed_item : label
        Row label to recommend for. It is never recommended itself.
    n : int
        Maximum number of recommendations.

    Returns
    -------
    list
        Distinct column labels drawn without replacement, weighted by the
        item's row. Shorter than ``n`` when fewer candidates have a positive
        weight; empty when ``n <= 0``.

    Raises
    ------
    IndexError
        If ``currently_viewed_item`` is not a row label of ``prob_table``
        (same exception type the positional row lookup raised before).
    """
    row = prob_table[prob_table.index == currently_viewed_item]
    if row.empty:
        raise IndexError(
            f"{currently_viewed_item!r} is not an item (row label) of prob_table"
        )

    # The probabilities run over the table's COLUMNS, so the candidate labels
    # must come from the columns too, not from the index.
    candidates = np.asarray(prob_table.columns, dtype=object)
    weights = row.to_numpy(dtype=float)[0].copy()

    # Never recommend the item being viewed.
    weights[candidates == currently_viewed_item] = 0.0

    # Cap at the number of reachable candidates. Rejection sampling would spin
    # forever when asked for more distinct items than exist.
    n_picks = min(n, int(np.count_nonzero(weights)))
    if n_picks <= 0:
        return []

    weights /= weights.sum()
    # A weighted draw without replacement has the same distribution as
    # redrawing until an unseen item comes up.
    picks = np.random.choice(candidates, size=n_picks, replace=False, p=weights)
    return picks.tolist()

In [42]:
# Example: sample recommendations (n defaults to 5) for a beige shirt.
GetRecommendations(probability_table_estimate,"beige_shirt")

['green_jacket',
 'yellow_boots',
 'beige_t-shirt',
 'white_trousers',
 'white_jeans']