In [1]:
import json
import numpy as np
import random
import pandas as pd
from collections import Counter
import itertools
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from numpy import mean
from numpy import std
import heapq
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 10)

In [2]:
def DataGenerator(n=100, seed=None):
    """Generate synthetic two-item orders with an artificial co-purchase trend.

    Every inventory item (named ``"<colour>_<type>"``) is assigned its own
    probability vector over the whole inventory, drawn from a Dirichlet
    distribution with concentration 1/25 per component. For each order a
    first item is picked uniformly at random, then a companion item is
    sampled from the first item's probability vector.

    Parameters
    ----------
    n : int, default 100
        Number of orders to generate.
    seed : int or None, default None
        When given, both random sources are seeded with local generators so
        the output is reproducible and the global ``random`` / ``np.random``
        state is left untouched. When ``None`` the global generators are used.

    Returns
    -------
    order_list : dict
        ``{"order<i>_ID": {"type": ..., "colour": ..., "together_with": ...}}``
    plotting_list : dict
        ``{"order<i>_ID": {"item": ..., "together_with": ...}}`` where
        ``item`` is the full ``"<colour>_<type>"`` name of the first item.
    """
    type_list = ["t-shirt","trousers","jeans","jacket","trainers","boots","sweater","tracksuit","shirt"]
    colour_list = ["red", "yellow", "orange", "green", "blue", "brown", "beige", "black", "white", "dark-gray", "light-gray"]
    # product() yields (type, colour); reversing each pair gives "colour_type".
    inventory = ["_".join(pair[::-1]) for pair in itertools.product(type_list, colour_list)]

    # Local generators keep a seeded call reproducible without touching the
    # global random state; without a seed, fall back to the module-level ones.
    if seed is None:
        py_rng, np_rng = random, np.random
    else:
        py_rng, np_rng = random.Random(seed), np.random.RandomState(seed)

    # One probability row per item: impose artificial trend
    p = np_rng.dirichlet(np.ones(len(inventory)) / 25, size=len(inventory))
    probability_table = dict(zip(inventory, p))

    order_list = {}
    plotting_list = {}
    for i in range(n):
        orderID = f"order{i+1}_ID"
        item1 = py_rng.choice(inventory)  # "colour_type"
        col, tp = item1.split("_")
        # Cast to a plain str so the result does not carry a NumPy string type.
        item2 = str(np_rng.choice(inventory, p=probability_table[item1]))  # "colour_type"
        order_list[orderID] = {"type": tp, "colour": col, "together_with": item2}
        plotting_list[orderID] = {"item": item1, "together_with": item2}
    return order_list, plotting_list

In [3]:
# Generate 30,000 synthetic orders: order_list splits the first item into
# type/colour, plotting_list keeps the full item name.
order_list, plotting_list = DataGenerator(n=30000)
# Serialise to a JSON string; order_list is a str (not a dict) from here on.
order_list = json.dumps(order_list)

In [4]:
def jsonParser(item_list):
    """Deserialise the JSON document ``item_list`` and return the Python object."""
    return json.loads(item_list)

In [5]:
def jsonToDataFrame(parsed_json):
    """Turn a dict of per-order dicts into a DataFrame with one row per outer key.

    The outer keys become the index and the inner keys become the columns.
    """
    # Building the frame puts the outer keys on the columns; transpose flips them onto the index.
    column_oriented = pd.DataFrame(parsed_json)
    return column_oriented.transpose()

In [6]:
def DFTrainPrep(df):
    """One-hot encode the ``type`` and ``colour`` columns of ``df``.

    A fresh encoder is fitted on every call and the encoded features are
    returned as a dense array.
    """
    features = df[["type","colour"]]
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(features)
    return encoded.toarray()

In [7]:
# Round-trip the JSON string back into a dict, then into a DataFrame with
# one row per order, and preview the first rows.
parsed_inp = jsonParser(order_list) 
df_inp= jsonToDataFrame(parsed_inp)
df_inp.head()

Unnamed: 0,type,colour,together_with
order1_ID,sweater,dark-gray,green_trousers
order2_ID,jeans,yellow,red_jeans
order3_ID,sweater,beige,white_tracksuit
order4_ID,jeans,white,beige_boots
order5_ID,tracksuit,orange,light-gray_boots


In [8]:
# Number of orders for every observed (type, colour, together_with) combination.
grp = (
    df_inp
    .groupby(["type", "colour", "together_with"])
    .size()
    .reset_index(name="count")
)

In [9]:
# Show the grouped counts (the display.max_rows option set in the import cell limits the preview to 10 rows).
grp

Unnamed: 0,type,colour,together_with,count
0,boots,beige,beige_sweater,68
1,boots,beige,black_jeans,1
2,boots,beige,blue_tracksuit,27
3,boots,beige,dark-gray_boots,4
4,boots,beige,dark-gray_shirt,1
...,...,...,...,...
1605,trousers,yellow,light-gray_t-shirt,15
1606,trousers,yellow,orange_tracksuit,207
1607,trousers,yellow,red_t-shirt,1
1608,trousers,yellow,white_shirt,3


In [10]:
# Serialise only while plotting_list is still a dict, so this cell can be
# re-run safely: dumping an existing JSON string a second time would make
# jsonParser return a str instead of a dict and break the DataFrame step.
if not isinstance(plotting_list, str):
    plotting_list = json.dumps(plotting_list)
# Parse the JSON back and build one row per order with the full item names.
parsed_plot = jsonParser(plotting_list)
df_plot = jsonToDataFrame(parsed_plot)
df_plot.head()

Unnamed: 0,item,together_with
order1_ID,dark-gray_sweater,green_trousers
order2_ID,yellow_jeans,red_jeans
order3_ID,beige_sweater,white_tracksuit
order4_ID,white_jeans,beige_boots
order5_ID,orange_tracksuit,light-gray_boots


In [11]:
# Number of orders for every observed (item, together_with) pair.
grp_plot = (
    df_plot
    .groupby(["item", "together_with"])
    .size()
    .reset_index(name="count")
)

In [12]:
# df_heatmap = grp_plot.pivot( "together_with","item", "count")
# plt.figure(figsize=(30,30))
# sns.heatmap(data=df_heatmap,annot=True)

In [13]:
# Target: the companion item, integer-encoded (le is kept so predictions can be mapped back to item names).
le = LabelEncoder()
y = le.fit_transform(df_inp["together_with"])
# Features: one-hot encoding of the first item's type and colour.
X = DFTrainPrep(df_inp)

## Logistic regression (LOGREG)

In [14]:
# model = LogisticRegression(multi_class='multinomial', solver='saga', penalty='none')
# model.fit(X,y)

In [15]:
# import operator
# for i in range(5,10):
#     example = i
#     pred = model.predict(X[example].reshape(1,-1))
#     prob = model.predict_proba(X[example].reshape(1,-1))
#     print(pred, "Prediction: ", le.inverse_transform(pred),"OG",y[example],le.inverse_transform([y[example]]))
#     print(list(zip(*heapq.nlargest(5, enumerate(prob[0]), key=operator.itemgetter(1)))))

In [16]:
# def evaluate_model(model, X, y):
# 	# define the evaluation procedure
# 	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# 	# evaluate the model
# 	scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# 	return scores

# models = LogisticRegression(multi_class='multinomial', solver='saga', penalty='none')
# scores = evaluate_model(models, X, y)
# print(f"mean (std) acc : {mean(scores)} ({std(scores)})")

## Naive Bayes (NB)

In [17]:
# def evaluate_model(model, X, y):
# 	# define the evaluation procedure
# 	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# 	# evaluate the model
# 	scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# 	return scores

# models = CategoricalNB()
# #evaluate the model and collect the scores
# scores = evaluate_model(model, X, y)
# scores

## Neural network (NN)

In [18]:
# import tensorflow as tf
# from tensorflow import keras
# from keras.utils import np_utils
# from keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold
# tf.keras.backend.clear_session()

# X_tensor = tf.convert_to_tensor(X)
# y_tensor = tf.convert_to_tensor(y)
# y_cat = np_utils.to_categorical(y)

In [19]:
# def baseline_model():
#     model_NN = keras.models.Sequential()
#     model_NN.add(keras.layers.Dense(10,input_dim=20,activation="relu"))
#     model_NN.add(keras.layers.Dense(256,activation="relu"))
#     model_NN.add(keras.layers.Dropout(0.1))
#     model_NN.add(keras.layers.Dense(256,activation="relu"))
#     model_NN.add(keras.layers.Dropout(0.1))
#     model_NN.add(keras.layers.Dense(99,activation="sigmoid"))
#     model_NN.summary()
#     model_NN.compile(optimizer='adam', 
#               loss="categorical_crossentropy",
#               metrics=['accuracy'])
#     return model_NN                                 

In [20]:
# estimator = KerasClassifier(build_fn=baseline_model, epochs=50, batch_size=5, verbose=1)


In [21]:
# estimator.fit(X,y_cat)

In [22]:
# pred = estimator.predict(X[:20])
# print(pred,
# y[:20])

## Estimate co-purchase probabilities from the observed counts

In [24]:
# Item x companion count matrix; pairs that were never observed become 0.
# pivot() is called with keyword arguments because index/columns/values are
# keyword-only from pandas 2.0 onwards (positional use raises TypeError).
df_heatmap = grp_plot.pivot(index="item", columns="together_with", values="count").fillna(0)
# Add one to every cell so that no pair ends up with zero estimated probability.
df_heatmap += 1 # non-zero probability

In [43]:
# Show the count matrix after adding one to every cell (rows: first item, columns: companion item).
df_heatmap

together_with,beige_boots,beige_jacket,beige_jeans,beige_shirt,beige_sweater,beige_t-shirt,beige_tracksuit,beige_trainers,beige_trousers,black_boots,...,white_trousers,yellow_boots,yellow_jacket,yellow_jeans,yellow_shirt,yellow_sweater,yellow_t-shirt,yellow_tracksuit,yellow_trainers,yellow_trousers
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
beige_boots,1.0,1.0,1.0,1.0,69.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,102.0,1.0,8.0,1.0,1.0
beige_jacket,1.0,1.0,1.0,1.0,25.0,1.0,94.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,14.0,21.0
beige_jeans,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.0,11.0,1.0,1.0,1.0,1.0,2.0
beige_shirt,1.0,52.0,1.0,1.0,1.0,1.0,17.0,1.0,1.0,1.0,...,30.0,1.0,1.0,1.0,1.0,114.0,1.0,1.0,1.0,2.0
beige_sweater,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,4.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow_sweater,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0
yellow_t-shirt,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0
yellow_tracksuit,1.0,1.0,2.0,1.0,1.0,1.0,76.0,1.0,1.0,1.0,...,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,2.0
yellow_trainers,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,13.0,27.0,...,1.0,28.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0


In [58]:
def GetCount(df,item_name,together):
    """Select the ``together`` column restricted to rows whose index equals ``item_name``.

    The result is a pandas selection (empty when no row matches), not a scalar.
    """
    companion_column = df[f"{together}"]
    row_matches = df.index == item_name
    return companion_column[row_matches]
def GetItemRow(df,item_name):
    """Return the rows of ``df`` whose index label equals ``item_name`` (empty frame if none)."""
    row_matches = df.index == item_name
    return df[row_matches]

In [29]:
# Item names that occur as the first item of at least one order (row labels of the count matrix).
inventory = list(df_heatmap.index)

Index(['beige_boots', 'beige_jacket', 'beige_jeans', 'beige_shirt',
       'beige_sweater', 'beige_t-shirt', 'beige_tracksuit', 'beige_trainers',
       'beige_trousers', 'black_boots', 'black_jacket', 'black_jeans',
       'black_shirt', 'black_sweater', 'black_t-shirt', 'black_tracksuit',
       'black_trainers', 'black_trousers', 'blue_boots', 'blue_jacket',
       'blue_jeans', 'blue_shirt', 'blue_sweater', 'blue_t-shirt',
       'blue_tracksuit', 'blue_trainers', 'blue_trousers', 'brown_boots',
       'brown_jacket', 'brown_jeans', 'brown_shirt', 'brown_sweater',
       'brown_t-shirt', 'brown_tracksuit', 'brown_trainers', 'brown_trousers',
       'dark-gray_boots', 'dark-gray_jacket', 'dark-gray_jeans',
       'dark-gray_shirt', 'dark-gray_sweater', 'dark-gray_t-shirt',
       'dark-gray_tracksuit', 'dark-gray_trainers', 'dark-gray_trousers',
       'green_boots', 'green_jacket', 'green_jeans', 'green_shirt',
       'green_sweater', 'green_t-shirt', 'green_tracksuit', 'green_trai

In [59]:
# Normalise every row by its own total so each row of the estimate sums to 1.
probability_table_estimate = df_heatmap.div(
    df_heatmap.sum(axis="columns"),
    axis="index",
)

together_with,beige_boots,beige_jacket,beige_jeans,beige_shirt,beige_sweater,beige_t-shirt,beige_tracksuit,beige_trainers,beige_trousers,black_boots,...,white_trousers,yellow_boots,yellow_jacket,yellow_jeans,yellow_shirt,yellow_sweater,yellow_t-shirt,yellow_tracksuit,yellow_trainers,yellow_trousers
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
beige_boots,0.002469,0.002469,0.002469,0.002469,0.170370,0.002469,0.002469,0.002469,0.002469,0.002469,...,0.002469,0.002469,0.002469,0.002469,0.002469,0.251852,0.002469,0.019753,0.002469,0.002469
beige_jacket,0.002398,0.002398,0.002398,0.002398,0.059952,0.002398,0.225420,0.002398,0.002398,0.002398,...,0.002398,0.002398,0.002398,0.002398,0.002398,0.009592,0.002398,0.002398,0.033573,0.050360
beige_jeans,0.004988,0.002494,0.002494,0.002494,0.002494,0.002494,0.002494,0.002494,0.002494,0.002494,...,0.002494,0.002494,0.002494,0.004988,0.027431,0.002494,0.002494,0.002494,0.002494,0.004988
beige_shirt,0.002532,0.131646,0.002532,0.002532,0.002532,0.002532,0.043038,0.002532,0.002532,0.002532,...,0.075949,0.002532,0.002532,0.002532,0.002532,0.288608,0.002532,0.002532,0.002532,0.005063
beige_sweater,0.002463,0.002463,0.002463,0.002463,0.002463,0.002463,0.002463,0.002463,0.002463,0.002463,...,0.002463,0.009852,0.002463,0.019704,0.002463,0.002463,0.002463,0.002463,0.002463,0.002463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow_sweater,0.002506,0.002506,0.002506,0.002506,0.002506,0.002506,0.002506,0.002506,0.002506,0.002506,...,0.002506,0.002506,0.002506,0.002506,0.015038,0.002506,0.002506,0.002506,0.002506,0.002506
yellow_t-shirt,0.002463,0.002463,0.002463,0.002463,0.002463,0.002463,0.002463,0.002463,0.019704,0.002463,...,0.002463,0.002463,0.002463,0.002463,0.002463,0.002463,0.012315,0.002463,0.002463,0.002463
yellow_tracksuit,0.002398,0.002398,0.004796,0.002398,0.002398,0.002398,0.182254,0.002398,0.002398,0.002398,...,0.002398,0.002398,0.002398,0.007194,0.002398,0.002398,0.002398,0.002398,0.002398,0.004796
yellow_trainers,0.002469,0.002469,0.002469,0.002469,0.004938,0.002469,0.002469,0.002469,0.032099,0.066667,...,0.002469,0.069136,0.002469,0.002469,0.004938,0.002469,0.002469,0.002469,0.002469,0.002469
