#### Author: Rameez

Instacart dataset: Instacart product file - 49,688 unique products, which include non-food items too

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils import check_random_state
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
import missingno as msno
import random
from mpl_toolkits.mplot3d import Axes3D
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
import math
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import silhouette_score
import copy
import seaborn as sns

#### Load data

In [None]:
# Read the customer-level aggregate CSV; its first column is used as the row index.
csv_path = 'D:/Downloads/Cust_agg_with_nut_score_trend_Aug_31.csv'
data = pd.read_csv(csv_path, index_col=0)

In [None]:
# Let pandas print up to 200 rows before truncating output.
pd.options.display.max_rows = 200
# True if at least one cell anywhere in the table is missing.
data.isna().to_numpy().any()

In [None]:
# Peek at the first three rows of the loaded table.
data.iloc[:3]

#### Train/Test Split

In [None]:
# Feature columns fed to the clustering models in the sections that follow.
cols_for_clustering = [
    "days_since_prior_order_median",
    "order_number",
    "total_number_of_items_bought",
    "%_of_repetition",
    "Q1_scr_list",
    "Q2_scr_list",
    "Q3_scr_list",
    "Q4_scr_list",
    "nut_score_basket_median",
    "price_basket_sum",
    "price_basket_median",
]


In [None]:
# Shuffle every row, then cut the shuffled frame 70 % / 30 % at one index.
# random_state=2020 makes the shuffle reproducible; it mirrors the seed passed
# to the GaussianMixture fits in this file.
df = data.sample(frac=1, random_state=2020).reset_index(drop=True)

# Use the same boundary for both slices so that no row is lost: slicing the test
# part from `split_at + 1` would silently drop the row sitting at `split_at`.
split_at = round(df.shape[0] * 0.7)
x_train = df[:split_at]
x_test = df[split_at:]
print(x_train.shape, x_test.shape)

In [None]:
# Keep only the clustering features in each partition.
x_train_gmm = x_train[cols_for_clustering]
x_test_gmm = x_test[cols_for_clustering]

In [None]:
# Display the training feature subset as the cell output.
x_train_gmm

#### Gaussian Mixture Model with StandardScaler

In [None]:
# Fit the StandardScaler on the training features only, then apply the same
# fitted transform to both partitions.
# (A MinMaxScaler variant is tried in a separate section of this file.)
SS = StandardScaler()
SS.fit(x_train_gmm)
train_gmm = SS.transform(x_train_gmm)
test_gmm = SS.transform(x_test_gmm)

In [None]:
# BIC sweep on the standardised training data: fit one GMM per candidate
# component count (2..9, 40 restarts each) and record its BIC.
bic = range(2, 10)
gmm_bic = []
for b in bic:
    model = GaussianMixture(n_components=b, n_init=40, random_state=2020)
    model.fit(train_gmm)
    gmm_bic.append(model.bic(train_gmm))

plt.plot(bic, gmm_bic, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('bic')
# Calling plt.xticks() without arguments only *reads* the current ticks; pass
# the candidate counts so every evaluated k gets its own tick.
plt.xticks(list(bic))
plt.show()

In [None]:
# Display the raw BIC values collected by the sweep above.
gmm_bic

In [None]:
# AIC sweep on the standardised training data: fit one GMM per candidate
# component count (2..9, 40 restarts each) and record its AIC.
aic = range(2, 10)
gmm_aic = []
for a in aic:
    model = GaussianMixture(n_components=a, n_init=40, random_state=2020)
    model.fit(train_gmm)
    gmm_aic.append(model.aic(train_gmm))

plt.plot(aic, gmm_aic, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('aic')
# Calling plt.xticks() without arguments only *reads* the current ticks; pass
# the candidate counts so every evaluated k gets its own tick.
plt.xticks(list(aic))
plt.show()

In [None]:
# Display the raw AIC values collected by the sweep above.
gmm_aic

In [None]:
# Silhouette sweep on the standardised training data: for each candidate
# component count, fit a GMM, take its hard cluster labels and score them.
slht = range(2, 10)

for s in slht:
    # n_init=40 matches the other GaussianMixture fits in this file, so this
    # criterion is computed with the same number of restarts as the BIC/AIC ones.
    model_train_n = GaussianMixture(n_components=s, n_init=40, random_state=2020)
    labels_train_n = model_train_n.fit_predict(train_gmm)
    sht_score_n = silhouette_score(train_gmm, labels_train_n)
    print("when n =", s, "sil_score is =", sht_score_n)

#### Gaussian Mixture Model with MinMaxScaler

In [None]:
# Fit the MinMaxScaler on the training features only, then apply the same
# fitted transform to both partitions.
MM = MinMaxScaler()
MM.fit(x_train_gmm)
mm_train_gmm = MM.transform(x_train_gmm)
mm_test_gmm = MM.transform(x_test_gmm)

In [None]:
# BIC sweep on the MinMax-scaled training data: fit one GMM per candidate
# component count (2..9, 40 restarts each) and record its BIC.
bic = range(2, 10)
gmm_bic = []
for b in bic:
    model = GaussianMixture(n_components=b, n_init=40, random_state=2020)
    model.fit(mm_train_gmm)
    gmm_bic.append(model.bic(mm_train_gmm))

plt.plot(bic, gmm_bic, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('bic')
# Calling plt.xticks() without arguments only *reads* the current ticks; pass
# the candidate counts so every evaluated k gets its own tick.
plt.xticks(list(bic))
plt.show()

In [None]:
# Display the raw BIC values collected by the sweep above.
gmm_bic

In [None]:
# AIC sweep on the MinMax-scaled training data: fit one GMM per candidate
# component count (2..9, 40 restarts each) and record its AIC.
aic = range(2, 10)
gmm_aic = []
for a in aic:
    model = GaussianMixture(n_components=a, n_init=40, random_state=2020)
    model.fit(mm_train_gmm)
    gmm_aic.append(model.aic(mm_train_gmm))

plt.plot(aic, gmm_aic, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('aic')
# Calling plt.xticks() without arguments only *reads* the current ticks; pass
# the candidate counts so every evaluated k gets its own tick.
plt.xticks(list(aic))
plt.show()

In [None]:
# Display the raw AIC values collected by the sweep above.
gmm_aic

In [None]:
# Silhouette sweep on the MinMax-scaled training data: for each candidate
# component count, fit a GMM, take its hard cluster labels and score them.
slht = range(2, 10)
gmm_settings = {"n_init": 40, "random_state": 2020}

for s in slht:
    model_train_n = GaussianMixture(n_components=s, **gmm_settings)
    labels_train_n = model_train_n.fit_predict(mm_train_gmm)
    sht_score_n = silhouette_score(mm_train_gmm, labels_train_n)
    print("when n =", s, "sil_score is =", sht_score_n)

In [None]:
# Chosen model for this feature set: 5 mixture components, fitted on the
# MinMax-scaled training data; also keep the predicted component per row.
model_gmm = GaussianMixture(
    n_components=5,
    n_init=40,
    random_state=2020,
)
labels_train_gmm = model_gmm.fit_predict(mm_train_gmm)

In [None]:
# Number of training rows assigned to each cluster label.
label_train_series = pd.Series(labels_train_gmm).value_counts()
# Turn the counts into a two-column table: cluster label and its row count.
count_table = label_train_series.rename_axis("label").reset_index(name="count")
count_table.plot.bar(x='label', y='count', rot=0)

In [None]:
# One row per mixture component, one column per clustering feature.
# The model was fitted on mm_train_gmm, so these means are in the scaled space.
pd.DataFrame(data=model_gmm.means_, columns=cols_for_clustering)

#### Plot some features to look at homogeneity

In [None]:
import plotly.express as px

# Scaled training features as a labelled frame, plus each row's cluster
# assignment, so the 3D scatter plots can be coloured by cluster.
train_3D_plot = pd.DataFrame(mm_train_gmm, columns=cols_for_clustering).assign(
    customer_type=labels_train_gmm
)

fig_train = px.scatter_3d(
    data_frame=train_3D_plot,
    x='nut_score_basket_median',
    y='order_number',
    z='price_basket_median',
    color='customer_type',
)
fig_train.show()

In [None]:
# 3D scatter coloured by cluster: nutrition score vs. items bought vs. median basket price.
fig_train = px.scatter_3d(
    data_frame=train_3D_plot,
    x='nut_score_basket_median',
    y='total_number_of_items_bought',
    z='price_basket_median',
    color='customer_type',
)
fig_train.show()

In [None]:
# 3D scatter coloured by cluster: items bought vs. repetition share vs. basket price sum.
fig_train = px.scatter_3d(
    data_frame=train_3D_plot,
    x='total_number_of_items_bought',
    y='%_of_repetition',
    z='price_basket_sum',
    color='customer_type',
)
fig_train.show()

#### Look at the distribution of each feature to see which ones are approximately Gaussian

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution plot of the raw (unscaled) feature over the full data set.
# NOTE(review): seaborn deprecated distplot in 0.11 - consider histplot/displot
# once the installed seaborn version is confirmed.
sns.distplot(a=data['days_since_prior_order_median'])

In [None]:
# Distribution plot of the raw (unscaled) feature over the full data set.
sns.distplot(a=data['total_number_of_items_bought'])

In [None]:
# Distribution plot of the raw (unscaled) feature over the full data set.
sns.distplot(a=data['%_of_repetition'])

In [None]:
# Distribution plot of the raw (unscaled) feature over the full data set.
sns.distplot(a=data['nut_score_basket_median'])

In [None]:
# Distribution plot of the raw (unscaled) feature over the full data set.
sns.distplot(a=data['price_basket_sum'])

In [None]:
# Distribution plot of the raw (unscaled) feature over the full data set.
sns.distplot(a=data['price_basket_median'])

In [None]:
# Distribution plot of the raw (unscaled) feature over the full data set.
sns.distplot(a=data['Q1_scr_list'])

In [None]:
# Distribution plot of the raw (unscaled) feature over the full data set.
sns.distplot(a=data['Q2_scr_list'])

In [None]:
# Distribution plot of the raw (unscaled) feature over the full data set.
sns.distplot(a=data['Q3_scr_list'])

In [None]:
# Distribution plot of the raw (unscaled) feature over the full data set.
sns.distplot(a=data['Q4_scr_list'])

#### GMM on Nutrition Data

In [None]:
# Restrict clustering to the quarterly score columns and the basket nutrition score.
cols_for_clustering = [
    "Q1_scr_list",
    "Q2_scr_list",
    "Q3_scr_list",
    "Q4_scr_list",
    "nut_score_basket_median",
]


In [None]:
# Shuffle every row, then cut the shuffled frame 70 % / 30 % at one index.
# random_state=2020 makes the shuffle reproducible; it mirrors the seed passed
# to the GaussianMixture fits in this file.
df = data.sample(frac=1, random_state=2020).reset_index(drop=True)

# Use the same boundary for both slices so that no row is lost: slicing the test
# part from `split_at + 1` would silently drop the row sitting at `split_at`.
split_at = round(df.shape[0] * 0.7)
x_train = df[:split_at]
x_test = df[split_at:]
print(x_train.shape, x_test.shape)

In [None]:
# Keep only the clustering features in each partition.
x_train_gmm = x_train[cols_for_clustering]
x_test_gmm = x_test[cols_for_clustering]

In [None]:
# Display the training feature subset as the cell output.
x_train_gmm

In [None]:
# Fit the MinMaxScaler on the training features only, then apply the same
# fitted transform to both partitions.
MM = MinMaxScaler()
MM.fit(x_train_gmm)
mm_train_gmm = MM.transform(x_train_gmm)
mm_test_gmm = MM.transform(x_test_gmm)

In [None]:
# BIC sweep on the MinMax-scaled training data: fit one GMM per candidate
# component count (2..5, 40 restarts each) and record its BIC.
bic = range(2, 6)
gmm_bic = []
for b in bic:
    model = GaussianMixture(n_components=b, n_init=40, random_state=2020)
    model.fit(mm_train_gmm)
    gmm_bic.append(model.bic(mm_train_gmm))

plt.plot(bic, gmm_bic, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('bic')
# Calling plt.xticks() without arguments only *reads* the current ticks; pass
# the candidate counts so every evaluated k gets its own tick.
plt.xticks(list(bic))
plt.show()

In [None]:
# Display the raw BIC values collected by the sweep above.
gmm_bic

In [None]:
# AIC sweep on the MinMax-scaled training data: fit one GMM per candidate
# component count (2..5, 40 restarts each) and record its AIC.
aic = range(2, 6)
gmm_aic = []
for a in aic:
    model = GaussianMixture(n_components=a, n_init=40, random_state=2020)
    model.fit(mm_train_gmm)
    gmm_aic.append(model.aic(mm_train_gmm))

plt.plot(aic, gmm_aic, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('aic')
# Calling plt.xticks() without arguments only *reads* the current ticks; pass
# the candidate counts so every evaluated k gets its own tick.
plt.xticks(list(aic))
plt.show()

In [None]:
# Display the raw AIC values collected by the sweep above.
gmm_aic

In [None]:
# Silhouette sweep on the MinMax-scaled training data: for each candidate
# component count, fit a GMM, take its hard cluster labels and score them.
slht = range(2, 6)
gmm_settings = {"n_init": 40, "random_state": 2020}

for s in slht:
    model_train_n = GaussianMixture(n_components=s, **gmm_settings)
    labels_train_n = model_train_n.fit_predict(mm_train_gmm)
    sht_score_n = silhouette_score(mm_train_gmm, labels_train_n)
    print("when n =", s, "sil_score is =", sht_score_n)

In [None]:
# Candidate model with 3 mixture components, fitted on the MinMax-scaled
# training data; also keep the predicted component per row.
model_gmm = GaussianMixture(
    n_components=3,
    n_init=40,
    random_state=2020,
)
labels_train_gmm = model_gmm.fit_predict(mm_train_gmm)

In [None]:
# Number of training rows assigned to each cluster label.
label_train_series = pd.Series(labels_train_gmm).value_counts()
# Turn the counts into a two-column table: cluster label and its row count.
count_table = label_train_series.rename_axis("label").reset_index(name="count")
count_table.plot.bar(x='label', y='count', rot=0)

In [None]:
# One row per mixture component, one column per clustering feature.
# The model was fitted on mm_train_gmm, so these means are in the scaled space.
pd.DataFrame(data=model_gmm.means_, columns=cols_for_clustering)

In [None]:
# Candidate model with 2 mixture components, fitted on the MinMax-scaled
# training data; also keep the predicted component per row.
model_gmm = GaussianMixture(
    n_components=2,
    n_init=40,
    random_state=2020,
)
labels_train_gmm = model_gmm.fit_predict(mm_train_gmm)

In [None]:
# Number of training rows assigned to each cluster label.
label_train_series = pd.Series(labels_train_gmm).value_counts()
# Turn the counts into a two-column table: cluster label and its row count.
count_table = label_train_series.rename_axis("label").reset_index(name="count")
count_table.plot.bar(x='label', y='count', rot=0)

In [None]:
# One row per mixture component, one column per clustering feature.
# The model was fitted on mm_train_gmm, so these means are in the scaled space.
pd.DataFrame(data=model_gmm.means_, columns=cols_for_clustering)