# Itemization Clustering
Cluster customers into groups based on what products they usually order.

In [1]:
# Third-party analysis stack used throughout the notebook.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone `joblib` package (a scikit-learn dependency) and only fall
# back to the vendored copy on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.cluster import KMeans

# Project-local helper that maps product names to product types.
from scripts import product_processing

import warnings
# NOTE(review): this silences *every* warning for the whole kernel, including
# deprecation and pandas chained-assignment warnings. Consider narrowing the filter.
warnings.filterwarnings("ignore")

# Show long text cells (e.g. product names) without truncation.
pd.set_option('display.max_colwidth', 400)

## Load Data

In [2]:
# Item-level table: the items contained in each order.
items_csv = "../data/items.csv"
df = pd.read_csv(items_csv)

# Order-level table: the orders placed by each customer.
orders_csv = "../data/order_churn_data.csv"
df_orders = pd.read_csv(orders_csv)

In [3]:
# Order-level attributes to attach to every item row.
order_columns = ["customer_db_id", "email", "order_id", "aov", "order_state",
                 "order_created_datetime", "frequency", "order_num"]
# No explicit key is given, so pandas joins on the column names both frames share.
df = df.merge(df_orders[order_columns])

# Keep completed orders only, to make sure the remaining orders are itemized.
is_completed = df.order_state == "completed"
df = df.loc[is_completed]

# Product Processing
The DB contains around 400 products, so we group them into product groups to reduce dimensionality. The grouping is based on text similarity.

### Load Data
Load itemization data from:
- https://docs.google.com/spreadsheets/d/1bWyhdLxkGqO6MsCuwc6aaj5ZIPTVXG14AJ_sOKh2-gQ/edit?usp=sharing

This sheet was created manually and contains *product_types* and *product_groups*. The unique product names are grouped into product types based on character similarity, while the product types are grouped into product_groups manually.

In [4]:
# Product catalogue: IDs and English names for all products.
df_products = pd.read_csv("../data/products.csv")

# Distinct product names, taken from the alphabetically sorted name column.
unique_names = df_products.product_name.sort_values().unique()
products = unique_names.tolist()

In [5]:
# Itemization sheet: maps each product type to a product group and a category.
df_itemization = pd.read_csv("../data/itemization.csv")

# Sort key of the form "<product_type_category>_<product_type>".
df_itemization['product_type_sort'] = (
    df_itemization['product_type_category'] + '_' + df_itemization['product_type']
)

# Distinct product types, in order of first appearance in the sheet.
product_types = df_itemization['product_type'].unique().tolist()

### Group products

In [6]:
# Assign every product name to one of the product types.
# NOTE(review): judging by how the result is consumed below (DataFrame.from_dict with
# orient='index', then columns named product_name / product_type), this presumably
# returns a dict {product_name: product_type} — confirm in scripts.product_processing.
products_grouped = product_processing.get_product_types(products, product_types)

In [7]:
# Turn the name -> type mapping into a two-column frame (keys become the first column).
name_to_type = pd.DataFrame.from_dict(products_grouped, orient='index')
name_to_type = name_to_type.reset_index()
name_to_type.columns = ['product_name', 'product_type']

# Attach the itemization attributes of each product type (inner join on product_type).
df_products = name_to_type.merge(df_itemization, on='product_type')
df_products.head()

Unnamed: 0,product_name,product_type,product_group,product_type_category,product_type_sort
0,#FINILALESSIVE Blouse,Blouse,BUS_Blouse,LAD,LAD_Blouse
1,#LAUNDRYLIBERATION SPECIAL Blouse,Blouse,BUS_Blouse,LAD,LAD_Blouse
2,#WÄSCHEREVOLUTION Blouse,Blouse,BUS_Blouse,LAD,LAD_Blouse
3,12 Blouse Bundle,Blouse,BUS_Blouse,LAD,LAD_Blouse
4,15 Blouse Bundle,Blouse,BUS_Blouse,LAD,LAD_Blouse


In [8]:
# Left join on product_name: every item row is kept, and rows whose product name
# has no entry in df_products get NaN in the product_* columns.
df = pd.merge(df, df_products, on='product_name', how='left')
df.head()

Unnamed: 0,product_id,order_id,order_date,quantity,price_per_unit,segmentation,category,product_name,customer_db_id,email,aov,order_state,order_created_datetime,frequency,order_num,product_type,product_group,product_type_category,product_type_sort
0,DE-PRO-A1270993,DE-X-76260,2016-12-15 02:43:30,3.0,15.0,DC,misc,Sneaker,57961b83d4cde81c22ffbe2b,anyafriesen@gmail.com,55.882353,completed,2016-07-25 16:00:36,0.0,1,Boots / Shoes / Sneaker,Accessories,ACC,ACC_Boots / Shoes / Sneaker
1,DE-PRO-B1255636,DE-X-76260,2016-12-15 02:43:30,4.0,4.5,WF,washnfold,Wash and Fold (per kg),57961b83d4cde81c22ffbe2b,anyafriesen@gmail.com,55.882353,completed,2016-07-25 16:00:36,0.0,1,Wash and Fold / Bag of Folded Laundry / Colour Separation,WashFold,MIX,MIX_Wash and Fold / Bag of Folded Laundry / Colour Separation
2,DE-PRO-D1330444,DE-X-76260,2016-12-15 02:43:30,1.0,3.5,HH,drycleaning,Pillow case,57961b83d4cde81c22ffbe2b,anyafriesen@gmail.com,55.882353,completed,2016-07-25 16:00:36,0.0,1,Pillow case,HH_pillow_case,HH,HH_Pillow case
3,DE-PRO-G4166715,DE-X-76260,2016-12-15 02:43:30,1.0,5.0,HH,drycleaning,Duvet cover,57961b83d4cde81c22ffbe2b,anyafriesen@gmail.com,55.882353,completed,2016-07-25 16:00:36,0.0,1,Duvet,HH_blanket,HH,HH_Duvet
4,DE-PRO-Y9074269,DE-X-76260,2016-12-15 02:43:30,1.0,5.0,HH,drycleaning,Bed sheet,57961b83d4cde81c22ffbe2b,anyafriesen@gmail.com,55.882353,completed,2016-07-25 16:00:36,0.0,1,Bedsheets,HH_linens,HH,HH_Bedsheets


# Cluster Customers Based on Product Groups
Each product_type belongs to a product_group. We want to find clusters of customers who order the same product_groups.

### Prepare Data

In [None]:
# Per (customer, product_group): total quantity ordered, number of distinct orders
# containing the group, and the minimum of the customer's frequency values.
per_group_stats = {'quantity': 'sum', 'order_id': 'nunique', 'frequency': 'min'}
df_cust = df.groupby(['customer_db_id', 'product_group']).agg(per_group_stats)

# Keep product_group as a regular column and index the rows by customer only.
df_cust = df_cust.reset_index().set_index('customer_db_id')

In [None]:
# Preview: one row per (customer, product_group) pair, indexed by customer_db_id.
df_cust.head()

In [None]:
# One indicator column per product_group; still one row per (customer, product_group).
group_flags = pd.get_dummies(df_cust['product_group'])

# Collapse to one row per customer: a flag is set if the customer ordered the group at all.
df_dumm = group_flags.reset_index().groupby(['customer_db_id']).max()
df_dumm.head()

### Train K-Means clustering

In [None]:
# Training is disabled: the notebook loads a previously fitted model from disk below.
# The original training code is kept for reference. It fits a 10-cluster K-Means on the
# customer x product_group indicator matrix.
# NOTE(review): no random_state is set, so re-training would not reproduce the same
# cluster numbering that the hard-coded cluster_names mapping relies on.
# X = df_dumm.values
# m_km = KMeans(n_clusters=10)
# m_km.fit(X)
# m_clusters = m_km.labels_.tolist()

In [None]:
# Load the previously fitted K-Means model instead of re-training it.
# To persist a freshly trained model: joblib.dump(m_km, 'data/models/best_items_clf.pkl')
# NOTE(security): joblib.load unpickles the file and can execute arbitrary code;
# only load model files from a trusted source.
m_km = joblib.load('data/models/best_items_clf.pkl')

# The model can only be applied if the current feature matrix has the same number of
# product-group columns as the one it was fitted on.
n_model_features = m_km.cluster_centers_.shape[1]
if df_dumm.shape[1] != n_model_features:
    raise ValueError(
        "df_dumm has %d product-group columns but the loaded model expects %d"
        % (df_dumm.shape[1], n_model_features)
    )

# Assign clusters to the *current* customers. `m_km.labels_` holds the assignments of
# the rows seen at training time, which only line up with df_dumm if the data (and its
# row order) is unchanged since then; predict() is correct either way.
m_clusters = m_km.predict(df_dumm.values.astype(float)).tolist()

In [None]:
# New frame with the indicator columns plus each customer's cluster id (df_dumm is left untouched).
df_clustered = df_dumm.assign(cluster=m_clusters)

In [None]:
# Human-readable labels for the clusters; the position in the list is the cluster id.
cluster_labels = [
    'Shirt+Trousers',
    'Shirt+Trousers+Suit+Blazer',
    'Shirt',
    'WashFold',
    'Suit+Shirt',
    'Jackets+Coats',
    'Suit',
    'Dress+Blouse+Skirt+Top',
    'Mix',
    'Household',
]
cluster_names = dict(enumerate(cluster_labels))

# Translate each customer's numeric cluster id into its label.
df_clustered['cluster_name'] = df_clustered['cluster'].map(cluster_names)

In [None]:
# Number of customers per cluster, smallest first. size() counts rows directly, so the
# result no longer depends on a particular product-group column ('Accessories') existing.
df_clustered.groupby('cluster_name').size().rename('n_customers').sort_values()

### Get average frequency per cluster

In [None]:
# Mean of the frequency column over all item rows of a customer, joined on the customer index.
mean_frequency = df.groupby('customer_db_id')['frequency'].mean()
df_clustered = df_clustered.join(mean_frequency)

In [None]:
# Only customers with a strictly positive frequency enter the per-cluster averages.
has_frequency = df_clustered.frequency > 0
frequency_by_cluster = df_clustered.loc[has_frequency].groupby('cluster_name')['frequency']

# Mean and median frequency per cluster, with cluster_name as a regular column.
df_avg_freq = frequency_by_cluster.agg(['mean', 'median']).reset_index()
df_avg_freq.columns = ['cluster_name', 'freq_mean', 'freq_median']
df_avg_freq

In [None]:
# Move customer_db_id from the index into a column, then attach the per-cluster
# frequency statistics to every customer of that cluster.
clustered_flat = df_clustered.reset_index()
df_clustered = clustered_flat.merge(df_avg_freq, on='cluster_name')

In [None]:
# Persist the clustered customers. After the preceding merge the index is just a running
# row number, so it is not written (it would reappear as an "Unnamed: 0" column on reload).
# NOTE(review): this path is relative to 'data/' while the input CSVs are read from
# '../data/' — confirm the intended working directory.
df_clustered.to_csv('data/clustered_customers.csv', index=False)

### Plot cluster centers and avg frequencies

In [None]:
# One row per cluster centre, one column per product group (same column order as df_dumm).
center_matrix = m_km.cluster_centers_
centers = pd.DataFrame(center_matrix, columns=df_dumm.columns)

# Replace the numeric cluster ids in the index by their readable names.
centers.index = centers.index.map(cluster_names)

In [None]:
# Heatmap of the cluster centres: rows are product groups, columns are clusters
# (sorted by name). The former `centers.head()` call was removed: it was not the
# last expression of the cell, so it never displayed anything.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(centers.sort_index().T, cmap='Blues', ax=ax)
ax.set_title('K-Means cluster centres per product group')
ax.set_xlabel('cluster')
ax.set_ylabel('product group');

In [None]:
# Apply the seaborn default style before the figure is created so that it affects
# the whole figure, not only the axes added afterwards.
sns.set()
fig, ax = plt.subplots(figsize=(16, 8))

# Only customers with 0 < frequency < 365 are plotted.
# NOTE(review): 365 presumably caps the frequency at one year (i.e. the unit is days) — confirm.
in_plot_range = (df_clustered.frequency > 0) & (df_clustered.frequency < 365)

# Use the public seaborn entry point rather than reaching into seaborn.categorical.
sns.boxplot(data=df_clustered.loc[in_plot_range],
            x='cluster_name', y='frequency',
            order=sorted(df_clustered.cluster_name.unique().tolist()),
            ax=ax)
ax.set_title('Order frequency per cluster')
plt.xticks(rotation=90);

# NPS

In [None]:
# Load the NPS survey export and keep only the respondent's email, the answer to the
# usage-frequency question, and the name of the source file.
df_nps = pd.read_csv("data/NPS.csv")
df_nps = df_nps[['email', 'How often do you use a dry cleaner and/or launderette?', 'file']].copy()
df_nps.columns = ['email', 'NPS_frequency', 'file']

# French and German answers -> English labels.
# The German answers appear twice: once as they look when UTF-8 text has been decoded
# with a single-byte codec (the original keys, which presumably match the current CSV),
# and once correctly encoded, so the mapping keeps working if the CSV encoding is fixed.
translations = {'Toutes les deux semaines': 'Bi-Weekly',
                'Chaque mois': 'Monthly',
                'Moins que chaque trimestre': 'Less than quarterly',
                'Chaque trimestre': 'Quarterly',
                'Chaque semaine': 'Weekly',
                'Weniger oft': 'Less than quarterly',
                'VierteljÃ¤hrlich': 'Quarterly',
                'ZweiwÃ¶chentlich': 'Bi-Weekly',
                'Monatlich': 'Monthly',
                'WÃ¶chentlich': 'Weekly',
                'Vierteljährlich': 'Quarterly',
                'Zweiwöchentlich': 'Bi-Weekly',
                'Wöchentlich': 'Weekly'}

# Numeric prefixes make the labels sort from most to least frequent.
sorting = {'Weekly': '0_Weekly',
           'Bi-Weekly': '1_Bi-Weekly',
           'Monthly': '2_Monthly',
           'Quarterly': '3_Quarterly',
           'Less than quarterly': '4_Less than quarterly'}

# Translate only the rows that come from the German/French files. Answers in those rows
# that are missing from `translations` become NaN.
needs_translation = df_nps.file.isin(['DE.csv', 'FR.csv'])
df_nps.loc[needs_translation, 'NPS_frequency'] = \
    df_nps.loc[needs_translation, 'NPS_frequency'].map(translations)

# Replace the English labels by their sortable form; anything else becomes NaN.
df_nps.NPS_frequency = df_nps.NPS_frequency.map(sorting)
df_nps = df_nps[['email', 'NPS_frequency']]

In [None]:
# df_clustered has no NPS_frequency column unless the survey answers were attached
# elsewhere; without this step the cell fails on a fresh kernel. The link is
# customer_db_id -> email (taken from df) -> survey answer (df_nps).
if 'NPS_frequency' not in df_clustered.columns:
    customer_emails = df[['customer_db_id', 'email']]\
        .dropna(subset=['email'])\
        .drop_duplicates('customer_db_id')
    # One answer per email (the first one found), so customers are not duplicated.
    nps_answers = df_nps[['email', 'NPS_frequency']]\
        .dropna(subset=['email'])\
        .drop_duplicates('email')
    nps_by_customer = customer_emails.merge(nps_answers, on='email')[['customer_db_id', 'NPS_frequency']]
    df_clustered = df_clustered.merge(nps_by_customer, on='customer_db_id', how='left')

# Number of distinct customers per (cluster, survey answer). Rows without an answer
# are dropped by groupby because their key is NaN.
df_nps_cluster = df_clustered.groupby(['cluster', 'NPS_frequency'])['customer_db_id'].nunique().reset_index()

# Number of distinct customers per cluster that gave any answer (the denominator).
df_nps_cluster_total = df_clustered.loc[~df_clustered.NPS_frequency.isnull()]\
                                    .groupby(['cluster'])['customer_db_id'].nunique()\
                                    .reset_index().rename(columns={'customer_db_id': 'cluster_total'})

# Share of each answer within its cluster.
df_nps_cluster = df_nps_cluster.merge(df_nps_cluster_total, how='left', on='cluster')
df_nps_cluster['cluster_freq'] = df_nps_cluster.customer_db_id / df_nps_cluster.cluster_total

In [None]:
# Wide layout for plotting: one row per survey answer, one column per cluster,
# cells hold the share of that answer within the cluster.
df_nps_cluster_pivot = df_nps_cluster.pivot(
    index='NPS_frequency',
    columns='cluster',
    values='cluster_freq',
)
sns.heatmap(df_nps_cluster_pivot, cmap='Greens')

In [None]:
# Disabled: optional re-export of the clustered customers without the index column.
# It targets the same file that is written earlier in the notebook.
# df_clustered.to_csv('data/clustered_customers.csv', index=False)

In [None]:
# Customer -> cluster id lookup taken from the clustered frame.
customer_clusters = df_clustered.reset_index()[['customer_db_id', 'cluster']]

# Tag every item row with the cluster of its customer; customers without a cluster get NaN.
df = pd.merge(df, customer_clusters, on='customer_db_id', how='left')

In [None]:
# Summary statistics of the aov column per cluster. The statistics are computed over
# item rows of df, so a value is counted once for every item row it appears on.
df.groupby('cluster')['aov'].describe()

In [None]:
# df has no NPS_frequency column unless the survey answers were attached elsewhere;
# without this step the cell fails on a fresh kernel. Join the answers on email,
# using one answer per email (the first one found) so item rows are not duplicated.
if 'NPS_frequency' not in df.columns:
    nps_answers = df_nps[['email', 'NPS_frequency']]\
        .dropna(subset=['email'])\
        .drop_duplicates('email')
    df = df.merge(nps_answers, on='email', how='left')

# Item rows of customers that answered the survey question.
df_answered = df.loc[~df.NPS_frequency.isnull()]

# Distinct customers per (survey answer, product group).
df_nps_products = df_answered\
                    .groupby(['NPS_frequency', 'product_group'])['customer_db_id']\
                    .nunique().reset_index()

# Distinct answering customers per product group (the denominator).
df_nps_products_total = df_answered\
                    .groupby(['product_group'])['customer_db_id']\
                    .nunique().reset_index()\
                    .rename(columns={'customer_db_id': 'product_group_total'})

# Share of each answer among the customers who ordered the product group.
df_nps_products = df_nps_products.merge(df_nps_products_total, on='product_group', how='left')
df_nps_products['customers_ratio'] = df_nps_products.customer_db_id / df_nps_products.product_group_total
df_nps_products.head()

In [None]:
# Wide layout for plotting: one row per product group, one column per survey answer.
# The cells hold the share of customers (customers_ratio) rather than the raw customer
# count. This matches the cluster heatmap, which also plots shares, and keeps product
# groups with many customers from dominating the colour scale.
df_nps_products_pivot = df_nps_products.pivot(index='product_group', columns='NPS_frequency', values='customers_ratio')

In [None]:
# Heatmap of product groups (rows) against survey answers (columns), drawn on a square
# figure; every product group gets its own tick label.
fig, ax = plt.subplots(figsize=(8, 8))
group_labels = df_nps_products_pivot.index
sns.heatmap(df_nps_products_pivot, cmap='Greens', yticklabels=group_labels, ax=ax)