Segmentez des clients d'un site de e-commerce
==========================================

![logo](https://olist.com/wp-content/uploads/2018/04/Logo-01.png)

In [None]:
import os
from pathlib import Path
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import widgets, interact, interact_manual
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from src.features import build_features
from src.visualization.visualize import group_analysis

# Switch matplotlib to seaborn's default styling for every plot drawn afterwards.
sns.set()

# Fixed seed meant to keep the clusters consistent from one execution to the next.
RANDOM_STATE = 100

In [None]:
# Resolve the raw-data directory to an absolute path, then let the project
# loader read it. The structure of `data` is defined by build_features.load_data.
raw_data_dir = Path('../data/raw').resolve()
data = build_features.load_data(raw_data_dir)

In [None]:
def make_dataset(data, date_start, date_end):
    """Build the per-customer feature table for orders in a date window.

    Parameters
    ----------
    data
        Object returned by ``build_features.load_data``; it is forwarded
        unchanged to every ``build_features`` helper.
    date_start, date_end
        Bounds of the order window, forwarded to
        ``build_features.get_orders_between_two_dates``.

    Returns
    -------
    pandas.DataFrame
        One row per retained customer. Only customers with ``frequency > 0``
        and a non-null ``recency`` are kept; remaining missing values are
        replaced by ``0.0``; ``recency`` is converted to a whole number of
        days; the former index is exposed as a regular column (named
        ``'index'`` when the index is unnamed, which is what the rest of the
        notebook relies on) and exact duplicate rows are dropped.
    """
    orders = build_features.get_orders_between_two_dates(data, date_start, date_end)

    # Each helper enriches the customer table with one family of features.
    customers = build_features.customer_table(data)
    customers = build_features.frequencies(customers, orders, data)
    customers = build_features.recencies(customers, orders, data)
    customers = build_features.monetary(customers, orders, data)
    customers = build_features.items_per_cart(customers, orders, data)
    customers = build_features.monetary_per_categ(customers, orders, data)
    customers = build_features.reviews(customers, data)

    # Keep only customers who ordered in the window and have a known recency.
    # The explicit copy makes the result an independent frame, so the
    # modifications below never write into a slice of the helper output
    # (this is what used to raise pandas' SettingWithCopyWarning).
    active = (customers['frequency'] > 0) & customers['recency'].notna()
    customers = customers[active].copy()

    customers = customers.fillna(0.0)
    # recency holds timedelta-like values; keep only the day count.
    customers['recency'] = customers['recency'].apply(lambda delta: delta.days)

    # Expose the customer identifier as a column before de-duplicating rows.
    customers = customers.reset_index(drop=False)
    customers = customers.drop_duplicates()
    return customers

In [None]:
# Interactive controls: bounds of the analysed period and number of clusters.
date_start = widgets.DatePicker(value=datetime(year=2017, month=1, day=1))
date_end = widgets.DatePicker(value=datetime(year=2018, month=1, day=1))
n_clusters = widgets.IntSlider(value=8, min=2, max=20)

@interact_manual()
def make_data(date_start=date_start, date_end=date_end, n_clusters=n_clusters):
    """Cluster the customers of the selected period and display the groups.

    Runs when the user presses the button created by ``interact_manual``.
    The arguments receive the current values of the three widgets above.

    Parameters
    ----------
    date_start, date_end
        Bounds of the order window; converted with ``pd.to_datetime`` before
        being handed to ``make_dataset``.
    n_clusters : int
        Number of K-Means clusters.
    """
    df = make_dataset(data, pd.to_datetime(date_start), pd.to_datetime(date_end))

    # log(1 + x) on the monetary amounts before standardisation.
    for col in ['monetary', 'clothing', 'food',
                'high-tech', 'home', 'other']:
        df[col] = np.log1p(df[col])

    # Every column except the customer identifier is used as a feature.
    X = df.drop('index', axis=1).values
    X = StandardScaler().fit_transform(X)

    # Seed the estimator so the same inputs always give the same groups;
    # without it the labels change on every button press.
    kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE).fit(X)
    df['group'] = kmeans.labels_
    group_analysis(df)

In [None]:
# Extensions of the reference period, expressed in days, used to measure
# how the segmentation evolves as more orders are included.
deltas = [timedelta(days=n_days) for n_days in (1, 7, 14, 30, 90, 180)]

In [None]:
# Stability study: fit a reference segmentation on one year of orders, then
# re-cluster on progressively longer periods (starting from the reference
# centres) and count how many reference customers end up in another group.

# Amount columns that are log(1 + x)-transformed before scaling.
log_columns = ['monetary', 'clothing', 'food',
               'high-tech', 'home', 'other']
n_groups = 9

# --- Reference segmentation -------------------------------------------------
date_start = datetime(2017, 1, 1)
date_end = datetime(2018, 1, 1)
df_ = make_dataset(data, date_start, date_end)

scaler = StandardScaler()

for col in log_columns:
    df_[col] = np.log1p(df_[col])
X = df_.drop('index', axis=1).values
X = scaler.fit_transform(X)
# Seeded so that the reference groups are reproducible between executions.
kmeans = KMeans(n_clusters=n_groups, random_state=RANDOM_STATE).fit(X)
df_['group'] = kmeans.labels_
group_analysis(df_)
# Index by customer identifier so both segmentations can be aligned later.
df_.set_index('index', inplace=True)
centers_ = kmeans.cluster_centers_

# --- Extended periods -------------------------------------------------------
for delta in deltas:
    date_end = datetime(2018, 1, 1) + delta
    print(f'Période allant du {date_start} au {date_end}')
    df = make_dataset(data, date_start, date_end)
    for col in log_columns:
        df[col] = np.log1p(df[col])
    X = df.drop('index', axis=1).values
    # NOTE(review): the scaler is re-fitted on every period, so the data are
    # not expressed in exactly the same space as `centers_`; confirm whether
    # `scaler.transform` (reference scaling) was intended instead.
    X = scaler.fit_transform(X)
    # Explicit centres make the run deterministic; n_init=1 states that a
    # single initialisation is wanted, which avoids the warning older
    # scikit-learn versions emit when `init` is an array and n_init > 1.
    kmeans = KMeans(n_clusters=n_groups, init=centers_, n_init=1).fit(X)
    df['group'] = kmeans.labels_
    group_analysis(df)
    df.set_index('index', inplace=True)

    new_customers = df.shape[0] - df_.shape[0]
    print(f'Nombre de nouveaux clients {new_customers}')

    # Compare, customer by customer, the new group with the reference one.
    unchanged = (df.loc[df_.index, 'group'] == df_['group']).sum()
    moved = df_.shape[0] - unchanged
    print(f'Nombre de clients ayant changé de groupe {moved}')