Segmentez des clients d'un site d'e-commerce
==========================================

![logo](https://olist.com/wp-content/uploads/2018/04/Logo-01.png)

In [None]:
import os
from pathlib import Path
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import widgets, interact, interact_manual
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from tqdm.notebook import tqdm
import plotly.express as px

from pandas.plotting import register_matplotlib_converters

from src.features.build_features import load_data, make_dataset
from src.visualization.visualize import group_analysis

# Apply seaborn's theme with fonts scaled by 1.5 for the figures drawn below.
sns.set(font_scale=1.5)
# Register pandas' date/time converters with matplotlib (presumably so the
# datetime-based axes plotted later in this notebook render correctly).
register_matplotlib_converters()

## Chargement des données

In [None]:
# Turn the raw-data directory (given relative to the current working
# directory) into an absolute path before handing it to the project loader.
raw_data_dir = Path('../data/raw').resolve()
data = load_data(raw_data_dir)

### Groupes

In [None]:
# Interactive controls, used as the default arguments of `make_data`:
# two date pickers bounding the period and a slider for the cluster count.
date_start = widgets.DatePicker(value=datetime(2017, 1, 1))
date_end = widgets.DatePicker(value=datetime(2018, 1, 1))
n_clusters = widgets.IntSlider(min=2, max=20, value=8)


@interact_manual()
def make_data(date_start=date_start, date_end=date_end, n_clusters=n_clusters):
    """Cluster the customers of the selected period and display the groups.

    The dataset is built with ``make_dataset`` for the chosen dates, clustered
    with K-means on log-transformed, standardized features, and the resulting
    labels are stored in a ``group`` column before calling ``group_analysis``.

    Parameters
    ----------
    date_start, date_end
        Period bounds; converted with ``pd.to_datetime`` before being passed
        to ``make_dataset``.
    n_clusters : int
        Number of K-means clusters.
    """
    df = make_dataset(data, pd.to_datetime(date_start),
                      pd.to_datetime(date_end))
    log_cols = ['monetary', 'clothing', 'high-tech', 'home', 'other']
    # Transform a copy of the features only: `df` keeps its original units so
    # that `group_analysis` receives the same scale as in the stability cell
    # further down, which undoes the log transform before analysing.
    features = df.drop('index', axis=1)
    features[log_cols] = features[log_cols].apply(np.log1p)
    X = StandardScaler().fit_transform(features.values)
    kmeans = KMeans(n_clusters=n_clusters).fit(X)
    df.loc[:, 'group'] = kmeans.labels_
    group_analysis(df)

## Comment se comportent les groupes avec l'ajout de données ?

In [None]:
# Extra time spans appended to the reference period, from 1 day up to 180 days.
deltas = [timedelta(days=n_days) for n_days in (1, 7, 14, 30, 90, 180)]

In [None]:
sns.set(font_scale=1)

# Columns that are log-transformed (log1p) before standardization.
col_to_log = ['monetary', 'clothing',
              'high-tech', 'home', 'other']

scaler = StandardScaler()


def _fit_groups(frame, **kmeans_kwargs):
    """Cluster `frame` into 9 groups and store the labels in `frame['group']`.

    The features (every column except 'index') are log-transformed and
    standardized on a copy, so `frame` itself keeps its original values and
    no log1p/expm1 round trip is needed afterwards.

    Extra keyword arguments are forwarded to ``KMeans`` (e.g. ``init``).
    Returns the standardized feature matrix and the fitted ``KMeans``.
    """
    features = frame.drop('index', axis=1)
    features[col_to_log] = features[col_to_log].apply(np.log1p)
    # NOTE(review): the shared scaler is re-fitted on every call, so warm-start
    # centroids passed via `init` come from a slightly different standardized
    # space than the data they are applied to -- confirm this is intended.
    scaled = scaler.fit_transform(features.values)
    model = KMeans(n_clusters=9, **kmeans_kwargs).fit(scaled)
    frame.loc[:, 'group'] = model.labels_
    return scaled, model


# Reference clustering on the base period.
date_start = datetime(2017, 1, 1)
date_end = datetime(2018, 1, 1)
df_ = make_dataset(data, date_start, date_end)
X, kmeans = _fit_groups(df_)
group_analysis(df_)
df_.set_index('index', inplace=True)
# Reuse centroids from previous clustering
centers_ = kmeans.cluster_centers_
print(f'Nombre de clients : {df_.shape[0]}')

# Re-cluster on progressively longer periods, starting from the reference
# centroids so that label i keeps designating the same group.
for delta in deltas:
    date_end = datetime(2018, 1, 1) + delta
    print(f'Période allant du {date_start} au {date_end}')
    df = make_dataset(data, date_start, date_end)
    X, kmeans = _fit_groups(df, init=centers_, n_init=1)
    group_analysis(df)
    df.set_index('index', inplace=True)
    print(f'Nombre de nouveaux clients {df.shape[0] - df_.shape[0]}')
    # Customers of the reference period whose label differs in the new fit.
    moved = (df.loc[df_.index, 'group'] != df_['group']).sum()
    print(f'Nombre de clients ayant changé de groupe {moved}')

In [None]:
def score_on_periods(n_group=2, n_jobs=8, random_state=None):
    """Compute the K-means silhouette score on each extended period.

    For every entry of the module-level ``deltas``, the dataset covering
    2017-01-01 to 2018-01-01 + delta is built with ``make_dataset``,
    log-transformed, standardized, clustered, and scored.

    Parameters
    ----------
    n_group : int
        Number of K-means clusters.
    n_jobs : int
        Forwarded to ``silhouette_score`` (defaults to the previous
        hard-coded value of 8).
    random_state : int or None
        Forwarded to ``KMeans``; pass an int for reproducible scores. The
        default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    list of float
        One silhouette score per entry of ``deltas``, in the same order.
    """
    log_cols = ['monetary', 'clothing', 'high-tech', 'home', 'other']
    date_start = datetime(2017, 1, 1)
    scores = []
    for delta in tqdm(deltas):
        date_end = datetime(2018, 1, 1) + delta
        print(f'Période allant du {date_start} au {date_end}')
        df = make_dataset(data, date_start, date_end)
        df[log_cols] = df[log_cols].apply(np.log1p)
        X = StandardScaler().fit_transform(df.drop('index', axis=1).values)
        kmeans = KMeans(n_clusters=n_group, random_state=random_state).fit(X)
        scores.append(silhouette_score(X, kmeans.labels_, n_jobs=n_jobs))
    return scores

In [None]:
# Silhouette scores per period for 7, 8 and 9 groups, keyed by "<k> groups".
scoring_results = {
    f'{n} groups': score_on_periods(n_group=n)
    for n in tqdm(range(7, 10))
}

In [None]:
fig, ax = plt.subplots(1, figsize=(12, 8))
# The x positions are the same for every curve, so build them once.
# NOTE(review): score_on_periods ends each period at 2018-01-01 + delta,
# whereas these positions are offset from 2017-01-01 -- confirm which dates
# the axis is meant to show.
x = [datetime(2017, 1, 1) + delta for delta in deltas]
for key, val in scoring_results.items():
    ax.plot(x, val, label=key)
# Draw the legend once, after all curves exist (and only if there are any).
if scoring_results:
    ax.legend()