Segmentez des clients d'un site de e-commerce
==========================================

![logo](https://olist.com/wp-content/uploads/2018/04/Logo-01.png)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from src.visualization.visualize import group_analysis
# Apply seaborn's theme to all following plots, with the font scale set to 1.
sns.set(font_scale=1.)

In [None]:
# Load the processed dataset into `data`. The path is relative, so it is
# resolved against the directory the notebook is run from.
# NOTE(review): presumably one row per customer - confirm against the
# script that writes dataset.csv.
data = pd.read_csv('../data/processed/dataset.csv')

**Note**

Les résultats ci-dessous peuvent varier d'une exécution à l'autre : l'initialisation de KMeans est aléatoire et aucun `random_state` n'est fixé.

## Transformation log

In [None]:
# Replace each listed column, in place, by log(1 + x).
# NOTE(review): 'hobbies' is used as a clustering feature further down but is
# not part of this list - confirm that leaving it untransformed is intended.
col_to_log = ['monetary', 'clothing', 'high-tech', 'home', 'other']
for col in col_to_log:
    data[col] = np.log1p(data[col])

## Split manuel entre client régulier et client ponctuel

In [None]:
# Split the customers on their 'frequency' value: exactly 1 -> one-off
# ("ponctual"), strictly more than 1 -> regular ("loyal"). Copies are taken so
# that columns added later do not touch `data`.
freq = data['frequency']
ponctual = data.loc[freq == 1].copy()
loyal = data.loc[freq > 1].copy()

In [None]:
# Display the one-off customers subset as the cell output.
ponctual

In [None]:
# Fraction of all rows that belong to the one-off customers subset.
len(ponctual) / len(data)

DBSCAN
--------

### Données complètes

In [None]:
# Columns used as clustering inputs; 'frequency' is appended separately by the
# cells that need it.
features = [
    'recency',
    'monetary',
    'item_per_c',
    'clothing',
    'high-tech',
    'hobbies',
    'home',
    'other',
    'review_score',
]

In [None]:
# Feature matrix for the complete dataset (features plus 'frequency'),
# standardised column by column with StandardScaler.
scaler = StandardScaler()
X = scaler.fit_transform(data[features + ['frequency']].to_numpy())

In [None]:
# Run DBSCAN on the standardised matrix and keep the label of every row.
db = DBSCAN(eps=1, min_samples=100)
labels = db.fit_predict(X)
# Boolean mask, one entry per row, set to True for DBSCAN core samples.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [None]:
# Count the clusters found and the rows labelled -1 (counted here as noise);
# the -1 label is not counted as a cluster.
noise_mask = labels == -1
n_noise_ = int(np.count_nonzero(noise_mask))
n_clusters_ = len(np.unique(labels[~noise_mask]))
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

In [None]:
# Store the cluster label of each row in the 'group' column of `data`.
data.loc[:, 'group'] = labels

In [None]:
# group_analysis is a project helper (src.visualization.visualize).
# NOTE(review): presumably it profiles the rows per 'group' value - confirm.
group_analysis(data)

### Clients ponctuels

In [None]:
X = ponctual.loc[:, features].values
X = StandardScaler().fit_transform(X)

In [None]:
# Run DBSCAN on the standardised matrix and keep the label of every row.
db = DBSCAN(eps=1.0, min_samples=1000)
labels = db.fit_predict(X)
# Boolean mask, one entry per row, set to True for DBSCAN core samples.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [None]:
# Count the clusters found and the rows labelled -1 (counted here as noise);
# the -1 label is not counted as a cluster.
noise_mask = labels == -1
n_noise_ = int(np.count_nonzero(noise_mask))
n_clusters_ = len(np.unique(labels[~noise_mask]))
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

In [None]:
# Store the cluster label of each row in the 'group' column of `ponctual`.
ponctual.loc[:, 'group'] = labels

In [None]:
# group_analysis is a project helper (src.visualization.visualize).
# NOTE(review): presumably it profiles the rows per 'group' value - confirm.
group_analysis(ponctual)

In [None]:
%%time
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

### Clients fidèles

In [None]:
# Feature matrix for the regular customers (features plus 'frequency'),
# standardised with StandardScaler.
scaler = StandardScaler()
X = scaler.fit_transform(loyal[features + ['frequency']].to_numpy())

In [None]:
# Run DBSCAN on the standardised matrix and keep the label of every row.
db = DBSCAN(eps=0.6, min_samples=10)
labels = db.fit_predict(X)
# Boolean mask, one entry per row, set to True for DBSCAN core samples.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [None]:
# Count the clusters found and the rows labelled -1 (counted here as noise);
# the -1 label is not counted as a cluster.
noise_mask = labels == -1
n_noise_ = int(np.count_nonzero(noise_mask))
n_clusters_ = len(np.unique(labels[~noise_mask]))
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

In [None]:
# Store the cluster label of each row in the 'group' column of `loyal`.
loyal.loc[:, 'group'] = labels

In [None]:
# group_analysis is a project helper (src.visualization.visualize).
# NOTE(review): presumably it profiles the rows per 'group' value - confirm.
group_analysis(loyal)

In [None]:
%%time
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

## Kmeans

### Données complètes

In [None]:
# Feature matrix for the complete dataset, standardised with StandardScaler.
# 'frequency' is included so that KMeans sees the same inputs as the DBSCAN and
# agglomerative runs on the complete data; only the one-off subset, where
# 'frequency' is constant, leaves it out.
# NOTE(review): the cluster counts chosen further down were read from a curve
# computed without 'frequency' - re-check them after re-running.
X = data.loc[:, features + ['frequency']].values
X = StandardScaler().fit_transform(X)

In [None]:
# Fit KMeans for 2 to 10 clusters and record the silhouette score of each fit,
# then plot score against number of clusters.
cluster_counts = np.arange(2, 11)
scores = []
for n in tqdm(cluster_counts):
    kmeans = KMeans(n_clusters=n)
    labels = kmeans.fit_predict(X)
    scores.append(metrics.silhouette_score(X, labels))

plt.plot(cluster_counts, scores)

In [None]:
# KMeans with 7 clusters on the current X; labels go to data['group'] and the
# result is passed to the project helper group_analysis.
# NOTE(review): 7 is presumably read off the silhouette curve - confirm.
kmeans = KMeans(n_clusters=7)
labels = kmeans.fit_predict(X)
data.loc[:, 'group'] = labels
group_analysis(data)

In [None]:
# Same analysis with 9 clusters; this overwrites the 'group' column of `data`.
kmeans = KMeans(n_clusters=9)
labels = kmeans.fit_predict(X)
data.loc[:, 'group'] = labels
group_analysis(data)

In [None]:
%%time
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

### Segmentation des clients ponctuels

In [None]:
X = ponctual.loc[:, features].values
X = StandardScaler().fit_transform(X)

In [None]:
# Fit KMeans for 2 to 10 clusters and record the silhouette score of each fit,
# then plot score against number of clusters.
cluster_counts = np.arange(2, 11)
scores = []
for n in tqdm(cluster_counts):
    kmeans = KMeans(n_clusters=n)
    labels = kmeans.fit_predict(X)
    scores.append(metrics.silhouette_score(X, labels))

plt.plot(cluster_counts, scores)

In [None]:
# KMeans with 7 clusters on the current X.
# NOTE(review): 7 is presumably read off the silhouette curve - confirm.
kmeans = KMeans(n_clusters=7)
labels = kmeans.fit_predict(X)

In [None]:
# Store the cluster label of each row in the 'group' column of `ponctual`.
ponctual.loc[:, 'group'] = labels

In [None]:
# group_analysis is a project helper (src.visualization.visualize).
# NOTE(review): presumably it profiles the rows per 'group' value - confirm.
group_analysis(ponctual)

In [None]:
%%time
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

### Segmentation des clients réguliers

In [None]:
# Feature matrix for the regular customers (features plus 'frequency'),
# standardised with StandardScaler.
scaler = StandardScaler()
X = scaler.fit_transform(loyal[features + ['frequency']].to_numpy())

In [None]:
# Fit KMeans for 2 to 10 clusters and record the silhouette score of each fit,
# then plot score against number of clusters.
cluster_counts = np.arange(2, 11)
scores = []
for n in cluster_counts:
    kmeans = KMeans(n_clusters=n)
    labels = kmeans.fit_predict(X)
    scores.append(metrics.silhouette_score(X, labels))

plt.plot(cluster_counts, scores)

In [None]:
# KMeans with 8 clusters on the current X.
# NOTE(review): 8 is presumably read off the silhouette curve - confirm.
kmeans = KMeans(n_clusters=8)
labels = kmeans.fit_predict(X)

In [None]:
# Store the cluster label of each row in the 'group' column of `loyal`.
loyal.loc[:, 'group'] = labels

In [None]:
# group_analysis is a project helper (src.visualization.visualize).
# NOTE(review): presumably it profiles the rows per 'group' value - confirm.
group_analysis(loyal)

In [None]:
%%time
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

Agglomerative clustering
--------------------------------------

### Données complètes

In [None]:
# Feature matrix for the complete dataset (features plus 'frequency'),
# standardised with StandardScaler.
scaler = StandardScaler()
X = scaler.fit_transform(data[features + ['frequency']].to_numpy())

# Fit agglomerative clustering for 2 to 12 clusters, record the silhouette
# score of each fit, then plot score against number of clusters.
cluster_counts = np.arange(2, 13)
scores = []
for n in tqdm(cluster_counts):
    agg = AgglomerativeClustering(n_clusters=n)
    labels = agg.fit_predict(X)
    scores.append(metrics.silhouette_score(X, labels))

plt.plot(cluster_counts, scores)

In [None]:
# Agglomerative clustering with 11 clusters on the current X; labels go to
# data['group'] and the result is passed to the project helper group_analysis.
# NOTE(review): 11 is presumably read off the silhouette curve - confirm.
agg = AgglomerativeClustering(n_clusters=11)
labels = agg.fit_predict(X)
data.loc[:, 'group'] = labels
group_analysis(data)

In [None]:
%%time
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

### Segmentation clients ponctuels

In [None]:
X = ponctual.loc[:, features].values
X = StandardScaler().fit_transform(X)

scores = list()
for n in tqdm(np.arange(2, 13)):
    agg = AgglomerativeClustering(n_clusters=n).fit(X)
    labels = agg.labels_
    scores.append(metrics.silhouette_score(X, labels))

plt.plot(np.arange(2, 13), scores)

In [None]:
# Agglomerative clustering with 10 clusters, fitted on the current X.
# The bare `agg.fit(X)` is the last expression of the cell, so its return
# value is what the notebook displays.
agg = AgglomerativeClustering(n_clusters=10)
agg.fit(X)

In [None]:
# Store the fitted model's cluster label of each row in ponctual['group'].
ponctual.loc[:, 'group'] = agg.labels_

In [None]:
# group_analysis is a project helper (src.visualization.visualize).
# NOTE(review): presumably it profiles the rows per 'group' value - confirm.
group_analysis(ponctual)

In [None]:
%%time
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

### Clients réguliers

In [None]:
# Feature matrix for the regular customers (features plus 'frequency'),
# standardised with StandardScaler.
scaler = StandardScaler()
X = scaler.fit_transform(loyal[features + ['frequency']].to_numpy())

# Fit agglomerative clustering for 2 to 10 clusters, record the silhouette
# score of each fit, then plot score against number of clusters.
cluster_counts = np.arange(2, 11)
scores = []
for n in tqdm(cluster_counts):
    agg = AgglomerativeClustering(n_clusters=n)
    labels = agg.fit_predict(X)
    scores.append(metrics.silhouette_score(X, labels))

plt.plot(cluster_counts, scores)

In [None]:
# Agglomerative clustering with 2 clusters, fitted on the current X.
# The bare `agg.fit(X)` is the last expression of the cell, so its return
# value is what the notebook displays.
agg = AgglomerativeClustering(n_clusters=2)
agg.fit(X)

In [None]:
# Store the fitted model's cluster label of each row in loyal['group'].
loyal.loc[:, 'group'] = agg.labels_

In [None]:
# group_analysis is a project helper (src.visualization.visualize).
# NOTE(review): presumably it profiles the rows per 'group' value - confirm.
group_analysis(loyal)

In [None]:
%%time
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, agg.labels_))