Segmentez des clients d'un site de e-commerce
=============================================

![logo](https://olist.com/wp-content/uploads/2018/04/Logo-01.png)

In [None]:
import os
from pathlib import Path
from datetime import datetime

import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import widgets, interact
from IPython.display import display

from src.features import build_features
from src.visualization.visualize import piechart, distplot, barplot

# Apply seaborn's theme with all font sizes scaled by 1.5 for the plots below.
sns.set(font_scale=1.5)

In [None]:
# Resolve the raw-data folder to an absolute path before handing it to the loader.
raw_data_dir = Path('../data/raw').resolve()
data = build_features.load_data(raw_data_dir)

In [None]:
def make_dataset(data, date_start, date_end):
    """Build the customer feature table from orders in a date window.

    The orders are selected with
    ``build_features.get_orders_between_two_dates`` and every feature builder
    of ``build_features`` is then applied in turn to the customer table.
    The table is finally cleaned:

    * rows with ``frequency <= 0`` or a missing ``recency`` are dropped;
    * remaining missing values are replaced by ``0.0``;
    * ``recency`` is replaced by its ``.days`` attribute;
    * the index is moved into the leading column(s) and duplicated rows are
      dropped;
    * rows with ``monetary <= 0`` are dropped.

    Args:
        data: Raw data as returned by ``build_features.load_data``.
        date_start: Start of the order window, forwarded unchanged to
            ``get_orders_between_two_dates`` (the notebook passes
            ``'YYYY-MM-DD'`` strings; bound inclusivity is decided by that
            helper).
        date_end: End of the order window, forwarded the same way.

    Returns:
        A new, independent ``DataFrame`` of customer features. Its index is
        whatever survives the final ``monetary`` filter (it is not
        renumbered after that filter).
    """
    orders = build_features.get_orders_between_two_dates(data, date_start, date_end)

    customers = build_features.customer_table(data)
    customers = build_features.frequencies(customers, orders, data)
    customers = build_features.recencies(customers, orders, data)
    customers = build_features.monetary(customers, orders, data)
    customers = build_features.items_per_cart(customers, orders, data)
    customers = build_features.monetary_per_categ(customers, orders, data)
    customers = build_features.reviews(customers, data)

    customers = customers[customers['frequency'] > 0]
    # Take an explicit copy: the statements below mutate the frame, and
    # mutating a boolean-filtered selection makes pandas emit
    # SettingWithCopyWarning.
    customers = customers[customers['recency'].notna()].copy()

    # Kept in place (on the owned copy) so dtype handling of untouched
    # columns stays exactly what it was.
    customers.fillna(0.0, inplace=True)
    customers['recency'] = customers['recency'].apply(lambda delta: delta.days)

    # drop=False keeps the former index as the leading column(s).
    customers = customers.reset_index(drop=False).drop_duplicates()

    # Copy again so callers receive a frame they can modify freely.
    return customers[customers['monetary'] > 0].copy()

In [None]:
# First pass over a very wide window (1970 to 2020), presumably meant to
# cover every order in the dataset for the exploratory plots below.
customers = make_dataset(
    data,
    date_start='1970-01-01',
    date_end='2020-01-01',
)

In [None]:
# Summary statistics of the feature table, rendered as the cell output.
customers.describe()

In [None]:
# Share of customers whose frequency is exactly 1 ('unique') versus any
# other value ('régulier').
order_profile = customers['frequency'].apply(
    lambda freq: 'unique' if freq == 1 else 'régulier'
)
piechart(order_profile)

In [None]:
# Share of customers whose 'item_per_c' value is exactly 1 versus anything else.
# NOTE(review): the two labels have different types (int 1 and str '1+');
# confirm piechart copes with mixed label types.
cart_profile = customers['item_per_c'].apply(
    lambda items: 1 if items == 1 else '1+'
)
piechart(cart_profile)

In [None]:
# Distribution of the recency feature (converted to days by make_dataset).
recency = customers['recency']
distplot(recency)
plt.show()

In [None]:
# Distribution of the monetary feature.
monetary = customers['monetary']
distplot(monetary)
plt.show()

In [None]:
# Histogram of the review scores, drawn on the axes created just above.
# NOTE(review): range=(0, 5) with bins=5 gives the edges 0, 1, 2, 3, 4, 5 and
# the last bin is closed on both sides, so values of exactly 4 and exactly 5
# share one bin. Confirm this is intended if the scores are integers.
_, ax = plt.subplots(1, figsize=(8, 6))
review_scores = customers['review_score']
ax.hist(review_scores, range=(0, 5), bins=5)
plt.show()

In [None]:
# Names of the per-category columns selected from `customers` in the cells
# below (presumably produced by build_features.monetary_per_categ).
categs = [
    'clothing',
    'food',
    'hobbies',
    'high-tech',
    'home',
    'other',
]

In [None]:
# For each category, count the customers with a non-zero value in that
# column, then show every category's share of those counts.
has_category = customers[categs].astype(bool)
cat_count = has_category.sum()

_, ax = plt.subplots(1, figsize=(8, 8))
ax.pie(
    cat_count.values,
    labels=cat_count.index,
    autopct='%1.1f%%',
)
plt.show()

In [None]:
# Sum each category column over all customers and show how the total is
# split between the categories.
per_category = customers[categs]
revenues_repartition = per_category.sum()

_, ax = plt.subplots(1, figsize=(8, 8))
ax.pie(
    revenues_repartition.values,
    labels=revenues_repartition.index,
    autopct='%1.1f%%',
)
plt.show()

In [None]:
# Rebuild the feature table from the orders between 2017-01-01 and
# 2018-01-01; this is the table used for the PCA and the export below.
customers = make_dataset(
    data,
    date_start='2017-01-01',
    date_end='2018-01-01',
)

In [None]:
from sklearn.decomposition import PCA
import plotly.express as px

In [None]:
# Fit a 3-component PCA on every column except the first one (presumably the
# customer identifier moved there by reset_index in make_dataset).
# NOTE(review): no feature scaling is applied in this notebook before the
# PCA; confirm that is intended.
pca_input = customers.iloc[:, 1:].values
pca = PCA(n_components=3)
pca.fit(pca_input)

In [None]:
# Project the same columns that were used for the fit onto the fitted components.
projection_input = customers.iloc[:, 1:].values
y = pca.transform(projection_input)

In [None]:
# Interactive 3-D scatter of the customers along the three PCA components.
first_pc, second_pc, third_pc = y[:, 0], y[:, 1], y[:, 2]
fig = px.scatter_3d(x=first_pc, y=second_pc, z=third_pc)
fig.show()

In [None]:
# Save the feature table without the DataFrame index.
output_csv = '../data/processed/dataset.csv'
customers.to_csv(output_csv, index=False)