In [292]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import datetime

%matplotlib inline
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

from sklearn.metrics.cluster import adjusted_rand_score

from sklearn.preprocessing import StandardScaler

## Télécharger des datasets

<span style="color: #00FFFF">@ Télécharger le fichier de format csv et puis le palce dans un dataframe nommé `loandown`. Les colonnes du fichier sont uniquement séparées par le caractère de tabulation `/t`. </span> 

In [293]:
output = pd.read_csv("olist-jointure.csv", sep=',')
initial_data = pd.read_csv("olist-customers-segmentation-label.csv", sep=',')

In [294]:
initial_data.dtypes;

In [295]:
datetime_cols = ["order_purchase_timestamp"]
for col in datetime_cols:
    output[col] = pd.to_datetime(output[col])

In [296]:
initial_date = output["order_purchase_timestamp"].max()
labels = initial_data["kmeans_label"].values
np.unique(labels)

array([0, 1, 2, 3])

In [297]:
X = initial_data.iloc[:,1:5].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [298]:
output.head()

Unnamed: 0,customer_unique_id,order_id,customer_id,order_purchase_timestamp,review_score,payment_value
0,7c396fd4830fd04220f754e42b4e5bff,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,4,18.12
1,7c396fd4830fd04220f754e42b4e5bff,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,4,2.0
2,7c396fd4830fd04220f754e42b4e5bff,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,4,18.59
3,af07308b275d755c9edb36a90c618231,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,2018-07-24 20:41:37,4,141.46
4,3a653a41f6f9fc3d2a113cf8398680e8,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,2018-08-08 08:38:49,5,179.12


In [299]:
output["order_purchase_timestamp"].max()

Timestamp('2018-10-17 17:30:18')

## Fonctions

In [300]:
def data_sythetic(req_data):
  output_syn = req_data.groupby(by="customer_unique_id").agg({"order_id":'nunique',
                                                              "payment_value":'sum',
                                                              "review_score":'mean',
                                                            })
  output_syn= output_syn.rename(columns ={"order_id":"order_times",
                                    "payment_value":"conso_total",
                                    "review_score":"review_mean",
                                   })

  req_data=pd.merge(req_data, output_syn, how='left', on="customer_unique_id")
  
  import datetime
  # le nombre de jours depuis le dernier achat
  req_data["days_no_purchase"] = req_data["order_purchase_timestamp"].max()- req_data["order_purchase_timestamp"]
  req_data["days_no_purchase"]=round(req_data["days_no_purchase"]/ np.timedelta64(1, 'D'),2)
  
  req_data.drop_duplicates(subset="customer_unique_id", keep="first", inplace=True)
  req_data = req_data[[ 'customer_unique_id','review_mean', 'conso_total','order_times', 'days_no_purchase']]
  return req_data
  

In [305]:
def get_predictions_model (req_data):
    req_data = data_sythetic(req_data)
    X = req_data.iloc[:,-4:].values
    
    # on scale les données.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
   
    # Instantiate the clustering model KMeans with best K and visualizer 
    kmeans_model = KMeans(4)
    kmeans_model.fit(X_scaled)

    # Kmeans labels
    req_data["kmeans_label"] = kmeans_model.labels_

    return kmeans_model

## Prédictions

In [308]:
ARIs = []
print("Initial label: ", np.unique(labels))
for i in range(1,13):
      req_data_i_months_ago = output.loc[output["order_purchase_timestamp"]< initial_date + pd.DateOffset(months=-i)]
      kmeans_i_months_ago = get_predictions_model(req_data_i_months_ago)
      pred_i_months_ago = kmeans_i_months_ago.predict(X_scaled)
      print("Adjusted Rand Index ",i,"months ago: %0.3f"% adjusted_rand_score(pred_i_months_ago, labels))
      ARIs.append("%0.3f"% adjusted_rand_score(pred_i_months_ago, labels))

Initial label:  [0 1 2 3]
Adjusted Rand Index  1 months ago: 0.985
Adjusted Rand Index  2 months ago: 0.991
Adjusted Rand Index  3 months ago: 0.903
Adjusted Rand Index  4 months ago: 0.910
Adjusted Rand Index  5 months ago: 0.908
Adjusted Rand Index  6 months ago: 0.907
Adjusted Rand Index  7 months ago: 0.899
Adjusted Rand Index  8 months ago: 0.938
Adjusted Rand Index  9 months ago: 0.950
Adjusted Rand Index  10 months ago: 0.955
Adjusted Rand Index  11 months ago: 0.959
Adjusted Rand Index  12 months ago: 0.968
