<a href="https://colab.research.google.com/github/vicmcl/e-commerce/blob/simulation/notebook_simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [112]:
%reload_ext autoreload
%autoreload 2

In [113]:
from pathlib import Path
from google.colab import drive
import sys

content_path = Path('/').absolute() / 'content'
drive_path = content_path / 'drive'
drive.flush_and_unmount()
drive.mount(str(drive_path))

sys.path.append(str(drive_path / "My Drive" / "OCR" / "utils"))

Mounted at /content/drive


In [193]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pickle

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline
sns.set_style('white')

In [115]:
df = pd.read_json(drive_path / "My Drive" / "OCR" / "4_Olist" / "orders.json").sort_values("time_since_order")

In [116]:
df[["time_since_order", "delay"]] = df[["time_since_order", "delay"]].applymap(lambda x: pd.Timedelta(x, unit="ms"))

In [117]:
df

Unnamed: 0,order_id,customer_unique_id,total_payments,mean_review_score,time_since_order,delay
6511,10a045cdf6a5650c21e9cfeb60384c16,87ab9fec999db8bd5774917de3cdf01c,89.71,1.0,0 days 00:00:00,0 days 00:00:00
68317,b059ee4de278302d550a3035c4cdb740,262e1f1e26e92e86375f86840b4ffd63,222.03,5.0,0 days 21:14:16,0 days 00:00:00
62892,a2ac6dad85cf8af5b0afb510a240fe8c,af5454198a97379394cacf676e1e96cb,197.55,1.0,13 days 22:34:49,0 days 00:00:00
37735,616fa7d4871b87832197b2a137a115d2,634420a0ea42302205032ed44ac7fccc,80.38,2.0,16 days 02:00:09,0 days 00:00:00
22190,392ed9afd714e3c74767d0c4d3e3f477,9bb92bebd4cb7511e1a02d5e50bc4655,137.03,1.0,18 days 08:17:15,0 days 00:00:00
...,...,...,...,...,...,...
43966,71303d7e93b399f5bcd537d124c0bcfa,0eb1ee9dba87f5b36b4613a65074337c,109.34,1.0,744 days 19:22:26,0 days 00:00:00
74448,bfbd0f9bdef84302105ad712db648a6c,830d5b7aaa3b6f1e9ad63703bec97d23,0.00,1.0,762 days 05:13:40,36 days 07:47:38
49869,809a282bbd5dbcabb6f2f724fca862ec,009b0127b727ab0ba422f6d9604487c7,40.95,1.0,764 days 02:05:59,0 days 00:00:00
89489,e5fa5a7210941f7d56d0208e4e071d35,4854e9b3feff728c13ee5fc7d1547e92,75.06,1.0,772 days 17:14:44,0 days 00:00:00


# Train sets definition

In [118]:
test_delays = range(2, 7) # months
train_periods = [2, 4, 6]

In [136]:
six_months = pd.Timedelta(180, unit='D')
train_orders = dict()

for i in train_periods:
    train_period = pd.Timedelta(i * 30, unit='D')
    df_train = df[df["time_since_order"] > six_months]
    df_train = df_train[df_train["time_since_order"] < train_period + six_months]
    train_orders[i] = df_train

# Test sets definition

In [137]:
test_orders = dict()
test_set_length = 2 # months

for j in test_delays:
    start_time = pd.Timedelta(180, unit='D') - pd.Timedelta((j - 2) * 30, unit='D')
    end_time = start_time - pd.Timedelta(test_set_length * 30, unit='D')
    df_test = df[df["time_since_order"] < start_time]
    df_test = df_test[df_test["time_since_order"] > end_time]
    test_orders[j] = df_test

# Scaler & imputer

In [138]:
imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()

# Train sets aggregation

In [139]:
train_customers = dict()

for key in train_orders.keys():
    tr = train_orders[key]

    # Aggregation by customer
    tr = tr.groupby("customer_unique_id").agg(
        {
            "order_id": "count",
            "total_payments": "sum",
            "mean_review_score": "mean",
            "time_since_order": "min",
            "delay": "mean",
        }
    )

    tr.columns = [
        "n_orders",
        "total_payments",
        "mean_review_score",
        "time_since_order",
        "mean_delay",
    ]

    # Impute mean value to review score
    tr["mean_review_score"] = imputer.fit_transform(
    np.array(tr["mean_review_score"]).reshape(-1, 1)
    )

    # Convert datetime to seconds
    tr[["time_since_order", "mean_delay"]] = (
        tr[["time_since_order", "mean_delay"]]
        .applymap(lambda cell: cell.total_seconds())
    )

    # Log tranform
    tr[["n_orders", "total_payments"]] = (
        tr[["n_orders", "total_payments"]]
        .applymap(lambda cell: np.log1p(cell))
    )

    # Standardization
    for col in [
        "n_orders",
        "total_payments",
        "mean_review_score",
        "mean_delay",
        "time_since_order"
    ]:
        tr[col] = scaler.fit_transform(
            np.array(tr[col]).reshape(-1, 1)
        )

    train_customers[key] = tr

# Test sets aggregation

In [140]:
test_customers = dict()

for key in test_orders.keys():
    tt = test_orders[key]

    # Aggregation by customer
    tt = tt.groupby("customer_unique_id").agg(
        {
            "order_id": "count",
            "total_payments": "sum",
            "mean_review_score": "mean",
            "time_since_order": "min",
            "delay": "mean",
        }
    )

    tt.columns = [
        "n_orders",
        "total_payments",
        "mean_review_score",
        "time_since_order",
        "mean_delay",
    ]

    # Impute mean value to review score
    tt["mean_review_score"] = imputer.fit_transform(
    np.array(tt["mean_review_score"]).reshape(-1, 1)
    )

    # Convert datetime to seconds
    tt[["time_since_order", "mean_delay"]] = (
        tt[["time_since_order", "mean_delay"]]
        .applymap(lambda cell: cell.total_seconds())
    )

    # Log tranform
    tt[["n_orders", "total_payments"]] = (
        tt[["n_orders", "total_payments"]]
        .applymap(lambda cell: np.log1p(cell))
    )

    # Standardization
    for col in [
        "n_orders",
        "total_payments",
        "mean_review_score",
        "mean_delay",
        "time_since_order"
    ]:
        tt[col] = scaler.fit_transform(
            np.array(tt[col]).reshape(-1, 1)
        )

    test_customers[key] = tt

# K-Means

In [141]:
test_customers[2]

Unnamed: 0_level_0,n_orders,total_payments,mean_review_score,time_since_order,mean_delay
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0000366f3b9a7992bf8c76cfdf3221e2,-0.102348,0.260650,0.631592,0.500345,0.451003
0000b849f77a49e4a4ce2b2a4ca5be3f,-0.102348,-1.727653,-0.172740,0.671802,0.444750
000949456b182f53c18b68d6babc79c1,-0.102348,-0.404130,-0.172740,1.477739,-0.103454
000ed48ceeb6f4bf8ad021a10a3c7b43,-0.102348,-2.045347,0.631592,-0.086301,0.284322
0019e8c501c85848ac0966d45226fa1d,-0.102348,0.661801,0.631592,1.020131,-0.361224
...,...,...,...,...,...
ffe6305176b9431a3eda3cf8904d7eb7,-0.102348,-0.836096,0.000000,-1.243461,6.846688
ffe76cb2f4bb39384c432d65ece67441,-0.102348,-0.666091,0.631592,1.361813,-0.085226
ffecceca389973ef16660d58696f281e,-0.102348,-0.549441,0.631592,1.357762,-0.102232
ffeddf8aa7cdecf403e77b2e9a99e2ea,-0.102348,0.703884,-1.781404,0.316007,1.046700


In [167]:
kmeans_model = KMeans(n_clusters=5, random_state=42)

sim_data = dict()

for t_train in train_periods:
    print("t_train =", t_train)
    train_set = train_customers[t_train]
    # K-means on train_set and store in df_cluster
    train_clusters = kmeans_model.fit_predict(train_set)

    test_clusters = dict()
    for t_delay in test_delays:
        print("t_delay_test =", t_delay)
        test_set = test_customers[t_delay].copy()

        # K-means on test_set and store in df_cluster
        test_clusters[f"test_delay_{t_delay}"] = kmeans_model.predict(test_set)


    sim_data[f"train_period_{t_train}"] = (train_clusters, test_clusters)

t_train = 2




t_delay_test = 2
t_delay_test = 3
t_delay_test = 4
t_delay_test = 5
t_delay_test = 6
t_train = 4




t_delay_test = 2
t_delay_test = 3
t_delay_test = 4
t_delay_test = 5
t_delay_test = 6
t_train = 6




t_delay_test = 2
t_delay_test = 3
t_delay_test = 4
t_delay_test = 5
t_delay_test = 6


In [200]:
# Adjusted rand score between each train and test sets

ari = dict()
for train_period in sim_data.keys():
    for test_delay in sim_data[train_period][1].keys():
        n_sample = min(sim_data[train_period][0].shape[0], sim_data[train_period][1][test_delay].shape[0])
        _, sampled_train, _, _ = train_test_split(
            sim_data[train_period][0],
            sim_data[train_period][0],
            test_size=n_sample - 5,
            stratify=sim_data[train_period][0],
            random_state=42
        )
        _, sampled_test, _, _ = train_test_split(
            sim_data[train_period][1][test_delay],
            sim_data[train_period][1][test_delay],
            test_size=n_sample - 5,
            stratify=sim_data[train_period][1][test_delay],
            random_state=42
        )
        ari[f"{train_period}_{test_delay}"] = adjusted_rand_score(sampled_train, sampled_test)

sorted_ari = sorted(ari.items(), key=lambda x: x[1], reverse=True)
for key, val in sorted_ari:
    print(key, f"{val:.5f}")

train_period_2_test_delay_5 0.00140
train_period_2_test_delay_6 0.00131
train_period_4_test_delay_3 0.00096
train_period_6_test_delay_2 0.00094
train_period_4_test_delay_4 0.00070
train_period_6_test_delay_6 0.00005
train_period_2_test_delay_3 -0.00007
train_period_2_test_delay_2 -0.00026
train_period_6_test_delay_4 -0.00043
train_period_6_test_delay_5 -0.00077
train_period_4_test_delay_5 -0.00099
train_period_4_test_delay_2 -0.00110
train_period_2_test_delay_4 -0.00133
train_period_6_test_delay_3 -0.00158
train_period_4_test_delay_6 -0.00337
