In [7]:
# Load IPython's autoreload extension (prints a notice if it is already loaded).
%load_ext autoreload
# Mode 2: re-import modules before each cell runs, so edits to imported
# project code are picked up without restarting the kernel.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from customer_segmentation import CustomerStandardizer, CustomerSummary
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "white" style for every figure drawn after this cell.
sns.set_style('white')

In [9]:
class Periodizer(BaseEstimator, TransformerMixin):
    """Label each order with the time period it falls into.

    Periods are windows of ``30 * months`` days counted back from the oldest
    order (the largest ``time_since_order``). Period 1 is the oldest window
    and larger ids are more recent. Orders that lie at or below the last
    (most recent) edge are folded into the most recent period, so that
    period may be longer than the others.

    Parameters
    ----------
    months : int or float, default=3
        Length of one period, in 30-day months. Must be positive.
    """

    def __init__(self, months=3):
        self.months = months

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a sorted copy of ``X`` with timedelta columns and a ``period_id``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``time_since_order`` and ``delay`` as numbers of
            milliseconds.

        Returns
        -------
        pandas.DataFrame
            A new frame sorted by ``time_since_order`` (the input is not
            modified) in which both columns are timedeltas and an integer
            ``period_id`` column has been added. If the data span no more
            than one period, every row gets ``period_id`` 0.

        Raises
        ------
        ValueError
            If ``months`` is not positive (the edge search could not
            terminate with a zero or negative interval).
        """
        if self.months <= 0:
            raise ValueError(f"months must be positive, got {self.months!r}")

        # sort_values returns a new frame, so the caller's data stays untouched.
        X = X.sort_values("time_since_order")
        for col in ("time_since_order", "delay"):
            # Direct millisecond conversion: vectorized, and avoids the
            # rounding of an intermediate float number of days.
            X[col] = pd.to_timedelta(X[col], unit="ms")

        times = X["time_since_order"]
        bin_edges = self._bin_edges(times)
        n_edges = len(bin_edges)
        if n_edges == 1:
            # Less than one full period of data: no bins to cut into.
            X["period_id"] = 0
            return X

        # pd.cut numbers bins from the most recent (0) upwards and leaves NaN
        # for rows at or below the lowest edge. Flip the numbering so that
        # period 1 is the oldest, and send the NaN rows to the most recent one.
        bin_idx = pd.cut(times, bins=bin_edges[::-1], labels=False)
        X["period_id"] = (n_edges - 1 - bin_idx).fillna(n_edges - 1).astype(int)
        return X

    def _bin_edges(self, times):
        """Build strictly decreasing period edges, starting at the oldest order.

        ``times`` must be sorted ascending. Each edge is snapped to the
        smallest observed value at or above the nominal edge (previous edge
        minus one interval). If no order falls inside that window, the
        nominal edge is used instead so the edges keep decreasing; the
        matching period is then empty.
        """
        time_interval = pd.Timedelta(30 * self.months, unit="D")
        bin_edges = [times.max()]
        while bin_edges[-1] - time_interval > pd.Timedelta(0):
            nominal_edge = bin_edges[-1] - time_interval
            # Positional lookup of the first value >= nominal_edge; unlike
            # idxmin/.loc this also works when index labels are duplicated.
            first_pos = (times < nominal_edge).to_numpy().argmin()
            snapped_edge = times.iloc[first_pos]
            if snapped_edge < bin_edges[-1]:
                bin_edges.append(snapped_edge)
            else:
                # Gap in the data: snapping would repeat the previous edge
                # and the loop would never advance.
                bin_edges.append(nominal_edge)
        return bin_edges

In [10]:
df = pd.read_json("orders.json")

In [12]:
prdz = Periodizer(months=3)

df = prdz.fit_transform(df)

In [14]:
# Group the periodized orders once so each period can be pulled out by id.
df_agg_period = df.groupby("period_id")
periods = dict()
csum = CustomerSummary()
cstd = CustomerStandardizer()

# Period ids are visited from 1 up to the largest id present in the data.
last_period = df["period_id"].max()
for trim in range(1, last_period + 1):
    # The period label is dropped before the orders are summarized.
    period_orders = df_agg_period.get_group(trim).drop(columns="period_id")
    period_summary = csum.fit_transform(period_orders)
    # Both transformers are re-fitted on every period's own data.
    periods[trim] = cstd.fit_transform(period_summary)