In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import datetime
from sklearn.pipeline import Pipeline
from dateutil.relativedelta import relativedelta

In [2]:
df = pd.read_csv('../../data/data.csv', encoding='ISO-8859-1')

In [3]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [4]:
# Suppress the SettingWithCopyWarning
pd.set_option('mode.chained_assignment', None)

# Function for one month duration snapshot

In [5]:
# Function to get the snapshot of the data at a given time in the past, the time_shot of the snapshot is one month
## Input
##      df: the original data
##      time_snapshot: the time of the snapshot
##      time_shot: the time_shot of the snapshot (1 month ago, 2 months ago, 3 months ago, etc.)
## Output
##      df_snapshot: the snapshot of the data at the given time in the past. The duration of the snapshot is one month
##                    The snapshot includes the total amount of successful orders and the number of successful orders for each customer 

def snap_shot_month(df: pd.DataFrame, time_snapshot: datetime.datetime, time_shot: int = 1) -> pd.DataFrame:
    # Filter the data to get the snapshot of one month
    df = df[(df['InvoiceDate'] >= time_snapshot - relativedelta(months=time_shot)) & (df['InvoiceDate'] <= time_snapshot- relativedelta(months=time_shot-1))]

    # Create a new column to store the total amount of each transaction
    df['total_amount'] = df['Quantity'] * df['UnitPrice']  

    # Create a new Dataframe to store the snapshot of the data  
    df_snapshot = pd.DataFrame(columns=['CustomerID', 'total_successful_amount_last_1_month', 'num_successful_orders_last_1_month'])

    # Calculate the total amount of successful orders and the number of successful orders for each customer
    for customer_id in df['CustomerID'].unique():
        # Filter the data to get the successful orders of the customer
        df_customer = df[df['CustomerID'] == customer_id]

        # Calculate the total amount of successful orders and the number of successful orders for the customer
        total_amount = df_customer['total_amount'].sum()
        num_orders = df_customer.shape[0]

        # Add the customer's total amount and number of orders to the snapshot dataframe
        df_customer = pd.DataFrame({'CustomerID': customer_id, 'total_successful_amount_last_1_month': total_amount, 'num_successful_orders_last_1_month': num_orders}, index=[0])   
        df_snapshot = pd.concat([df_snapshot, df_customer], ignore_index=True) if df_snapshot.shape[0] > 0 else df_customer
    
    # Add the time_snapshot to the snapshot dataframe
    df_snapshot['time_snapshot'] = time_snapshot
    
    # Drop the rows with missing values (CustomerID)
    df_snapshot.dropna(inplace=True)
    return df_snapshot

In [6]:
# Test the function
time_snapshot = datetime.datetime(2011, 1, 1)
df_snap = snap_shot_month(df, time_snapshot)
df_snap

Unnamed: 0,CustomerID,total_successful_amount_last_1_month,num_successful_orders_last_1_month,time_snapshot
0,17850.0,5391.21,297,2011-01-01
1,13047.0,366.63,17,2011-01-01
2,12583.0,855.86,20,2011-01-01
3,13748.0,204.00,1,2011-01-01
4,15100.0,492.75,4,2011-01-01
...,...,...,...,...
944,13922.0,172.25,7,2011-01-01
945,13817.0,128.70,9,2011-01-01
946,12585.0,1262.85,65,2011-01-01
947,13165.0,354.64,13,2011-01-01


In [7]:
# Function to get the first days of each month between two dates
def get_first_days_of_months(start_date, end_date):
    # List to store the first days of each month
    first_days = []
    
    # Start from the first day of the start date's month
    current_date = start_date.replace(day=1)
    
    # Loop until the current_date is past the end_date
    while current_date <= end_date:
        first_days.append(current_date)
        current_date += relativedelta(months=1)  # Add one month
        
    return first_days

In [8]:
# Function to get all snapshots of the data at a given time in the past

def snap_shot_all(df: pd.DataFrame,  time_shot: int = 1) -> pd.DataFrame:
    
    # Get time snapshots
    time_snapshots = df['InvoiceDate'].unique()
    time_min = time_snapshots.min().date()
    time_max = time_snapshots.max().date()

    # Create the beginning and ending time of the snapshots
    time_begin = pd.to_datetime(time_min.replace(day=1) + relativedelta(months=time_shot))
    time_end = pd.to_datetime(time_max.replace(day=1)+ relativedelta(months=time_shot))

    # Get the first days of each month between the beginning and ending time 
    first_days = get_first_days_of_months(time_begin, time_end)

    # Create a new Dataframe to store the snapshot of the data  
    df_snapshot_all = pd.DataFrame(columns=['CustomerID', 'total_successful_amount_last_1_month', 'num_successful_orders_last_1_month', 'time_snapshot'])

    # Get the snapshot of the data at each time in the past
    for time_snapshot in first_days:
        df_snapshot = snap_shot_month(df, time_snapshot, time_shot=time_shot)
        df_snapshot_all = pd.concat([df_snapshot_all, df_snapshot], ignore_index=True) if df_snapshot_all.shape[0] > 0 else df_snapshot
    
    return df_snapshot_all

In [9]:
snap_shot_all(df)

Unnamed: 0,CustomerID,total_successful_amount_last_1_month,num_successful_orders_last_1_month,time_snapshot
0,17850.0,5391.21,297,2011-01-01
1,13047.0,366.63,17,2011-01-01
2,12583.0,855.86,20,2011-01-01
3,13748.0,204.00,1,2011-01-01
4,15100.0,492.75,4,2011-01-01
...,...,...,...,...
13670,12713.0,848.55,38,2012-01-01
13671,17581.0,984.68,35,2012-01-01
13672,15804.0,329.05,21,2012-01-01
13673,13113.0,339.20,4,2012-01-01


# Function looking back to the past and looking forward the future

In [10]:
# Function to that have snapshot_time as input and return the snapshot of the data from the past and the future
## Input
##      df: the original data
##      snapshot_time: the time of the snapshot
##      past_time_shot: the number of time_spots of the snapshot in the past (1 month ago, 2 months ago, 3 months ago, etc.)
##      future_time_shot: the number of time_spots of the snapshot in the future (1 month later, 2 months later, 3 months later, etc.)
## Output
##      df_snapshot: the snapshot of the data at the given time in the past and the future. The duration of the snapshot is one month
##                   The snapshot includes the total amount of successful orders and the number of successful orders for each customer


def snap_shot_past_future(df: pd.DataFrame, snapshot_time: datetime.datetime, past_time_shot: int = 2, future_time_shot: int = 2) -> pd.DataFrame:
    
    # Get time snapshots
    time_snapshots = df['InvoiceDate'].unique()
    time_min = time_snapshots.min().date()
    time_max = time_snapshots.max().date()
    time_min_snapshot = pd.to_datetime(time_min.replace(day=1))
    time_max_snapshot = pd.to_datetime(time_max.replace(day=1)) + relativedelta(months=1)

    # Get the snapshot of the data in the past at the snapshot_time
    df_past= snap_shot_month(df, snapshot_time, time_shot=1) # Get the snapshot one month ago

    # Loop to get the snapshots of the data in the past at the snapshot_time
    for i in range(2, past_time_shot+1):
        df_past_temp = snap_shot_month(df, snapshot_time, time_shot=i) # Get the snapshot i months ago
        df_past_temp.drop(columns=['time_snapshot'], inplace=True) # Drop the time_snapshot column
        df_past_temp.rename(columns={'total_successful_amount_last_1_month': f'total_successful_amount_last_{i}_month', 'num_successful_orders_last_1_month': f'num_successful_orders_last_{i}_month'}, inplace=True) # Rename the columns name
        df_past = pd.merge(df_past_temp, df_past,  on='CustomerID', how='right') # Merge the past snapshots on the CustomerID

        # Check if the snapshot time is greater than the minimum time snapshot
        ## Fill the missing values with 0 if the snapshot time is greater than the minimum time snapshot
        ## Otherwise, keep the missing values
        if snapshot_time - relativedelta(months=i) >= time_min_snapshot:
            df_past.fillna(0, inplace=True) # Fill the missing values with 0
        

    # Loop to get the snapshots of the data in the future at the snapshot_time
    for i in range(1, future_time_shot+1):
        df_future_temp = snap_shot_month(df, snapshot_time + relativedelta(months=i))
        df_future_temp.drop(columns=['time_snapshot'], inplace=True) # Drop the time_snapshot column   
        df_future_temp.rename(columns={'total_successful_amount_last_1_month': f'total_successful_amount_future_{i}_month', 'num_successful_orders_last_1_month': f'num_successful_orders_future_{i}_month'}, inplace=True)
        df_past = pd.merge(df_past, df_future_temp,  on='CustomerID', how='left') # Merge the future snapshots on the CustomerID
        # Check if the snapshot time is less than the maximum time snapshot
        ## Fill the missing values with 0 if the snapshot time is less than the maximum time snapshot
        ## Otherwise, keep the missing values
        if snapshot_time + relativedelta(months=i) <= time_max_snapshot:
            df_past[df_future_temp.columns.to_list()] = df_past[df_future_temp.columns.to_list()].fillna(0)
    
    # Reorder the columns snapshot_time to the end of the dataframe
    col = df_past.pop('time_snapshot')
    df_past['time_snapshot'] = col
        
    return df_past

In [11]:
# Function to get all snapshots of the data at a given time in the past and the future

def snap_shot_all_past_future(df: pd.DataFrame, past_time_shot: int = 2, future_time_shot: int = 2) -> pd.DataFrame:
    
    # Get time snapshots
    time_snapshots = df['InvoiceDate'].unique()
    time_min = time_snapshots.min().date()
    time_max = time_snapshots.max().date()

    # Create the beginning and ending time of the snapshots
    time_begin = pd.to_datetime(time_min.replace(day=1)) 
    time_end = pd.to_datetime(time_max.replace(day=1)) + relativedelta(months=1)

    # Get the first days of each month between the beginning and ending time 
    first_days = get_first_days_of_months(time_begin, time_end)

    # Create a new Dataframe to store the snapshot of the data  
    df_snapshot_all = pd.DataFrame(columns=['CustomerID', 'total_successful_amount_last_1_month', 'num_successful_orders_last_1_month', 'time_snapshot'])

    # Get the snapshot of the data at each time in the past
    for time_snapshot in first_days:
        df_snapshot = snap_shot_past_future(df, time_snapshot, past_time_shot=past_time_shot, future_time_shot=future_time_shot)
        df_snapshot_all = pd.concat([df_snapshot_all, df_snapshot], ignore_index=True) if df_snapshot_all.shape[0] > 0 else df_snapshot
    
    return df_snapshot_all

In [12]:
# Table of the snapshot of the data at each time in the past and the future 

snap_shot_all_past_future(df, 2, 2) # Two months in the past and two months in the future

  df_snapshot_all = pd.concat([df_snapshot_all, df_snapshot], ignore_index=True) if df_snapshot_all.shape[0] > 0 else df_snapshot
  df_snapshot_all = pd.concat([df_snapshot_all, df_snapshot], ignore_index=True) if df_snapshot_all.shape[0] > 0 else df_snapshot
  df_snapshot_all = pd.concat([df_snapshot_all, df_snapshot], ignore_index=True) if df_snapshot_all.shape[0] > 0 else df_snapshot


Unnamed: 0,CustomerID,total_successful_amount_last_2_month,num_successful_orders_last_2_month,total_successful_amount_last_1_month,num_successful_orders_last_1_month,total_successful_amount_future_1_month,num_successful_orders_future_1_month,total_successful_amount_future_2_month,num_successful_orders_future_2_month,time_snapshot
0,17850.0,,,5391.21,297,0.00,0.0,-102.58,15.0,2011-01-01
1,13047.0,,,366.63,17,-2.95,1.0,458.90,26.0,2011-01-01
2,12583.0,,,855.86,20,730.16,22.0,303.56,14.0,2011-01-01
3,13748.0,,,204.00,1,0.00,0.0,0.00,0.0,2011-01-01
4,15100.0,,,492.75,4,142.35,2.0,0.00,0.0,2011-01-01
...,...,...,...,...,...,...,...,...,...,...
13670,12713.0,0.00,0.0,848.55,38,,,,,2012-01-01
13671,17581.0,1682.29,66.0,984.68,35,,,,,2012-01-01
13672,15804.0,692.12,52.0,329.05,21,,,,,2012-01-01
13673,13113.0,336.96,5.0,339.20,4,,,,,2012-01-01
