In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import datetime
from sklearn.pipeline import Pipeline

In [22]:
df = pd.read_csv('../../data/data.csv', encoding='ISO-8859-1')

In [23]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Function for one month snapshot

In [53]:
# Function to get the snapshot of the data at a given time in the past, the duration of the snapshot is one month

def snap_shot_one_month(df: pd.DataFrame, time_snapshot: datetime.datetime, duration: int = 30) -> pd.DataFrame:
    # Filter the data to get the snapshot of one month
    df = df[(df['InvoiceDate'] >= time_snapshot - datetime.timedelta(days=duration)) & (df['InvoiceDate'] <= time_snapshot)]

    # Create a new column to store the total amount of each transaction
    df['total_amount'] = df['Quantity'] * df['UnitPrice']  

    # Create a new Dataframe to store the snapshot of the data  
    df_snapshot = pd.DataFrame(columns=['CustomerID', 'total_successful_amount_last_1_month', 'num_successful_orders_last_1_month'])

    # Calculate the total amount of successful orders and the number of successful orders for each customer
    for customer_id in df['CustomerID'].unique():
        # Filter the data to get the successful orders of the customer
        df_customer = df[df['CustomerID'] == customer_id]

        # Calculate the total amount of successful orders and the number of successful orders for the customer
        total_amount = df_customer['total_amount'].sum()
        num_orders = df_customer.shape[0]

        # Add the customer's total amount and number of orders to the snapshot dataframe
        df_customer = pd.DataFrame({'CustomerID': customer_id, 'total_successful_amount_last_1_month': total_amount, 'num_successful_orders_last_1_month': num_orders}, index=[0])   
        df_snapshot = pd.concat([df_snapshot, df_customer], ignore_index=True) if df_snapshot.shape[0] > 0 else df_customer
    
    # Add the time_snapshot to the snapshot dataframe
    df_snapshot['time_snapshot'] = time_snapshot
    
    # Drop the rows with missing values (CustomerID)
    df_snapshot.dropna(inplace=True)
    return df_snapshot

In [47]:
time_snapshot = datetime.datetime(2011, 10, 1)
df_snap = snap_shot_one_month(df, time_snapshot)
df_snap.dropna(inplace=True)
df_snap

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_amount'] = df['Quantity'] * df['UnitPrice']


Unnamed: 0,CustomerID,total_successful_amount_last_1_month,num_successful_orders_last_1_month,time_snapshot
0,13509.0,40.00,2,2011-10-01
2,13305.0,385.50,16,2011-10-01
3,16187.0,1610.67,63,2011-10-01
4,17306.0,1082.92,11,2011-10-01
5,12474.0,437.58,28,2011-10-01
...,...,...,...,...
1298,13021.0,466.81,17,2011-10-01
1299,15867.0,357.71,26,2011-10-01
1300,17227.0,139.45,10,2011-10-01
1301,12842.0,808.26,20,2011-10-01


# Function looking back to the past and looking forward the future

In [51]:
# Function to that have snapshot_time as input and return the snapshot of the data from the past and the future

def snap_shot_past_future(df: pd.DataFrame, snapshot_time: datetime.datetime, past_duration: int = 30, future_duration: int = 30) -> pd.DataFrame:
    # Get the snapshot of the data at the snapshot_time
    df_past = snap_shot_one_month(df, snapshot_time, duration=past_duration)
    
    # Get the snapshot of the data at the snapshot_time + 1 month
    df_future = snap_shot_one_month(df, snapshot_time + datetime.timedelta(days=future_duration), duration=future_duration)
    
    # Merge the past and future snapshots
    df_snap_past_future = pd.merge(df_past, df_future, on='CustomerID', how='inner', suffixes=('_past', '_future'))
    return df_snap_past_future

In [54]:
time_snapshot = datetime.datetime(2011, 10, 1)
df_snap_past_future = snap_shot_past_future(df, time_snapshot)
df_snap_past_future.dropna(inplace=True)
df_snap_past_future

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_amount'] = df['Quantity'] * df['UnitPrice']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_amount'] = df['Quantity'] * df['UnitPrice']


Unnamed: 0,CustomerID,total_successful_amount_last_1_month_past,num_successful_orders_last_1_month_past,time_snapshot_past,total_successful_amount_last_1_month_future,num_successful_orders_last_1_month_future,time_snapshot_future
0,13509.0,40.00,2,2011-10-01,176.60,8,2011-10-31
1,16187.0,1610.67,63,2011-10-01,312.89,14,2011-10-31
2,12474.0,437.58,28,2011-10-01,1796.55,114,2011-10-31
3,13276.0,137.38,7,2011-10-01,453.55,23,2011-10-31
4,16145.0,648.58,46,2011-10-01,490.17,26,2011-10-31
...,...,...,...,...,...,...,...
535,13954.0,227.41,13,2011-10-01,-61.33,5,2011-10-31
536,15287.0,340.98,15,2011-10-01,-1.45,1,2011-10-31
537,17675.0,1050.98,47,2011-10-01,-99.98,5,2011-10-31
538,15411.0,609.05,33,2011-10-01,-12.50,1,2011-10-31


In [56]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
