In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


In [4]:
# Load the raw transactions file; it is read as ISO-8859-1 rather than the
# default UTF-8.
df = pd.read_csv(
    "../../data/data.csv",
    encoding="ISO-8859-1",
)

In [5]:
# Suppress the SettingWithCopyWarning
pd.set_option('mode.chained_assignment', None)
pd.set_option('future.no_silent_downcasting', True)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
# Summary statistics of the raw frame's numeric columns, shown before any
# cleaning is applied.
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


In [None]:
def data_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw transactions frame.

    The input frame is left untouched; all work happens on a new frame, so the
    function can be re-run on the same raw data and does not rely on the
    chained-assignment warning being disabled.

    Steps:
      1. Parse ``InvoiceDate`` to datetime (unparseable values become ``NaT``).
      2. Drop rows with a negative ``UnitPrice`` and rows whose ``Description``
         is one of a fixed list of non-sales entries.
      3. Add ``Revenue = Quantity * UnitPrice``.
      4. Set negative ``Quantity`` values to 0.
      5. Add calendar features derived from ``InvoiceDate``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data containing at least the columns ``InvoiceDate``,
        ``UnitPrice``, ``Quantity`` and ``Description``.

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns plus ``Revenue``, ``date_only``,
        ``month``, ``year``, ``hour`` and ``weekday``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Descriptions of transactions that are not related to sales.
    non_sales_descriptions = [
        "AMAZON FEE",
        "DOTCOM POSTAGE",
        "Bank Charges",
        "POSTAGE",
        "Adjust bad debt",
    ]

    # ``assign`` builds a new frame, so the caller's ``df`` keeps its original
    # InvoiceDate column instead of being converted in place.
    parsed = df.assign(
        InvoiceDate=pd.to_datetime(df["InvoiceDate"], errors="coerce")
    )

    # Negative unit prices are bad-debt adjustments according to the original
    # author's note; they are removed together with the non-sales rows.
    keep_row = (parsed["UnitPrice"] >= 0) & ~parsed["Description"].isin(
        non_sales_descriptions
    )
    # Explicit copy: the column assignments below then write to an independent
    # frame rather than to a slice of ``parsed``.
    cleaned = parsed.loc[keep_row].copy()

    # Revenue is computed BEFORE the quantity is clamped, so rows with a
    # negative quantity keep their negative revenue.
    # NOTE(review): confirm that negative revenue for cancelled invoices is
    # intended; the original code had this ordering and it is preserved here.
    cleaned["Revenue"] = cleaned["Quantity"] * cleaned["UnitPrice"]

    # Invoices with a negative Quantity are treated as cancelled: set to 0.
    cleaned.loc[cleaned["Quantity"] < 0, "Quantity"] = 0

    # Calendar features derived from the invoice timestamp.
    invoice_dt = cleaned["InvoiceDate"].dt
    cleaned["date_only"] = invoice_dt.date
    cleaned["month"] = invoice_dt.month
    cleaned["year"] = invoice_dt.year
    cleaned["hour"] = invoice_dt.hour
    cleaned["weekday"] = invoice_dt.weekday
    return cleaned

In [15]:
# Run the cleaning step on the raw frame and keep the result under its own name.
df_clean = data_cleaning(df)
# Summary statistics of the cleaned frame's numeric and datetime columns,
# for comparison with the raw summary.
df_clean.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID,Revenue,month,year,hour
count,539874.0,539874,539874.0,405605.0,539874.0,539874.0,539874.0,539874.0
mean,10.478567,2011-07-04 14:37:36.306471168,3.705822,15294.795079,17.994995,7.555213,2010.921559,13.079937
min,0.0,2010-12-01 08:26:00,0.0,12346.0,-168469.6,1.0,2010.0,6.0
25%,1.0,2011-03-28 11:49:00,1.25,13969.0,3.4,5.0,2011.0,11.0
50%,3.0,2011-07-20 11:07:00,2.08,15159.0,9.36,8.0,2011.0,13.0
75%,10.0,2011-10-19 11:41:00,4.13,16794.0,17.4,11.0,2011.0,15.0
max,80995.0,2011-12-09 12:50:00,38970.0,18287.0,168469.6,12.0,2011.0,20.0
std,155.5839,,59.220321,1710.329579,371.656825,3.508993,0.268864,2.442719
