# Clustering

## Import Libraries

In [1]:
import pandas as pd
import numpy as np


## Load Dataset

In [3]:
# load dataset and limit
df = pd.read_csv("retail.csv", encoding='ISO-8859-1')

# Randomly sample 10,000 rows
df = df.sample(n=50000, random_state=43) 
df.reset_index(drop=True, inplace=True)

In [4]:
# Show Dataset
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,563238,22679,FRENCH BLUE METAL DOOR SIGN 4,10,8/15/2011 9:59,1.25,15093.0,United Kingdom
1,553566,22699,ROSES REGENCY TEACUP AND SAUCER,12,5/18/2011 9:09,2.95,12690.0,France
2,546084,22303,COFFEE MUG APPLES DESIGN,6,3/9/2011 11:28,2.55,14112.0,United Kingdom
3,572302,23533,WALL ART GARDEN HAVEN,1,10/23/2011 14:47,5.95,15427.0,United Kingdom
4,558614,22993,SET OF 4 PANTRY JELLY MOULDS,3,6/30/2011 15:56,2.46,,United Kingdom
...,...,...,...,...,...,...,...,...
49995,580848,72800E,4 IVORY DINNER CANDLES SILVER FLOCK,1,12/6/2011 11:51,0.79,18005.0,United Kingdom
49996,567742,23301,GARDENERS KNEELING PAD KEEP CALM,12,9/22/2011 10:47,1.65,14261.0,United Kingdom
49997,547387,22699,ROSES REGENCY TEACUP AND SAUCER,6,3/22/2011 16:00,2.95,12539.0,Spain
49998,563037,22352,LUNCH BOX WITH CUTLERY RETROSPOT,6,8/11/2011 15:02,2.55,12362.0,Belgium


In [5]:
# Check Data Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    50000 non-null  object 
 1   StockCode    50000 non-null  object 
 2   Description  49884 non-null  object 
 3   Quantity     50000 non-null  int64  
 4   InvoiceDate  50000 non-null  object 
 5   UnitPrice    50000 non-null  float64
 6   CustomerID   37337 non-null  float64
 7   Country      50000 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 3.1+ MB


## Check Missing Value

In [6]:
# Menghitung jumlah nilai null di setiap kolom
null_counts = df.isnull().sum()

# Menghitung persentase nilai null
null_percentages = (null_counts / len(df)) * 100


null_percentages


InvoiceNo       0.000
StockCode       0.000
Description     0.232
Quantity        0.000
InvoiceDate     0.000
UnitPrice       0.000
CustomerID     25.326
Country         0.000
dtype: float64

## Handling Missing Value

In [7]:
df = df.dropna(subset=['Description','CustomerID'])

In [8]:
df.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [10]:
# Change Data type Columns
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y %H:%M', errors='coerce')
df['CustomerID'] = df['CustomerID'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y %H:%M', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CustomerID'] = df['CustomerID'].astype('int64')


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37337 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   InvoiceNo    37337 non-null  object        
 1   StockCode    37337 non-null  object        
 2   Description  37337 non-null  object        
 3   Quantity     37337 non-null  int64         
 4   InvoiceDate  37337 non-null  datetime64[ns]
 5   UnitPrice    37337 non-null  float64       
 6   CustomerID   37337 non-null  int64         
 7   Country      37337 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 2.6+ MB


## Create New Columns

In [12]:
# Convert InvoiceDate to datetime and extract only the date
df['InvoiceDay'] = df['InvoiceDate'].dt.date

# Find the most recent purchase date for each customer
customer_data = df.groupby('CustomerID')['InvoiceDay'].max().reset_index()

# Find the most recent date in the entire dataset
most_recent_date = df['InvoiceDay'].max()

# Convert InvoiceDay to datetime type before subtraction
customer_data['InvoiceDay'] = pd.to_datetime(customer_data['InvoiceDay'])
most_recent_date = pd.to_datetime(most_recent_date)

# Calculate the number of days since the last purchase for each customer
customer_data['Days_Since_Last_Purchase'] = (most_recent_date - customer_data['InvoiceDay']).dt.days

# Remove the InvoiceDay column
customer_data.drop(columns=['InvoiceDay'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['InvoiceDay'] = df['InvoiceDate'].dt.date


In [13]:
customer_data.head()

Unnamed: 0,CustomerID,Days_Since_Last_Purchase
0,12346,325
1,12347,39
2,12348,318
3,12349,18
4,12350,310


In [14]:
# Calculate the total number of transactions made by each customer
total_transactions = df.groupby('CustomerID')['InvoiceNo'].nunique().reset_index()
total_transactions.rename(columns={'InvoiceNo': 'Total_Transactions'}, inplace=True)

# Calculate the total number of products purchased by each customer
total_products_purchased = df.groupby('CustomerID')['Quantity'].sum().reset_index()
total_products_purchased.rename(columns={'Quantity': 'Total_Products_Purchased'}, inplace=True)

# Merge the new features into the customer_data dataframe
customer_data = pd.merge(customer_data, total_transactions, on='CustomerID')
customer_data = pd.merge(customer_data, total_products_purchased, on='CustomerID')

# Display the first few rows of the customer_data dataframe
customer_data.head()

Unnamed: 0,CustomerID,Days_Since_Last_Purchase,Total_Transactions,Total_Products_Purchased
0,12346,325,1,74215
1,12347,39,5,223
2,12348,318,2,264
3,12349,18,1,46
4,12350,310,1,37


In [15]:
# Calculate the total spend by each customer
df['Total_Spend'] = df['UnitPrice'] * df['Quantity']
total_spend = df.groupby('CustomerID')['Total_Spend'].sum().reset_index()

# Merge the new features into the customer_data dataframe
customer_data = pd.merge(customer_data, total_spend, on='CustomerID')


# Display the first few rows of the customer_data dataframe
customer_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Total_Spend'] = df['UnitPrice'] * df['Quantity']


Unnamed: 0,CustomerID,Days_Since_Last_Purchase,Total_Transactions,Total_Products_Purchased,Total_Spend
0,12346,325,1,74215,77183.6
1,12347,39,5,223,494.65
2,12348,318,2,264,92.16
3,12349,18,1,46,101.69
4,12350,310,1,37,100.0


In [16]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3758 entries, 0 to 3757
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                3758 non-null   int64  
 1   Days_Since_Last_Purchase  3758 non-null   int64  
 2   Total_Transactions        3758 non-null   int64  
 3   Total_Products_Purchased  3758 non-null   int64  
 4   Total_Spend               3758 non-null   float64
dtypes: float64(1), int64(4)
memory usage: 146.9 KB


## Save New Dataframe to CSV File

In [17]:
customer_data.to_csv('retail_clustering.csv', index=False)