In [7]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [8]:
# Read the raw transaction export into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
raw_path = "dataset.csv"
df = pd.read_csv(raw_path)

In [9]:
# Normalize column names: trim surrounding whitespace, lowercase, and turn
# spaces into underscores (e.g. " Invoice No " -> "invoice_no").
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)

In [10]:
# Give the date column the snake_case name that the later cells refer to.
column_aliases = {'invoicedate': 'invoice_date'}
df = df.rename(columns=column_aliases)

# Preprocessing

In [11]:
# Parse invoice_date into datetimes. With errors='coerce', entries that cannot
# be parsed become NaT instead of raising.
parsed_dates = pd.to_datetime(df['invoice_date'], errors='coerce')
df['invoice_date'] = parsed_dates

In [12]:
# Discard rows that have no customer ID or no usable invoice date; both are
# needed to attribute a purchase to a customer at a point in time.
required_columns = ['customerid', 'invoice_date']
df.dropna(subset=required_columns, inplace=True)

In [13]:
# Force quantity and unit price to numeric dtypes; values that cannot be
# converted become NaN (errors='coerce').
for numeric_col in ('quantity', 'unitprice'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [14]:
# Remove returns and invalid values.
# Keep only rows with a strictly positive quantity and unit price. NaN compares
# False against 0, so rows with a missing quantity or price are dropped too.
valid_rows = (df['quantity'] > 0) & (df['unitprice'] > 0)

# Take an explicit copy of the filtered rows: boolean indexing returns a slice
# of the original frame, and adding a column to such a slice triggers pandas'
# SettingWithCopyWarning.
df = df[valid_rows].copy()

In [15]:
# Line-level revenue: quantity multiplied by unit price.
df['sales'] = df['quantity'].mul(df['unitprice'])

#  RFM Metric Calculation

In [16]:
# Reference date for recency: one day after the most recent invoice in the
# data, so every recency value computed against it is at least 1 day.
latest_invoice = df['invoice_date'].max()
snapshot_date = latest_invoice + pd.Timedelta(days=1)

In [17]:
# Build one row per customer with the three RFM metrics:
#   recency   - whole days between snapshot_date and the customer's last invoice
#   frequency - number of distinct invoice numbers
#   monetary  - total of the sales column
rfm = df.groupby('customerid').agg(
    recency=('invoice_date', lambda dates: (snapshot_date - dates.max()).days),
    frequency=('invoiceno', 'nunique'),
    monetary=('sales', 'sum'),
)

#  RFM Scoring

In [18]:
# Score each metric into quintiles (1 = worst, 5 = best).
low_to_high = [1, 2, 3, 4, 5]
high_to_low = low_to_high[::-1]

# Recency is inverted: the smallest recency (most recent purchase) gets a 5.
rfm['r_score'] = pd.qcut(rfm['recency'], q=5, labels=high_to_low)

# Frequency is ranked first; method='first' gives tied values distinct ranks,
# so the quintile edges passed to qcut are unique.
frequency_rank = rfm['frequency'].rank(method='first')
rfm['f_score'] = pd.qcut(frequency_rank, q=5, labels=low_to_high)

rfm['m_score'] = pd.qcut(rfm['monetary'], q=5, labels=low_to_high)

# Concatenate the three digits into a single string code such as "543".
score_digits = [rfm[col].astype(str) for col in ('r_score', 'f_score', 'm_score')]
rfm['rfm_score'] = score_digits[0] + score_digits[1] + score_digits[2]

#  Customer Segmentation (KMeans)

In [19]:
# Standardize the three RFM metrics before clustering so that each feature
# enters KMeans with zero mean and unit variance.
rfm_features = ['recency', 'frequency', 'monetary']
scaler = StandardScaler()
scaled = scaler.fit_transform(rfm[rfm_features])

# n_init is pinned explicitly. scikit-learn changed its default from 10 to
# 'auto' (with a FutureWarning in the releases in between), so leaving it
# implicit makes the cluster assignment depend on the installed version even
# though random_state is fixed.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
rfm['cluster'] = kmeans.fit_predict(scaled)

#  Visualization

In [20]:
# Scatter customers by recency (x) and monetary value (y), coloured by cluster,
# and write the figure to disk.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=rfm,
    x='recency',
    y='monetary',
    hue='cluster',
    palette='Set2',
    ax=ax,
)
ax.set_title("Customer Segmentation (Recency vs Monetary)")
fig.tight_layout()
fig.savefig("rfm_cluster_plot.png")
plt.close(fig)  # free the figure once it has been saved

# Export

In [21]:
# Move customerid from the index back into a regular column, then write the
# segmented table to CSV without the (now default) integer index.
export_frame = rfm.reset_index()
export_frame.to_csv("rfm_segmented_customers.csv", index=False)
print("Success: 'rfm_segmented_customers.csv' and 'rfm_cluster_plot.png' saved.")


Success: 'rfm_segmented_customers.csv' and 'rfm_cluster_plot.png' saved.
