<a href="https://colab.research.google.com/github/syankov-ai/Medium/blob/main/RFM_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faker -q

## Raw Data

In [None]:
from faker import Faker
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Seed every random source so the synthetic dataset is reproducible:
# NumPy drives ids, amounts and products; Faker drives dates and cities.
np.random.seed(42)
Faker.seed(42)

fake = Faker()

# Specify the number of dataset rows and customers
num_rows = 1000
num_customers = 300

# Simulate repeat customers: draw a pool of distinct ids, then sample that pool
# with replacement so the same customer appears on several rows
customer_ids = np.random.choice(np.arange(100, 10000), num_customers, replace=False)
repeat_ids = np.random.choice(customer_ids, num_rows)

# Random purchase dates over the past 2 years
# (the window is relative to the day the cell is executed)
purchase_dates = [fake.date_between(start_date='-2y', end_date='today') for _ in range(num_rows)]

# Amounts between 10 and 1000, rounded to 2 decimals
transaction_amounts = np.round(np.random.uniform(10, 1000, num_rows), 2)

# 4 possible products
product_info = np.random.choice(['Tasty Bites', 'T-Shirt Mouse', 'Penguin Snack', 'Handy Thing'], num_rows)

# 6-digit unique order ids: sample WITHOUT replacement so no id can repeat.
# The arange stop is exclusive, so 1000000 keeps 999999 reachable.
order_ids = np.random.choice(np.arange(100000, 1000000), num_rows, replace=False)

# Use Faker for locations
locations = [fake.city() for _ in range(num_rows)]

# One row per transaction
df = pd.DataFrame({
    'customer_id': repeat_ids,
    'purchase_date': purchase_dates,
    'transaction_amount': transaction_amounts,
    'product_information': product_info,
    'order_id': order_ids,
    'location': locations
})

df.head()


## Recency

In [None]:

# Ensure the date column is a datetime dtype so date arithmetic works below
df['purchase_date'] = pd.to_datetime(df['purchase_date'])

# Most recent purchase per customer, with the column renamed explicitly
# (a rename by name cannot silently mislabel columns the way a positional overwrite can)
df_recency = (
    df.groupby(by='customer_id', as_index=False)['purchase_date']
    .max()
    .rename(columns={'purchase_date': 'last_purchase_date'})
)

# Reference point for recency: the latest purchase anywhere in the dataset
recent_date = df_recency['last_purchase_date'].max()

# Recency in whole days since each customer's last purchase.
# Vectorised timedelta arithmetic replaces a row-by-row .apply(lambda ...).
df_recency['recency'] = (recent_date - df_recency['last_purchase_date']).dt.days

# Visualize distribution of recency using seaborn
plt.figure(figsize=(10, 6))
sns.histplot(df_recency['recency'], bins=30, kde=True, color="skyblue")
plt.title('Distribution of Customer Recency')
plt.xlabel('Recency (days since last purchase)')
plt.ylabel('Number of Customers')
plt.tight_layout()
plt.show()


## Frequency

In [None]:
# Drop fully duplicated rows, then count the remaining purchase rows per customer
frequency_df = (
    df.drop_duplicates()
    .groupby(by=['customer_id'], as_index=False)['purchase_date']
    .count()
    .rename(columns={'purchase_date': 'frequency'})
)

# Show the first few rows. Only the LAST expression of a cell is rendered
# automatically, so a bare `.head()` in the middle of the cell shows nothing;
# display() (provided by the IPython kernel) renders it explicitly.
display(frequency_df.head())

# Visualization: Distribution of purchase frequency
plt.figure(figsize=(8, 4))
sns.histplot(frequency_df['frequency'], bins=9, kde=False, color="salmon")
plt.title('Distribution of Purchase Frequency Among Customers')
plt.xlabel('Number of Unique Purchases')
plt.ylabel('Number of Customers')
plt.tight_layout()
plt.show()


## Monetary Value

In [None]:
# Mirror each purchase's amount into a 'total' column; this is the column aggregated below
df['total'] = df['transaction_amount']

# Total spend per customer: sum 'total' within each customer_id and
# expose the result under the name 'monetary'
monetary_df = (
    df.groupby(by='customer_id', as_index=False)['total']
    .sum()
    .rename(columns={'total': 'monetary'})
)

# Preview the per-customer totals
monetary_df.head()


In [None]:
# Histogram (with KDE overlay) of total spend per customer, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(monetary_df['monetary'], bins=30, kde=True, color='green', ax=ax)
ax.set_title('Distribution of Total Customer Spend')
ax.set_xlabel('Monetary Value (Total Spend)')
ax.set_ylabel('Number of Customers')
fig.tight_layout()
plt.show()


## RFM Score

In [None]:
# Join the three per-customer metrics on customer_id; the last purchase date
# itself is not needed once recency has been derived from it
rf_df = df_recency.merge(frequency_df, on='customer_id')
rfm_df = rf_df.merge(monetary_df, on='customer_id').drop(columns='last_purchase_date')

# (metric column, output column, rank ascending?)
# Recency is ranked descending so the smallest recency (most recent buyer) gets the
# highest rank; frequency and monetary are ranked ascending so larger values rank higher.
rank_specs = [
    ('recency', 'R_rank_norm', False),
    ('frequency', 'F_rank_norm', True),
    ('monetary', 'M_rank_norm', True),
]

# Express every rank as a percentage of the largest rank for that metric, so the
# three components share a common scale that tops out at 100
for metric, norm_col, ascending in rank_specs:
    ranks = rfm_df[metric].rank(ascending=ascending)
    rfm_df[norm_col] = (ranks / ranks.max()) * 100

# Preview the combined frame
rfm_df.head()


In [None]:
# Relative importance of each RFM component (the three weights add up to 1)
weight_recency = 0.15
weight_frequency = 0.3
weight_monetary = 0.55

# Weighted sum of the normalized ranks
weighted_sum = (
    weight_recency * rfm_df['R_rank_norm']
    + weight_frequency * rfm_df['F_rank_norm']
    + weight_monetary * rfm_df['M_rank_norm']
)

# Multiply by 0.05 so a weighted sum of at most 100 becomes a score of at most 5
rfm_df['RFM_Score'] = weighted_sum * 0.05

# Round every numeric column of the frame to 2 decimals
rfm_df = rfm_df.round(2)

rfm_df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram (with KDE overlay) of the composite score, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(rfm_df['RFM_Score'], bins=20, kde=True, color='purple', ax=ax)
ax.set_title('Distribution of Composite RFM Scores')
ax.set_xlabel('RFM Score')
ax.set_ylabel('Number of Customers')
fig.tight_layout()
plt.show()


## Segment Customers

In [None]:
import numpy as np

# Score cut-offs at the 20th, 50th and 80th percentiles of the RFM score
percentiles = np.percentile(rfm_df['RFM_Score'], [20, 50, 80])
low_cut, mid_cut, high_cut = percentiles

# Conditions are evaluated in order and the first match wins, so each customer
# lands in the highest tier whose cut-off the score reaches; anything below the
# 20th percentile falls through to the default label
scores = rfm_df['RFM_Score']
rfm_df['Customer_Segment'] = np.select(
    [scores >= high_cut, scores >= mid_cut, scores >= low_cut],
    ['Top Customers', 'High Value Customers', 'Medium Value Customers'],
    default='Low Value Customers',
)

# Print how many customers fall into each segment
print(rfm_df['Customer_Segment'].value_counts())


In [None]:
import matplotlib.pyplot as plt

# Fixed segment -> colour mapping, listed from highest to lowest tier.
# value_counts() orders segments by size, so pairing it with a positional colour
# list would give a segment a different colour whenever the counts change.
segment_colors = {
    'Top Customers': 'gold',
    'High Value Customers': 'lightgreen',
    'Medium Value Customers': 'lightskyblue',
    'Low Value Customers': 'lightcoral',
}

# Count the occurrences of each segment, arranged in the fixed tier order above
segment_counts = (
    rfm_df['Customer_Segment']
    .value_counts()
    .reindex(list(segment_colors), fill_value=0)
)
# Leave out segments with no customers so the pie has no empty, overlapping labels
segment_counts = segment_counts[segment_counts > 0]

# Create a pie chart, looking each wedge's colour up by segment name
plt.figure(figsize=(6, 6))
plt.pie(segment_counts, labels=segment_counts.index, autopct='%1.1f%%',
        startangle=140, colors=[segment_colors[name] for name in segment_counts.index],
        labeldistance=1.03)
plt.title("Customer Segment Distribution by RFM Score Percentiles")
plt.axis('equal')  # Equal aspect ratio ensures the pie chart is circular
plt.show()
