In [1]:
#1. Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import shap
import warnings
import missingno as msno
from ydata_profiling import ProfileReport
from scipy import stats
from yellowbrick.cluster import KElbowVisualizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import TSNE
from sklearn.feature_selection import RFE
from sklearn.ensemble import VotingClassifier
#turn off alert
warnings.filterwarnings('ignore')
#adjust the style settings to make the graphics more stylish
plt.style.use('ggplot')
sns.set(style="whitegrid")  

In [2]:
##2. Load the Dataset
data = pd.read_csv('customer_segmentation_data.csv')

In [3]:
#View the first few rows of the dataset
print(data.head())

   id  age  gender  income  spending_score  membership_years  \
0   1   38  Female   99342              90                 3   
1   2   21  Female   78852              60                 2   
2   3   60  Female  126573              30                 2   
3   4   40   Other   47099              74                 9   
4   5   65  Female  140621              21                 3   

   purchase_frequency preferred_category  last_purchase_amount  
0                  24          Groceries                113.53  
1                  42             Sports                 41.93  
2                  28           Clothing                424.36  
3                   5      Home & Garden                991.93  
4                  25        Electronics                347.08  


In [4]:
#Evaluation
#The dataset has been successfully loaded and the first 5 rows have been examined. This allowed us to get a general idea of ​​the columns in the dataset.

In [5]:
#profile report of the dataset
profile = ProfileReport(data, title="Customer Segmentation Data Profile", explorative=True)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
#Check for missing values
print(data.isnull().sum())

id                      0
age                     0
gender                  0
income                  0
spending_score          0
membership_years        0
purchase_frequency      0
preferred_category      0
last_purchase_amount    0
dtype: int64


In [7]:
#Get a summary of the dataset
print(data.describe())

                id          age         income  spending_score  \
count  1000.000000  1000.000000    1000.000000     1000.000000   
mean    500.500000    43.783000   88500.800000       50.685000   
std     288.819436    15.042213   34230.771122       28.955175   
min       1.000000    18.000000   30004.000000        1.000000   
25%     250.750000    30.000000   57911.750000       26.000000   
50%     500.500000    45.000000   87845.500000       50.000000   
75%     750.250000    57.000000  116110.250000       76.000000   
max    1000.000000    69.000000  149973.000000      100.000000   

       membership_years  purchase_frequency  last_purchase_amount  
count        1000.00000         1000.000000           1000.000000  
mean            5.46900           26.596000            492.348670  
std             2.85573           14.243654            295.744253  
min             1.00000            1.000000             10.400000  
25%             3.00000           15.000000            218.762500

In [8]:
#Check the data types of each column
print(data.dtypes)

id                        int64
age                       int64
gender                   object
income                    int64
spending_score            int64
membership_years          int64
purchase_frequency        int64
preferred_category       object
last_purchase_amount    float64
dtype: object


In [33]:
#Distribution of Age
plt.figure(figsize=(10,6))
sns.histplot(data['age'], kde=True)
plt.title('Age Distribution')
plt.show()

In [10]:
#Gender Count
plt.figure(figsize=(10,6))
sns.countplot(x='gender', data=data)
plt.title('Gender Count')
plt.show()

In [11]:
#Income Distribution
plt.figure(figsize=(10,6))
sns.histplot(data['income'], kde=True)
plt.title('Income Distribution')
plt.show()

In [12]:
#Spending Score vs Income
plt.figure(figsize=(10,6))
sns.scatterplot(x='income', y='spending_score', data=data, hue='gender')
plt.title('Spending Score vs Income')
plt.show()

In [13]:
#Boxplot of Last Purchase Amount by Preferred Category
plt.figure(figsize=(12,8))
sns.boxplot(x='preferred_category', y='last_purchase_amount', data=data)
plt.title('Last Purchase Amount by Preferred Category')
plt.xticks(rotation=45)
plt.show()

In [14]:
# Plot histograms
data[['age', 'income', 'spending_score', 'membership_years', 'purchase_frequency', 'last_purchase_amount']].hist(figsize=(14, 10), bins=30)
plt.tight_layout()
plt.show()

In [15]:
# Plot box plots
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
sns.boxplot(y=data['age'], ax=axs[0, 0])
sns.boxplot(y=data['income'], ax=axs[0, 1])
sns.boxplot(y=data['spending_score'], ax=axs[0, 2])
sns.boxplot(y=data['membership_years'], ax=axs[1, 0])
sns.boxplot(y=data['purchase_frequency'], ax=axs[1, 1])
sns.boxplot(y=data['last_purchase_amount'], ax=axs[1, 2])
plt.tight_layout()
plt.show()

In [16]:
# Plot bar plots for categorical columns
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
sns.countplot(data=data, x='gender', ax=axs[0])
sns.countplot(data=data, x='preferred_category', ax=axs[1])
plt.tight_layout()
plt.show()

In [17]:
# Scatter plots
sns.pairplot(data[['age', 'income', 'spending_score', 'membership_years', 'purchase_frequency', 'last_purchase_amount']])
plt.show()

In [18]:
# Correlation matrix
correlation_matrix = data[['age', 'income', 'spending_score', 'membership_years', 'purchase_frequency', 'last_purchase_amount']].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

In [19]:
# Box plots for numerical vs categorical
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
sns.boxplot(data=data, x='gender', y='income', ax=axs[0])
sns.boxplot(data=data, x='preferred_category', y='last_purchase_amount', ax=axs[1])
plt.tight_layout()
plt.show()

In [20]:
# Standardize data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[['age', 'income', 'spending_score', 'membership_years', 'purchase_frequency', 'last_purchase_amount']])

In [21]:
# PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

In [22]:
# Create DataFrame for PCA results
pca_df = pd.DataFrame(data=pca_result, columns=['PCA1', 'PCA2'])
pca_df['preferred_category'] = data['preferred_category']

In [23]:
# Plot PCA
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='preferred_category', palette='viridis')
plt.title('PCA of Numerical Features')
plt.show()

In [24]:
# Define the current year for recency calculation
current_year = 2024

# Calculate Recency
data['Recency'] = current_year - data['membership_years']

# Use purchase_frequency directly for Frequency
data['Frequency'] = data['purchase_frequency']

# Calculate Monetary value
data['Monetary'] = data['last_purchase_amount'] * data['purchase_frequency']

# Select relevant columns for RFM analysis
rfm_data = data[['id', 'Recency', 'Frequency', 'Monetary']]

In [25]:
# Display the first few rows of the RFM dataset
rfm_data.head()

Unnamed: 0,id,Recency,Frequency,Monetary
0,1,2021,24,2724.72
1,2,2022,42,1761.06
2,3,2022,28,11882.08
3,4,2015,5,4959.65
4,5,2021,25,8677.0


In [26]:
# Define quantiles for scoring
quantiles = rfm_data[['Recency', 'Frequency', 'Monetary']].quantile(q=[0.2, 0.4, 0.6, 0.8]).to_dict()

# Function to assign scores based on quantiles
def r_score(x):
    if x <= quantiles['Recency'][0.2]:
        return 5
    elif x <= quantiles['Recency'][0.4]:
        return 4
    elif x <= quantiles['Recency'][0.6]:
        return 3
    elif x <= quantiles['Recency'][0.8]:
        return 2
    else:
        return 1

def fm_score(x, col):
    if x <= quantiles[col][0.2]:
        return 1
    elif x <= quantiles[col][0.4]:
        return 2
    elif x <= quantiles[col][0.6]:
        return 3
    elif x <= quantiles[col][0.8]:
        return 4
    else:
        return 5

# Assign scores
rfm_data['R_Score'] = rfm_data['Recency'].apply(r_score)
rfm_data['F_Score'] = rfm_data['Frequency'].apply(fm_score, col='Frequency')
rfm_data['M_Score'] = rfm_data['Monetary'].apply(fm_score, col='Monetary')

# Create RFM segment
rfm_data['RFM_Segment'] = rfm_data['R_Score'].astype(str) + rfm_data['F_Score'].astype(str) + rfm_data['M_Score'].astype(str)

# Display the first few rows of the scored RFM dataset
rfm_data.head()

Unnamed: 0,id,Recency,Frequency,Monetary,R_Score,F_Score,M_Score,RFM_Segment
0,1,2021,24,2724.72,2,3,1,231
1,2,2022,42,1761.06,2,5,1,251
2,3,2022,28,11882.08,2,3,3,233
3,4,2015,5,4959.65,5,1,2,512
4,5,2021,25,8677.0,2,3,3,233


In [27]:
#Recency Segments
#R1: Customers who purchased most recently.
#R2: Customers who purchased less recently.
#R3: Customers who purchased a while ago.
#R4: Customers who haven't purchased for the longest period.
#Frequency Segments
#F1: Most frequent buyers.
#F2: Frequent buyers.
#F3: Less frequent buyers.
#F4: Infrequent buyers.
#Monetary Segments
#M1: Highest spenders.
#M2: High spenders.
#M3: Moderate spenders.
#M4: Low spenders.

In [28]:
# Define RFM score conditions
conditions = [
    (rfm_data['R_Score'] == 1) & (rfm_data['F_Score'] == 1) & (rfm_data['M_Score'] == 1),
    (rfm_data['R_Score'].isin([1, 2])) & (rfm_data['F_Score'].isin([1, 2])) & (rfm_data['M_Score'].isin([1, 2])),
    (rfm_data['R_Score'].isin([3, 4])) & (rfm_data['F_Score'].isin([2, 3])) & (rfm_data['M_Score'].isin([1, 2])),
    (rfm_data['R_Score'].isin([3, 4])) & (rfm_data['F_Score'].isin([3, 4])) & (rfm_data['M_Score'].isin([3, 4]))
]

# Define segment names
segment_names = ['Champions', 'Loyal Customers', 'At Risk', 'Need Attention']

# Create a new column for customer segments
data['Segment'] = np.select(conditions, segment_names, default='Other')

# Display the dataframe with the new segment column
data.head()

Unnamed: 0,id,age,gender,income,spending_score,membership_years,purchase_frequency,preferred_category,last_purchase_amount,Recency,Frequency,Monetary,Segment
0,1,38,Female,99342,90,3,24,Groceries,113.53,2021,24,2724.72,Other
1,2,21,Female,78852,60,2,42,Sports,41.93,2022,42,1761.06,Other
2,3,60,Female,126573,30,2,28,Clothing,424.36,2022,28,11882.08,Other
3,4,40,Other,47099,74,9,5,Home & Garden,991.93,2015,5,4959.65,Other
4,5,65,Female,140621,21,3,25,Electronics,347.08,2021,25,8677.0,Other


In [29]:
# check unique vlaue of Segmentation
data['Segment'].unique()

array(['Other', 'Need Attention', 'Loyal Customers', 'At Risk',
       'Champions'], dtype=object)

In [30]:
# Plot the segmentation

# Count the number of customers in each segment
segment_counts = data['Segment'].value_counts()

# Plot the segments
plt.figure(figsize=(10, 6))
sns.barplot(x=segment_counts.index, y=segment_counts.values)
plt.xlabel('Customer Segments')
plt.ylabel('Number of Customers')
plt.title('Customer Segmentation based on RFM Scores')
plt.xticks(rotation=45)
plt.show()

In [31]:
 #Champions:

#a). Characteristics: These customers have the highest RFM scores. They are your best customers, purchasing most frequently, spending the most, and recently.
#b). Recommendations:
   # - Exclusive Offers: Provide exclusive discounts or early access to new products.
   # - Loyalty Programs: Offer special loyalty rewards or VIP programs to make them feel valued.
   # - Personalized Communication: Send personalized thank-you notes and tailor marketing messages to their preferences.
#Loyal Customers:

#a). Characteristics: These customers have high RFM scores but might not be as frequent or recent as champions.
#b). Recommendations:
   # - Upsell/Cross-sell: Suggest complementary products or services to increase average order value.
   # - Regular Engagement: Keep them engaged with regular updates, newsletters, and personalized recommendations.
   # - Loyalty Benefits: Offer loyalty points, discounts on future purchases, or refer-a-friend incentives.
 #At Risk:

#a). Characteristics: These customers had high purchase frequency and monetary value but have not purchased recently.
#b). Recommendations:
    #- Re-engagement Campaigns: Send re-engagement emails with special offers or reminders of what they liked about your products.
    #- Surveys and Feedback: Ask for feedback to understand why they haven't purchased recently and address any concerns.
    #- Personalized Offers: Provide personalized offers or discounts to incentivize them to return.
#Need Attention:

#a). Characteristics: These customers have a moderate frequency and monetary value but are starting to drift away.
#b). Recommendations:
    #- Promotional Offers: Send special promotions or limited-time discounts to encourage repeat purchases.
    #- Engagement Initiatives: Engage them with personalized content, product recommendations, or loyalty incentives.
    #- Customer Service: Ensure excellent customer service to address any potential issues that might have caused them to drift away.
#Other:

#a). Characteristics: These customers have lower RFM scores and may not be very engaged.
#b). Recommendations:
    #- Awareness Campaigns: Increase brand awareness through targeted marketing campaigns and social media engagement.
    #- Incentives for First-Time Buyers: Offer discounts or deals for first-time purchases to encourage initial engagement.
    #- Value Proposition: Highlight the unique value and benefits of your products to attract their attention.

In [32]:
#General Recommendations for RFM Segmentation

#- Data Monitoring: Regularly update and monitor your RFM scores to identify changes in customer behavior and adjust your strategies accordingly.
#- A/B Testing: Experiment with different marketing messages and offers for each segment to find the most effective approach.
#- Customer Journey Mapping: Understand the customer journey for each segment and tailor your communication strategies to meet their specific needs at each stage.
