In [None]:
 Customer Segmentation Using Unsupervised Learning
In this project, machine learning clustering techniques are used to segment customers based on
their purchasing behavior, income level, and spending patterns. The project applies K-Means
clustering to identify distinct customer groups and uses data visualization to interpret cluster
characteristics. This helps businesses understand customer behavior and design targeted
marketing strategies

In [55]:
import pandas as pd
import numpy as np

data = pd.read_csv('retail_data.csv')

In [56]:
data.columns

Index(['customer_id', 'age', 'gender', 'income_bracket', 'loyalty_program',
       'membership_years', 'churned', 'marital_status', 'number_of_children',
       'education_level', 'occupation', 'transaction_id', 'transaction_date',
       'product_id', 'product_category', 'quantity', 'unit_price',
       'discount_applied', 'payment_method', 'store_location',
       'transaction_hour', 'day_of_week', 'week_of_year', 'month_of_year',
       'avg_purchase_value', 'purchase_frequency', 'last_purchase_date',
       'avg_discount_used', 'preferred_store', 'online_purchases',
       'in_store_purchases', 'avg_items_per_transaction',
       'avg_transaction_value', 'total_returned_items', 'total_returned_value',
       'total_sales', 'total_transactions', 'total_items_purchased',
       'total_discounts_received', 'avg_spent_per_category',
       'max_single_purchase_value', 'min_single_purchase_value',
       'product_name', 'product_brand', 'product_rating',
       'product_review_count', '

In [57]:
new_data = data[['age','gender','income_bracket','marital_status','product_category','discount_applied'
                ,'day_of_week','avg_purchase_value','purchase_frequency']].copy()

In [58]:
new_data.head(5)

Unnamed: 0,age,gender,income_bracket,marital_status,product_category,discount_applied,day_of_week,avg_purchase_value,purchase_frequency
0,56,Other,High,Divorced,Electronics,0.5,Wednesday,411.13,Weekly
1,69,Female,Medium,Married,Groceries,0.32,Friday,268.71,Daily
2,46,Female,Low,Married,Toys,0.35,Saturday,246.79,Weekly
3,32,Female,Low,Divorced,Toys,0.1,Friday,178.92,Weekly
4,60,Female,Low,Divorced,Clothing,0.17,Monday,214.06,Yearly


In [75]:
new_data['marital_status'].unique()

array(['Divorced', 'Married', 'Single'], dtype=object)

In [60]:
new_data.isnull().sum()

age                   0
gender                0
income_bracket        0
marital_status        0
product_category      0
discount_applied      0
day_of_week           0
avg_purchase_value    0
purchase_frequency    0
dtype: int64

In [61]:
new_data['purchase_frequency'] = new_data['purchase_frequency'].map({
    'Weekly' : 52,
    'Daily' : 365,
    'Yearly' : 1,
    'Monthly' : 12
})

In [62]:
new_data.head(5)

Unnamed: 0,age,gender,income_bracket,marital_status,product_category,discount_applied,day_of_week,avg_purchase_value,purchase_frequency
0,56,Other,High,Divorced,Electronics,0.5,Wednesday,411.13,52
1,69,Female,Medium,Married,Groceries,0.32,Friday,268.71,365
2,46,Female,Low,Married,Toys,0.35,Saturday,246.79,52
3,32,Female,Low,Divorced,Toys,0.1,Friday,178.92,52
4,60,Female,Low,Divorced,Clothing,0.17,Monday,214.06,1


In [76]:
new_data['day_of_week'].unique()

array(['Wednesday', 'Friday', 'Saturday', 'Monday', 'Thursday', 'Sunday',
       'Tuesday'], dtype=object)

In [65]:
new_data.isnull().sum()

age                   0
gender                0
income_bracket        0
marital_status        0
product_category      0
discount_applied      0
day_of_week           0
avg_purchase_value    0
purchase_frequency    0
dtype: int64

In [66]:
new_data['income_bracket'].unique()

array(['High', 'Medium', 'Low'], dtype=object)

In [73]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

data_encoding = ColumnTransformer(transformers = [
    ('tnp1',OneHotEncoder(sparse_output=False,drop='first'),['gender','marital_status','product_category','day_of_week']),
    ('tnp2',OrdinalEncoder(categories=[['Low', 'Medium', 'High']]),['income_bracket'])
],remainder='passthrough')



In [77]:
from sklearn.preprocessing import MinMaxScaler

data_scale = ColumnTransformer(transformers = [
    ('scale',MinMaxScaler(),slice(0,19))
])

In [79]:
from sklearn.cluster import KMeans

In [80]:
data_model =  KMeans(n_clusters=4, random_state=42)

In [84]:
# creating pipe line 
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('data_encoding',data_encoding),
    ('data_scale',data_scale),
    ('data_model',data_model)
])

In [87]:
pipe.fit(new_data)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [89]:
new_data['group'] = pipe.fit_predict(new_data)

In [90]:
# export 
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))
  