- ID	Unique ID
- Gender	Gender of the customer
- Ever_Married	Marital status of the customer
- Age	Age of the customer
- Graduated	Is the customer a graduate?
- Profession	Profession of the customer
- Work_Experience	Work Experience in years
- Spending_Score	Spending score of the customer
- Family_Size	Number of family members for the customer (including the customer)
- Var_1	Anonymised Category for the customer
- Segmentation	(target) Customer Segment of the customer

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('customer_segmentation.csv')

In [3]:
df.shape

(10695, 11)

In [4]:
df.isnull().sum()

ID                    0
Gender                0
Ever_Married        190
Age                   0
Graduated           102
Profession          162
Work_Experience    1098
Spending_Score        0
Family_Size         448
Var_1               108
Segmentation          0
dtype: int64

In [5]:
df.dropna(axis=0,inplace=True)

In [6]:
df.isnull().sum()

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
Segmentation       0
dtype: int64

In [7]:
df.duplicated().sum()

26

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.drop('ID',axis=1,inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8793 entries, 0 to 10694
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           8793 non-null   object 
 1   Ever_Married     8793 non-null   object 
 2   Age              8793 non-null   int64  
 3   Graduated        8793 non-null   object 
 4   Profession       8793 non-null   object 
 5   Work_Experience  8793 non-null   float64
 6   Spending_Score   8793 non-null   object 
 7   Family_Size      8793 non-null   float64
 8   Var_1            8793 non-null   object 
 9   Segmentation     8793 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 755.6+ KB


In [11]:
df.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
2,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
5,Male,Yes,56,No,Artist,0.0,Average,2.0,Cat_6,C
6,Male,No,32,Yes,Healthcare,1.0,Low,3.0,Cat_6,C


In [12]:
df.drop('Var_1',axis=1,inplace=True)

In [13]:
df.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Segmentation
0,Male,No,22,No,Healthcare,1.0,Low,4.0,D
2,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,B
3,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,B
5,Male,Yes,56,No,Artist,0.0,Average,2.0,C
6,Male,No,32,Yes,Healthcare,1.0,Low,3.0,C


In [14]:
cat_col = [col for col in df.columns if df[col].dtype == 'O']

In [15]:
for col in cat_col:
    print(col,df[col].unique())

Gender ['Male' 'Female']
Ever_Married ['No' 'Yes']
Graduated ['No' 'Yes']
Profession ['Healthcare' 'Engineer' 'Lawyer' 'Artist' 'Doctor' 'Homemaker'
 'Entertainment' 'Marketing' 'Executive']
Spending_Score ['Low' 'High' 'Average']
Segmentation ['D' 'B' 'C' 'A']


In [16]:
a = ['Gender', 'Ever_Married', 'Graduated', 'Profession']
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in a:
    df[col] = le.fit_transform(df[col])

In [17]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[['Low', 'Average', 'High']])
df['Spending_Score'] = oe.fit_transform(df[['Spending_Score']])

In [18]:
df['Segmentation'].unique()

array(['D', 'B', 'C', 'A'], dtype=object)

In [19]:
oe = OrdinalEncoder(categories=[['A', 'B', 'C', 'D']])
df['Segmentation'] = oe.fit_transform(df[['Segmentation']])

In [20]:
df.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Segmentation
0,1,0,22,0,5,1.0,0.0,4.0,3.0
2,0,1,67,1,2,1.0,0.0,1.0,1.0
3,1,1,67,1,7,0.0,2.0,2.0,1.0
5,1,1,56,0,0,0.0,1.0,2.0,2.0
6,1,0,32,1,5,1.0,0.0,3.0,2.0


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8793 entries, 0 to 10694
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           8793 non-null   int64  
 1   Ever_Married     8793 non-null   int64  
 2   Age              8793 non-null   int64  
 3   Graduated        8793 non-null   int64  
 4   Profession       8793 non-null   int64  
 5   Work_Experience  8793 non-null   float64
 6   Spending_Score   8793 non-null   float64
 7   Family_Size      8793 non-null   float64
 8   Segmentation     8793 non-null   float64
dtypes: float64(4), int64(5)
memory usage: 687.0 KB


In [22]:
X = df.drop('Segmentation',axis=1)
y = df['Segmentation']

In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=12)

In [24]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()

In [25]:
classifier.fit(X_train, y_train)

In [27]:
y_pred = classifier.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score,classification_report

print(accuracy_score(y_pred,y_test))
print(classification_report(y_pred,y_test))

0.3746446844798181
              precision    recall  f1-score   support

         0.0       0.35      0.32      0.34       507
         1.0       0.29      0.31      0.30       395
         2.0       0.40      0.37      0.38       410
         3.0       0.45      0.49      0.47       447

    accuracy                           0.37      1759
   macro avg       0.37      0.37      0.37      1759
weighted avg       0.37      0.37      0.37      1759



In [38]:
parameter = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20, 25, 30],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 8, 10],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'splitter': ['best', 'random']
}

In [39]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(classifier, param_grid=parameter, cv = 5, scoring = "accuracy")

In [40]:
grid.fit(X_train, y_train)

In [41]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'splitter': 'best'}

In [42]:
grid.best_score_

0.47696570995864

In [43]:
y_pred_grid = grid.predict(X_test)

In [44]:
print(accuracy_score(y_pred_grid, y_test))
print(classification_report(y_pred_grid, y_test))

0.4752700397953383
              precision    recall  f1-score   support

         0.0       0.47      0.39      0.43       555
         1.0       0.25      0.40      0.31       266
         2.0       0.54      0.47      0.50       434
         3.0       0.63      0.61      0.62       504

    accuracy                           0.48      1759
   macro avg       0.47      0.47      0.46      1759
weighted avg       0.50      0.48      0.48      1759

