In [313]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [314]:
train=pd.read_csv('customer_segmentation_train.csv')
test=pd.read_csv('customer_segmentation_test.csv')

In [315]:
train.shape

(8068, 11)

In [316]:
test.shape

(2627, 11)

In [317]:
train.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [318]:
id_train=train['ID']
train.drop(columns=['ID'],inplace=True)
id_test=test['ID']
test.drop(columns=['ID'],inplace=True)

In [319]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           8068 non-null   object 
 1   Ever_Married     7928 non-null   object 
 2   Age              8068 non-null   int64  
 3   Graduated        7990 non-null   object 
 4   Profession       7944 non-null   object 
 5   Work_Experience  7239 non-null   float64
 6   Spending_Score   8068 non-null   object 
 7   Family_Size      7733 non-null   float64
 8   Var_1            7992 non-null   object 
 9   Segmentation     8068 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 630.4+ KB


In [320]:
train.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [321]:
train.isnull().sum()

Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [322]:
test.isnull().sum()

Gender               0
Ever_Married        50
Age                  0
Graduated           24
Profession          38
Work_Experience    269
Spending_Score       0
Family_Size        113
Var_1               32
Segmentation         0
dtype: int64

In [323]:
train['Ever_Married'].value_counts()

Yes    4643
No     3285
Name: Ever_Married, dtype: int64

In [324]:
train['Graduated'].value_counts()

Yes    4968
No     3022
Name: Graduated, dtype: int64

In [325]:
train['Profession'].value_counts()

Artist           2516
Healthcare       1332
Entertainment     949
Engineer          699
Doctor            688
Lawyer            623
Executive         599
Marketing         292
Homemaker         246
Name: Profession, dtype: int64

In [326]:
train['Work_Experience'].value_counts()

1.0     2354
0.0     2318
9.0      474
8.0      463
2.0      286
3.0      255
4.0      253
6.0      204
7.0      196
5.0      194
10.0      53
11.0      50
12.0      48
13.0      46
14.0      45
Name: Work_Experience, dtype: int64

In [327]:
train['Family_Size'].value_counts()

2.0    2390
3.0    1497
1.0    1453
4.0    1379
5.0     612
6.0     212
7.0      96
8.0      50
9.0      44
Name: Family_Size, dtype: int64

In [328]:
train['Var_1'].value_counts()

Cat_6    5238
Cat_4    1089
Cat_3     822
Cat_2     422
Cat_7     203
Cat_1     133
Cat_5      85
Name: Var_1, dtype: int64

In [329]:
train.columns

Index(['Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1',
       'Segmentation'],
      dtype='object')

In [330]:
# Fill na Train
for col in train.columns:
    if train[col].isnull().sum():
        if train[col].dtype=='O':
            train[col]=train[col].fillna(train[col].value_counts().index[0])
        else:
            train[col]=train[col].fillna(train[col].mean())

In [331]:
#Fill na test
for col in test.columns:
    if test[col].isnull().sum():
        if test[col].dtype=='O':
            test[col]=test[col].fillna(test[col].value_counts().index[0])
        else:
            test[col]=test[col].fillna(test[col].mean())

In [332]:
train.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,Female,Yes,38,Yes,Engineer,2.641663,Average,3.0,Cat_4,A
2,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,Female,Yes,40,Yes,Entertainment,2.641663,High,6.0,Cat_6,A


In [333]:
y_train=train['Segmentation']
y_test=test['Segmentation']
train.drop(columns=['Segmentation'],inplace=True)
test.drop(columns=['Segmentation'],inplace=True)

In [334]:
# Scaling train
one_hot=OneHotEncoder()
scaler=StandardScaler()
le=LabelEncoder()
for col in train.columns:
    if train[col].dtype!='O':
        train[col]=scaler.fit_transform(np.array(train[col]).reshape(-1,1))
        print(col)
    else:
        train[col]=le.fit_transform(np.array(train[col]).reshape(-1,1))
        print(col)    

Gender
Ever_Married
Age
Graduated
Profession
Work_Experience
Spending_Score
Family_Size
Var_1


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [335]:
# Scaling test
one_hot=OneHotEncoder()
scaler=StandardScaler()
le=LabelEncoder()
for col in test.columns:
    if test[col].dtype!='O':
        test[col]=scaler.fit_transform(np.array(test[col]).reshape(-1,1))
        print(col)
    else:
        test[col]=le.fit_transform(np.array(test[col]).reshape(-1,1))
        print(col)

Gender
Ever_Married
Age
Graduated
Profession
Work_Experience
Spending_Score
Family_Size
Var_1


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [336]:
train.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,1,0,-1.284623,0,5,-0.508763,2,0.767001,3
1,0,1,-0.327151,1,2,0.0,0,0.099972,3
2,0,1,1.408268,1,2,-0.508763,2,-1.234085,5
3,1,1,1.408268,1,7,-0.818671,1,-0.567056,5
4,0,1,-0.207467,1,3,0.0,1,2.101059,5


In [337]:
# One hot enc and label enc
X_train=one_hot.fit_transform(train)
X_test=one_hot.fit_transform(test)
y_train=le.fit_transform(y_train)
y_test=le.fit_transform(y_test)

In [338]:
# KNN
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
knn_preds=knn.predict(X_test)
accuracy_score(y_test,knn_preds)

0.3090978302245908

In [341]:
# Decision tree
dec=DecisionTreeClassifier()
dec.fit(X_train,y_train)
dec_preds=dec.predict(X_test)
accuracy_score(y_test,dec_preds)

0.29996193376475067

In [342]:
# SVM
svc=SVC(kernel=a)
svc.fit(X_train,y_train)
svc_preds=svc.predict(X_test)
accuracy_score(y_test,svc_preds)

0.3269889607917777