In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering

In [3]:
data = pd.read_csv("Mall_Customers.csv")
data.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [6]:
data.sample(3)

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
59,60,Male,53,46,46
81,82,Male,38,54,55
195,196,Female,35,120,79


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


We Observe that the gender is a categorical column which we need to handle.


In [8]:
# check the null values
data.isnull().sum().sum()

0

No null values

In [9]:
# cheking for duplicates
data.duplicated().sum()

0

No Duplicates

In [11]:
# basic statistical description
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CustomerID,200.0,100.5,57.879185,1.0,50.75,100.5,150.25,200.0
Age,200.0,38.85,13.969007,18.0,28.75,36.0,49.0,70.0
Annual Income (k$),200.0,60.56,26.264721,15.0,41.5,61.5,78.0,137.0
Spending Score (1-100),200.0,50.2,25.823522,1.0,34.75,50.0,73.0,99.0


In [12]:
# Lets check the correlations
data.corr()

ValueError: could not convert string to float: 'Male'

In [13]:
# Encoding the gender column
# 1. Label Encoding
from sklearn.preprocessing import LabelEncoder
# for binary encoding, male=0 and female=1

label_encoder = LabelEncoder()
data['Gender']=label_encoder.fit_transform(data['Gender'])

In [15]:
data['Gender'].value_counts()

Gender
0    112
1     88
Name: count, dtype: int64

In [16]:
# if there are more than 2 categories, use One-Hot Encoding
# df = pd.get_dummies(df, columns=['Gender'], prefix = ['Gender'])

In [17]:
data.corr()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
CustomerID,1.0,0.0574,-0.026763,0.977548,0.013835
Gender,0.0574,1.0,0.060867,0.05641,-0.058109
Age,-0.026763,0.060867,1.0,-0.012398,-0.327227
Annual Income (k$),0.977548,0.05641,-0.012398,1.0,0.009903
Spending Score (1-100),0.013835,-0.058109,-0.327227,0.009903,1.0


Visualize the "Gender column"