In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

**Data Cleaning**

In [None]:
# Load the mall-customers dataset into a DataFrame.
# A raw string keeps the Windows backslashes literal: in a normal string, sequences such as
# '\g' or '\T' are invalid escapes and trigger a SyntaxWarning on recent Python versions.
# NOTE(review): this is an absolute, machine-specific path - point it at your local copy.
DATA_PATH = r'E:\githubb\TungProject\CustomerSegment\Data\Mall_Customers.csv'
df = pd.read_csv(DATA_PATH)



In [None]:
# Dimensions of the loaded DataFrame as (number of rows, number of columns)
df.shape


In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Preview the last five rows
df.tail(n=5)

In [None]:
# Descriptive summary statistics for df (pandas default settings)
df.describe() 

In [None]:
# Count of missing values in each column
df.isna().sum()

**Data Analysis**

In [None]:
# Histogram (with KDE overlay) of each numeric feature, drawn side by side.
# Each entry: (column name, number of bins, bar colour, panel title).
feature_panels = [
    ('Age', 26, 'g', 'Distribution of Age'),
    ('Annual Income (k$)', 14, 'b', 'Distribution of Annual Income'),
    ('Spending Score (1-100)', 20, 'r', 'Distribution of Spending Score (1-100)'),
]

# One figure with 1 row x 3 columns. plt.subplots() creates the figure itself, so a
# separate plt.figure() call is not needed (it would only leave an extra, empty figure behind).
fig, axes = plt.subplots(1, 3, figsize=(16, 4))
for ax, (column, n_bins, colour, title) in zip(axes, feature_panels):
    sns.histplot(df[column], kde=True, bins=n_bins, ax=ax, color=colour)
    ax.set_title(title)
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
plt.show()

In [None]:
# Number of customers per gender, drawn as horizontal bars on a wide figure
plt.figure(figsize=(15, 5))
sns.countplot(
    data=df,
    y='Gender',
    hue='Gender',      # colour each bar by its own category
    palette='Set2',
    legend=False,      # the y tick labels already name the categories
)
plt.show()

**Distributions by Gender**

In [None]:
# Compare the Age, Annual Income and Spending Score distributions between genders.
# Each entry: (column name, number of bins, title prefix) -> one row of the grid.
features = [
    ('Age', 26, 'Age Distribution'),
    ('Annual Income (k$)', 14, 'Annual Income Distribution'),
    ('Spending Score (1-100)', 20, 'Spending Score Distribution'),
]
# Each entry: (value in the 'Gender' column, bar colour) -> one column of the grid.
genders = [('Male', 'g'), ('Female', 'r')]

# 3 rows x 2 columns. plt.subplots() creates the figure itself, so a separate
# plt.figure() call is not needed (it would only leave an extra, empty figure behind).
fig, axes = plt.subplots(len(features), len(genders), figsize=(8, 8))
for row, (column, n_bins, title) in enumerate(features):
    for col, (gender, colour) in enumerate(genders):
        ax = axes[row, col]
        # Values of this feature for the customers of the current gender only
        values = df.loc[df['Gender'] == gender, column]
        sns.histplot(values, kde=True, color=colour, ax=ax, bins=n_bins)
        ax.set_title(f'{title} ({gender})')
        ax.set_xlabel(column)
        ax.set_ylabel('Count')

# Adjust the layout so titles and axis labels of neighbouring panels do not overlap
plt.tight_layout()
plt.show()

**Clustering using K-Means**

In [None]:
# Elbow method for the (Age, Spending Score) feature pair:
# fit K-Means for k = 1..10 and record each model's inertia_ (within-cluster sum of squares).
k_values = range(1, 11)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(df[['Age', 'Spending Score (1-100)']])
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(15, 5))
plt.grid()
plt.plot(k_values, wcss, marker='8', color='blue')
plt.xticks(list(k_values))  # one tick per candidate k
plt.title('Elbow curve: Age and Spending Score (1-100)')
plt.xlabel('K Value')
plt.ylabel('WCSS')
# Render explicitly so the cell does not echo the Text(...) object returned by plt.ylabel
plt.show()




**Let's try with k = 4**

In [None]:
# K = 4 on the (Age, Spending Score) pair
X1 = df.loc[:, ['Age', 'Spending Score (1-100)']].values
# Fix the seed (same value as the elbow scan for this feature pair) so the cluster ids and
# centroids are reproducible across kernel restarts; unseeded, each run may differ.
kmeans = KMeans(n_clusters=4, random_state=42)
label = kmeans.fit_predict(X1)
print(label)


In [None]:
# Centroid coordinates of the most recently fitted `kmeans`, one row per cluster.
# For the K=4 fit on X1 the columns are, in order: Age, Spending Score (1-100).
print (kmeans.cluster_centers_)

In [None]:
# Customers in the Age / Spending Score plane, coloured by their assigned cluster
ages, scores = X1[:, 0], X1[:, 1]
centers = kmeans.cluster_centers_
plt.scatter(ages, scores, c=kmeans.labels_, cmap='rainbow')
# Overlay the cluster centroids in black
plt.scatter(centers[:, 0], centers[:, 1], color='black')
plt.xlabel('Age')
plt.ylabel('Spending Score (1-100)')
plt.show()

In [None]:
# Elbow method for the (Annual Income, Spending Score) feature pair:
# fit K-Means for k = 1..10 and record each model's inertia_ (within-cluster sum of squares).
X2 = df.loc[:, ['Annual Income (k$)', 'Spending Score (1-100)']].values
k_values = range(1, 11)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X2)  # same two columns as before, now taken from X2 instead of re-slicing df
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(15, 5))
plt.grid()
plt.plot(k_values, wcss, marker='8', color='blue')
plt.xticks(list(k_values))  # one tick per candidate k
plt.title('Elbow curve: Annual Income (k$) and Spending Score (1-100)')
plt.xlabel('K Value')
plt.ylabel('WCSS')
# Render explicitly so the cell does not echo the Text(...) object returned by plt.ylabel
plt.show()


**Let's try with k = 5**

In [None]:
# K = 5 on the (Annual Income, Spending Score) pair held in X2
# Fix the seed (same value as the elbow scan for this feature pair) so the cluster ids and
# centroids are reproducible across kernel restarts; unseeded, each run may differ.
kmeans = KMeans(n_clusters=5, random_state=42)
label = kmeans.fit_predict(X2)
print(label)

In [None]:
# Centroid coordinates of the most recently fitted `kmeans`, one row per cluster.
# For the K=5 fit on X2 the columns are, in order: Annual Income (k$), Spending Score (1-100).
print (kmeans.cluster_centers_)

In [None]:
# Customers in the Annual Income / Spending Score plane, coloured by their assigned cluster
incomes, scores = X2[:, 0], X2[:, 1]
centers = kmeans.cluster_centers_
plt.scatter(incomes, scores, c=kmeans.labels_, cmap='rainbow')
# Overlay the cluster centroids in black
plt.scatter(centers[:, 0], centers[:, 1], color='black')
plt.xlabel('Annual Income (k$)')  # fixed: the label previously had a stray trailing 'e'
plt.ylabel('Spending Score (1-100)')
plt.show()



In [None]:
# Elbow method for all three features (Age, Annual Income, Spending Score):
# fit K-Means for k = 1..10 and record each model's inertia_ (within-cluster sum of squares).
k_values = range(1, 11)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)  # seed was written as `00`, i.e. zero
    kmeans.fit(df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']])
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(15, 5))
plt.grid()
plt.plot(k_values, wcss, marker='8', color='green')
plt.xticks(list(k_values))  # one tick per candidate k
plt.title('Elbow curve: Age, Annual Income (k$) and Spending Score (1-100)')
plt.xlabel('K Value')
plt.ylabel('WCSS')
# Render explicitly so the cell does not echo the Text(...) object returned by plt.ylabel
plt.show()



In [None]:
# K = 6 on all three features (Age, Annual Income, Spending Score)
features_3d = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
# Fix the seed (same value as the elbow scan for these features) so the cluster ids and
# centroids are reproducible across kernel restarts; unseeded, each run may differ.
kmeans = KMeans(n_clusters=6, random_state=0)
label = kmeans.fit_predict(features_3d)
print(label)

In [None]:
# Centroid coordinates of the most recently fitted `kmeans`, one row per cluster.
# For the K=6 fit the columns are, in order: Age, Annual Income (k$), Spending Score (1-100).
print (kmeans.cluster_centers_)