In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../../datasets/Mall_Customers.csv')
df.tail()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18
199,200,Male,30,137,83


In [3]:
df.isna().sum(), df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


(CustomerID                0
 Gender                    0
 Age                       0
 Annual Income (k$)        0
 Spending Score (1-100)    0
 dtype: int64,
 None)

In [4]:
df['Age'].describe(), df['Annual Income (k$)'].describe(), df['Spending Score (1-100)'].describe(), df['Gender'].value_counts()

(count    200.000000
 mean      38.850000
 std       13.969007
 min       18.000000
 25%       28.750000
 50%       36.000000
 75%       49.000000
 max       70.000000
 Name: Age, dtype: float64,
 count    200.000000
 mean      60.560000
 std       26.264721
 min       15.000000
 25%       41.500000
 50%       61.500000
 75%       78.000000
 max      137.000000
 Name: Annual Income (k$), dtype: float64,
 count    200.000000
 mean      50.200000
 std       25.823522
 min        1.000000
 25%       34.750000
 50%       50.000000
 75%       73.000000
 max       99.000000
 Name: Spending Score (1-100), dtype: float64,
 Female    112
 Male       88
 Name: Gender, dtype: int64)

In [5]:
# Data analysis by gender
df_gender = df.groupby(['Gender']).mean()
df.drop(['CustomerID'], axis=1, inplace=True)

In [6]:
print(df_gender)
colors = ['#FF69B4', '#1E90FF']
# Create a bar chart to compare gender
trace = go.Bar(
    x=df_gender.index,
    y=df_gender['Spending Score (1-100)'],
    name='Spending Score (1-100)',
    marker=dict(color=colors)
)

fig = make_subplots(rows=1, cols=3,
                    subplot_titles=("Annual Income by Gender", "Spending Score by Gender", "Age by Gender"))

# Add a bar chart of annual income by gender to the subplot
df_income = df.groupby(['Gender'])['Annual Income (k$)'].mean().reset_index()
trace_income = go.Bar(
    x=df_income['Gender'],
    y=df_income['Annual Income (k$)'],
    name='Annual Income (k$)',
    marker=dict(color=colors)
)
fig.add_trace(trace_income, row=1, col=1)

# Add a bar chart of spending score by gender to the subplot
trace_spending = go.Bar(
    x=df_gender.index,
    y=df_gender['Spending Score (1-100)'],
    name='Spending Score (1-100)',
    marker=dict(color=colors)
)
fig.add_trace(trace_spending, row=1, col=2)

# Add a bar chart of age by gender to the subplot
df_age = df.groupby(['Gender'])['Age'].mean().reset_index()
trace_age = go.Bar(
    x=df_age['Gender'],
    y=df_age['Age'],
    name='Age',
    marker=dict(color=colors)
)
fig.add_trace(trace_age, row=1, col=3)

# Add a legend to the chart
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.2,
        xanchor="right",
        x=1
    )
)

# Set the y limit for all charts to 100
fig.update_yaxes(range=[0, 100])

fig.show()

        CustomerID        Age  Annual Income (k$)  Spending Score (1-100)
Gender                                                                   
Female   97.562500  38.098214           59.250000               51.526786
Male    104.238636  39.806818           62.227273               48.511364


In [7]:
import plotly.figure_factory as ff

fig = ff.create_distplot([df[df['Gender']=='Female']['Annual Income (k$)'], df[df['Gender']=='Male']['Annual Income (k$)']],
                         group_labels=['Female', 'Male'], bin_size=3, histnorm='probability density', colors=colors)

fig.update_layout(title='Gender vs Annual Income KDE')
fig.show()

fig = ff.create_distplot([df[df['Gender']=='Female']['Spending Score (1-100)'], df[df['Gender']=='Male']['Spending Score (1-100)']],
                         group_labels=['Female', 'Male'], bin_size=3, histnorm='probability density', colors=colors)

fig.update_layout(title='Gender vs Spending Score (1-100)')
fig.show()
fig = ff.create_distplot([df[df['Gender']=='Female']['Age'], df[df['Gender']=='Male']['Age']],
                         group_labels=['Female', 'Male'], bin_size=3, histnorm='probability density', colors=colors)

fig.update_layout(title='Gender vs Age')
fig.show()


In [8]:
fig = make_subplots(rows=3, cols=3, 
                    subplot_titles=('Age vs Age', 'Age vs Annual Income', 'Age vs Spending Score', 
                                    'Annual Income vs Age', 'Annual Income vs Annual Income', 'Annual Income vs Spending Score',
                                    'Spending Score vs Age', 'Spending Score vs Annual Income', 'Spending Score vs Spending Score'))

row_num = 1
col_num = 1

for x_col in ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']:
    for y_col in ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']:
        #if x_col != y_col:
            scatter = go.Scatter(x=df[x_col], y=df[y_col], mode='markers', name=f'{x_col} vs {y_col}')
            trendline = go.Scatter(x=df[x_col], y=np.poly1d(np.polyfit(df[x_col], df[y_col], 1))(df[x_col]), 
                                   mode='lines', line=dict(color='red'), name= 'Trend')
            fig.add_trace(scatter, row=row_num, col=col_num)
            fig.add_trace(trendline, row=row_num, col=col_num)
            fig.update_xaxes(title_text=x_col, row=row_num, col=col_num)
            fig.update_yaxes(title_text=y_col, row=row_num, col=col_num)
            col_num += 1
            if col_num > 3:
                row_num += 1
                col_num = 1

fig.update_layout(height=600, title='Scatter Plots with Trendlines', showlegend=False)
fig.show()


In [9]:
# Correlation heatmap
corr = df.corr()
trace_corr = go.Heatmap(
    x=corr.columns,
    y=corr.columns,
    z=corr.values,
    colorscale='oxy',
    colorbar=dict(title='Correlation'),
    zmin=-1,
    zmax=1,
)

annotations = []
for i, row in enumerate(corr.values):
    for j, value in enumerate(row):
        text = f"{value:.2f}"
        annotations.append(dict(
            x=corr.columns[j],
            y=corr.columns[i],
            text=text,
            showarrow=False,
            font=dict(color='white', size=12)
        ))

fig = make_subplots(rows=1, cols=1)
fig.add_trace(trace_corr, row=1, col=1)
fig.update_layout(
    title='Correlation Heatmap',
    annotations=annotations,
    xaxis=dict(title=dict(text='')),
    yaxis=dict(title=dict(text='')),
)
fig.show()


In [10]:
# Preprocess the data
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
scaler = StandardScaler()
X = scaler.fit_transform(df)

In [11]:
# Elbow method to choose n_clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

trace_elbow = go.Scatter(
    x=list(range(1, 11)),
    y=wcss,
    mode='lines',
    name='WCSS'
)

fig = make_subplots(rows=1, cols=1)
fig.add_trace(trace_elbow, row=1, col=1)
fig.update_layout(
    title='Elbow Method',
    xaxis=dict(title='Number of Clusters'),
    yaxis=dict(title='WCSS')
)
fig.show()

In [12]:
#Create df_cluster
df_cluster = df.copy()
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(X)
df_cluster[f'Clusters'] = pred_y

df_cluster.head()

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100),Clusters
0,1,19,15,39,0
1,1,21,15,81,0
2,0,20,16,6,1
3,0,23,16,77,2
4,0,31,17,40,2


In [13]:
df_cluster['Clusters'].value_counts()

2    57
1    55
3    48
0    40
Name: Clusters, dtype: int64

In [14]:
import plotly.express as px

fig1 = px.scatter(df_cluster, x='Age', y='Annual Income (k$)', color='Clusters')
fig1.update_layout(title='Age vs Annual Income by Cluster')

fig2 = px.scatter(df_cluster, x='Age', y='Spending Score (1-100)', color='Clusters')
fig2.update_layout(title='Age vs Spending Score by Cluster')

fig3 = px.scatter(df_cluster, x='Annual Income (k$)', y='Spending Score (1-100)', color='Clusters')
fig3.update_layout(title='Annual Income vs Spending Score by Cluster')

fig1.show()
fig2.show()
fig3.show()



In [15]:
from sklearn.cluster import KMeans

# Choose the columns to use
X = df[['Spending Score (1-100)', 'Age', 'Annual Income (k$)']]

# Elbow method to choose n_clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

trace_elbow = go.Scatter(
    x=list(range(1, 11)),
    y=wcss,
    mode='lines',
    name='WCSS'
)

fig = make_subplots(rows=1, cols=1)
fig.add_trace(trace_elbow, row=1, col=1)
fig.update_layout(
    title='Elbow Method',
    xaxis=dict(title='Number of Clusters'),
    yaxis=dict(title='WCSS')
)
fig.show()


In [16]:
# Create the KMeans object with 3 clusters
kmeans = KMeans(n_clusters=6)

# Fit the model to the data
kmeans.fit(X)

# Add the predicted cluster labels to the dataframe
df['cluster3d'] = kmeans.predict(X)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=X['Spending Score (1-100)'],
    y=X['Age'],
    z=X['Annual Income (k$)'],
    mode='markers',
    marker=dict(
        color=df['cluster3d'],
        size=5,
        opacity=0.8
    )
)])

# Update axis titles
fig.update_layout(
    scene=dict
    (xaxis_title='Spending Score',
     yaxis_title='Age',
      zaxis_title='Annual Income'
    )
)

fig.show()

In [17]:
new_customer = np.array([[79, 35, 80]])

# Use the predict method to predict the cluster
predicted_cluster = kmeans.predict(new_customer)

print("Predicted cluster:", predicted_cluster[0])

Predicted cluster: 2


In [18]:
new_customer = np.array([[63, 25, 40]])

# Use the predict method to predict the cluster
predicted_cluster = kmeans.predict(new_customer)

print("Predicted cluster:", predicted_cluster[0])

Predicted cluster: 4


In [19]:
new_customer = np.array([[43, 35, 80]])

# Use the predict method to predict the cluster
predicted_cluster = kmeans.predict(new_customer)

print("Predicted cluster:", predicted_cluster[0])

Predicted cluster: 0
