In [1]:
import pandas as pd
import plotly.express as pltx
import seaborn as sns
import numpy as np


In [2]:
df = pd.read_csv("../Data/customer_dataset.csv")
df.head()

Unnamed: 0,Customer_ID,Gender,Age,City,Membership_Type,Total_Spend,Items_Purchased,Average_Rating,Discount_Applied,Days_Since_Last_Purchase,Satisfaction_Level
0,101,Female,29,New York,Gold,1120.2,14,4.6,True,25,Satisfied
1,102,Male,34,Los Angeles,Silver,780.5,11,4.1,False,18,Neutral
2,103,Female,43,Chicago,Bronze,510.75,9,3.4,True,42,Unsatisfied
3,104,Male,30,San Francisco,Gold,1480.3,19,4.7,False,12,Satisfied
4,105,Male,27,Miami,Bronze,720.4,13,4.0,True,55,Unsatisfied


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer_ID               350 non-null    int64  
 1   Gender                    350 non-null    object 
 2   Age                       350 non-null    int64  
 3   City                      350 non-null    object 
 4   Membership_Type           350 non-null    object 
 5   Total_Spend               350 non-null    float64
 6   Items_Purchased           350 non-null    int64  
 7   Average_Rating            350 non-null    float64
 8   Discount_Applied          350 non-null    bool   
 9   Days_Since_Last_Purchase  350 non-null    int64  
 10  Satisfaction_Level        349 non-null    object 
dtypes: bool(1), float64(2), int64(4), object(4)
memory usage: 27.8+ KB


In [4]:
df.dtypes


Customer_ID                   int64
Gender                       object
Age                           int64
City                         object
Membership_Type              object
Total_Spend                 float64
Items_Purchased               int64
Average_Rating              float64
Discount_Applied               bool
Days_Since_Last_Purchase      int64
Satisfaction_Level           object
dtype: object

In [5]:
df.isnull().sum()


Customer_ID                 0
Gender                      0
Age                         0
City                        0
Membership_Type             0
Total_Spend                 0
Items_Purchased             0
Average_Rating              0
Discount_Applied            0
Days_Since_Last_Purchase    0
Satisfaction_Level          1
dtype: int64

In [6]:

df.describe()

Unnamed: 0,Customer_ID,Age,Total_Spend,Items_Purchased,Average_Rating,Days_Since_Last_Purchase
count,350.0,350.0,350.0,350.0,350.0,350.0
mean,275.5,33.597143,845.381714,12.6,4.019143,26.588571
std,101.180532,4.870882,362.058695,4.155984,0.580539,13.440813
min,101.0,26.0,410.8,7.0,3.0,9.0
25%,188.25,30.0,502.0,9.0,3.5,15.0
50%,275.5,32.5,775.2,12.0,4.1,23.0
75%,362.75,37.0,1160.6,15.0,4.5,38.0
max,450.0,43.0,1520.1,21.0,4.9,63.0


In [7]:
df.drop(columns=['Customer_ID'], inplace=True)

df.head()

Unnamed: 0,Gender,Age,City,Membership_Type,Total_Spend,Items_Purchased,Average_Rating,Discount_Applied,Days_Since_Last_Purchase,Satisfaction_Level
0,Female,29,New York,Gold,1120.2,14,4.6,True,25,Satisfied
1,Male,34,Los Angeles,Silver,780.5,11,4.1,False,18,Neutral
2,Female,43,Chicago,Bronze,510.75,9,3.4,True,42,Unsatisfied
3,Male,30,San Francisco,Gold,1480.3,19,4.7,False,12,Satisfied
4,Male,27,Miami,Bronze,720.4,13,4.0,True,55,Unsatisfied


In [8]:
age_bins = [20, 25, 30, 35, 40, 45, float('inf')]
age_labels = ['20-24', '25-29', '30-34', '35-39', '40-44', '45+']

df['AgeBin'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

age_counts = df['AgeBin'].value_counts().sort_index().reset_index()
age_counts.columns = ['Age Group', 'Age Group Count']

plot = pltx.bar(
    age_counts,
    x='Age Group',
    y='Age Group Count',
    title="Age Group Distribution of Customers",
    text='Age Group Count',
    color='Age Group'
)

plot.update_layout(
    xaxis_title="Age Group",
    yaxis_title="Age Group Count",
    xaxis={'categoryorder': 'array', 'categoryarray': age_labels}
)

# Show plot
plot.show()

In [9]:
age_gender_dist = df.groupby(['Gender', 'AgeBin'])['Gender'].size().reset_index(name='Count')

fig = pltx.line(
    age_gender_dist, 
    x="AgeBin", 
    y="Count", 
    color="Gender", 
    markers=True, 
    title="Age Distribution By Gender"
)

fig.update_xaxes(title="Age Group")
fig.update_yaxes(title="Count")

fig.show()





In [10]:
df['Age'].describe()

count    350.000000
mean      33.597143
std        4.870882
min       26.000000
25%       30.000000
50%       32.500000
75%       37.000000
max       43.000000
Name: Age, dtype: float64

In [11]:
location_dist = df['City'].value_counts()

fig = pltx.pie(
    names=location_dist.index,
    values=location_dist.values,
    title="Location Distribution",
    hole=0.4,  
    color_discrete_sequence=pltx.colors.qualitative.Set3
)

fig.show()

In [12]:
items_purchased_gender = df.groupby("Gender")["Items_Purchased"].mean().reset_index()

fig = pltx.bar(
    items_purchased_gender,
    x="Gender",
    y="Items_Purchased",
    title="Average Items Purchased by Gender",
    color="Gender",
    text_auto=True
)

fig.update_xaxes(title="Gender")
fig.update_yaxes(title="Average Items Buy")
fig.show()

In [13]:
rating_vs_items = df.groupby("Average_Rating")["Items_Purchased"].mean().reset_index()

# Create bar chart
fig = pltx.bar(
    rating_vs_items,
    x="Average_Rating",
    y="Items_Purchased",
    title="Average Items Purchased by Rating",
    labels={"Average_Rating": "Average Rating", "Items_Purchased": "Average Items Purchased"},
    text_auto=True,
    color="Items_Purchased",
    color_continuous_scale="Blues"
)

fig.show()

In [14]:
gender_items_avg = df.groupby("Gender")["Items_Purchased"].mean().reset_index()

# Create pie chart
fig = pltx.pie(
    gender_items_avg, 
    names="Gender", 
    values="Items_Purchased", 
    title="Average Items Purchased by Gender",
    color_discrete_sequence=pltx.colors.qualitative.Set3
)

fig.show()

In [15]:
if df["Satisfaction_Level"].dtype == "object":
    df["Satisfaction_Level"] = df["Satisfaction_Level"].astype("category").cat.codes

# Group by City and calculate the average Satisfaction Level
city_vs_satisfaction = df.groupby("City")["Satisfaction_Level"].mean().reset_index()

# Create pie chart
fig = pltx.pie(
    city_vs_satisfaction, 
    names="City", 
    values="Satisfaction_Level", 
    title="Average Satisfaction Level by City",
    hole=0.4,  # Donut style for better visualization
    color_discrete_sequence=pltx.colors.qualitative.Set3
)

fig.show()

In [16]:
city_vs_items = df.groupby("City")["Items_Purchased"].sum().reset_index()

# Create bar chart
fig = pltx.bar(
    city_vs_items,
    x="City",
    y="Items_Purchased",
    title="Total Items Purchased by City",
    labels={"City": "City", "Items_Purchased": "Total Items Purchased"},
    text_auto=True,
    color="Items_Purchased",
    color_continuous_scale="Blues"
)

fig.update_xaxes(categoryorder="total descending")  # Sort cities by total items purchased

fig.show()

In [17]:
gender_membership = df.groupby(["Gender", "Membership_Type"]).size().reset_index(name="Count")

# Create a bar chart
fig = pltx.bar(
    gender_membership, 
    x="Gender", 
    y="Count", 
    color="Membership_Type", 
    title="Gender vs Membership Type Distribution",
    barmode="group",
    labels={"Count": "Number of Customers", "Gender": "Gender"}
)

fig.show()

In [18]:
gender_avg_rating = df.groupby("Gender")["Average_Rating"].mean().reset_index()

# Create bar chart
fig = pltx.bar(
    gender_avg_rating,
    x="Gender",
    y="Average_Rating",
    title="Average Rating by Gender",
    labels={"Average_Rating": "Average Rating", "Gender": "Gender"},
    text_auto=True,
    color="Gender",
    color_discrete_sequence=pltx.colors.qualitative.Set3
)

fig.show()

In [19]:
city_membership = df.groupby(["City", "Membership_Type"]).size().reset_index(name="Count")

# Create a bar chart
fig = pltx.bar(
    city_membership, 
    x="City", 
    y="Count", 
    color="Membership_Type", 
    title="Membership Type Distribution by City",
    barmode="group",
    labels={"Count": "Number of Customers", "City": "City"}
)

fig.show()

In [20]:
city_days_purchase = df.groupby("City")["Days_Since_Last_Purchase"].mean().reset_index()

# Create bar chart
fig = pltx.bar(
    city_days_purchase,
    x="City",
    y="Days_Since_Last_Purchase",
    title="Average Days Since Last Purchase by City",
    labels={"City": "City", "Days_Since_Last_Purchase": "Avg Days Since Last Purchase"},
    color="Days_Since_Last_Purchase",
    color_continuous_scale="Blues"
)

fig.update_xaxes(categoryorder="total descending")  # Sort cities by avg days since last purchase

fig.show()

In [21]:
gender_items_purchased = df.groupby("Gender")["Items_Purchased"].sum().reset_index()
fig = pltx.pie(
    gender_items_purchased,
    names="Gender",
    values="Items_Purchased",
    title="Percentage of Items Purchased by Gender",
    color_discrete_sequence=pltx.colors.qualitative.Set3
)

fig.show()

In [22]:
membership_spend = df.groupby("Membership_Type")["Total_Spend"].sum().reset_index()

# Create bar chart
fig = pltx.bar(
    membership_spend,
    x="Membership_Type",
    y="Total_Spend",
    title="Total Spend by Membership Type",
    labels={"Membership_Type": "Membership Type", "Total_Spend": "Total Spend"},
    text_auto=True,
    color="Total_Spend",
    color_continuous_scale="Blues"
)

fig.show()

In [23]:
df["AgeGroup"] = pd.cut(df["Age"], bins=age_bins, labels=age_labels, right=False)

# Create violin plot
fig = pltx.violin(
    df, 
    x="Membership_Type", 
    y="Age", 
    color="Membership_Type",
    box=True,  
    points="all",
    title="Age Group Distribution by Membership Type",
    labels={"Membership_Type": "Membership Type", "Age": "Age"}
)

fig.show()

In [24]:
membership_age = df.groupby(["Membership_Type", "Age"]).size().reset_index(name="Count")

# Create bar chart
fig = pltx.bar(
    membership_age,
    x="Age",
    y="Count",
    color="Membership_Type",
    title="Distribution of Age by Membership Type",
    labels={"Age": "Age", "Count": "Number of Customers"},
    barmode="group",
    color_discrete_sequence=pltx.colors.qualitative.Set3
)

fig.show()

In [25]:
gender_satisfaction = df.groupby(["Gender", "Satisfaction_Level"]).size().reset_index(name="Count")

# Create bar chart
fig = pltx.bar(
    gender_satisfaction,
    x="Gender",
    y="Count",
    color="Satisfaction_Level",
    title="Satisfaction Level Distribution by Gender",
    labels={"Gender": "Gender", "Count": "Number of Customers", "Satisfaction_Level": "Satisfaction Level"},
    barmode="group",
    color_discrete_sequence=pltx.colors.qualitative.Set3
)

fig.show()

In [26]:
import seaborn as sns
for col in df.select_dtypes(include=['object']).columns:
    df[col] = pd.Categorical(df[col]).codes

# Compute correlation matrix
df_correlation = df.corr(numeric_only=True)

# Create heatmap for correlation matrix
fig = pltx.imshow(
    df_correlation,
    title="Correlation Matrix Heatmap",
    labels=dict(color="Correlation"),
    color_continuous_scale="Viridis"
)

fig.show()

In [27]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = pd.Categorical(df[col]).codes

# Compute correlation matrix
df_correlation = df.corr(numeric_only=True)

# Select correlations with Total Spend
features = df_correlation["Total_Spend"].sort_values(ascending=False).drop("Total_Spend")

# Create bar chart
fig = pltx.bar(
    x=features.index,
    y=features.values,
    title="Feature Correlation with Total Spend",
    color=features.values,
    color_continuous_scale="Cividis"
)

# Add annotations for exact correlation values
for i in range(len(features)):
    fig.add_annotation(
        x=features.index[i],
        y=features.values[i],
        text=f"{features.values[i]:.2f}",
        yshift=-10 if features.values[i] < 0 else 10,
        showarrow=False,
    )

fig.update_layout(xaxis_title="Feature", yaxis_title="Correlation")

fig.show()

In [28]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = pd.Categorical(df[col]).codes

# Compute correlation matrix
df_correlation = df.corr(numeric_only=True)

# Select correlations with Satisfaction Level
features = df_correlation["Satisfaction_Level"].sort_values(ascending=True).drop("Satisfaction_Level")

# Create horizontal bar chart
fig = pltx.bar(
    x=features.values,
    y=features.index,
    title="Feature Correlation with Satisfaction Level",
    color=features.values,
    orientation="h",
    color_continuous_scale="Plasma"
)

# Add annotations for exact correlation values
for i in range(len(features)):
    fig.add_annotation(
        x=features.values[i],
        y=features.index[i],
        text=f"{features.values[i]:.2f}",
        xshift=10 if features.values[i] > 0 else -10,
        showarrow=False,
    )

fig.update_layout(xaxis_title="Correlation", yaxis_title="Feature")

fig.show()

In [29]:

import plotly.graph_objects as go

for col in df.select_dtypes(include=['object']).columns:
    df[col] = pd.Categorical(df[col]).codes

# Compute correlation matrix
df_correlation = df.corr(numeric_only=True)

# Extract correlations with Age & Items Purchased
age_corr = df_correlation["Age"].sort_values(ascending=False).drop("Age")
items_corr = df_correlation["Items_Purchased"].sort_values(ascending=False).drop("Items_Purchased")

# Create grouped bar chart
fig = go.Figure()

fig.add_trace(go.Bar(
    x=age_corr.index,
    y=age_corr.values,
    name="Correlation with Age",
    marker_color="blue"
))

fig.add_trace(go.Bar(
    x=items_corr.index,
    y=items_corr.values,
    name="Correlation with Items Purchased",
    marker_color="orange"
))

# Update layout
fig.update_layout(
    title="Feature Correlation with Age & Items Purchased",
    xaxis_title="Feature",
    yaxis_title="Correlation",
    barmode="group"
)

fig.show()

In [30]:
features = [
    "Total_Spend",
    "Items_Purchased",
    "Average_Rating",
    "Days_Since_Last_Purchase",
]

# Compute descriptive statistics grouped by Gender
gender_stats = df.groupby("Gender")[features].describe().reset_index()

# Create subplots (2 rows, 2 columns)
# fig = make_subplots(
#     rows=2,
#     cols=2,
#     subplot_titles=features,
# )

# Add mean and standard deviation bars for each feature
for i, feature in enumerate(features):
    fig.add_trace(
        go.Bar(
            x=gender_stats["Gender"],
            y=gender_stats[(feature, "mean")],
            name="Mean",
        ),
        row=(i // 2) + 1,
        col=(i % 2) + 1,
    )

    fig.add_trace(
        go.Bar(
            x=gender_stats["Gender"],
            y=gender_stats[(feature, "std")],
            name="Std",
        ),
        row=(i // 2) + 1,
        col=(i % 2) + 1,
    )

# Update layout
fig.update_layout(
    title="Customer Statistics by Gender",
    showlegend=False,
)

fig.update_yaxes(title_text="Mean", row=1, col=1)
fig.update_yaxes(title_text="Std", row=2, col=1)

fig.show()

Exception: In order to reference traces by row and column, you must first use plotly.tools.make_subplots to create the figure with a subplot grid.

In [205]:
features = [
    "Total_Spend",
    "Items_Purchased",
    "Average_Rating",
    "Days_Since_Last_Purchase",
]

# Compute descriptive statistics grouped by Membership Type
membership_stats = df.groupby("Membership_Type")[features].describe().reset_index()

# # Create subplots (2 rows, 2 columns)
# fig = make_subplots(
#     rows=2,
#     cols=2,
#     subplot_titles=features,
# )

# Add mean and standard deviation bars for each feature
for i, feature in enumerate(features):
    fig.add_trace(
        go.Bar(
            x=membership_stats["Membership_Type"],
            y=membership_stats[(feature, "mean")],
            name="Mean",
        ),
        row=(i // 2) + 1,
        col=(i % 2) + 1,
    )

    fig.add_trace(
        go.Bar(
            x=membership_stats["Membership_Type"],
            y=membership_stats[(feature, "std")],
            name="Std",
        ),
        row=(i // 2) + 1,
        col=(i % 2) + 1,
    )

# Update layout
fig.update_layout(
    title="Customer Statistics by Membership Type",
    showlegend=False,
)

fig.update_yaxes(title_text="Mean", row=1, col=1)
fig.update_yaxes(title_text="Std", row=2, col=1)

fig.show()

In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler  

In [209]:
dk = df.drop(columns=['Satisfaction_Level'], errors='ignore')
categorical_features = ['Membership_Type']
numerical_features = ['Total_Spend', 'Items_Purchased', 'Average_Rating', 'Days_Since_Last_Purchase', 'Age']


In [210]:
for col in categorical_features:
    dk[col] = pd.Categorical(dk[col]).codes

# Standardize numerical features
scaler = StandardScaler()
dk[numerical_features] = scaler.fit_transform(dk[numerical_features])

# Display the transformed dataset
dk.head()

Unnamed: 0,Gender,Age,City,Membership_Type,Total_Spend,Items_Purchased,Average_Rating,Discount_Applied,Days_Since_Last_Purchase,AgeBin,AgeGroup
0,0,-0.945152,4,1,0.76013,0.337346,1.001981,True,-0.118359,25-29,26-35
1,1,0.082826,2,2,-0.179459,-0.385538,0.139479,False,-0.639907,30-34,26-35
2,0,1.933185,0,0,-0.92557,-0.867461,-1.068024,True,1.148256,40-44,36-45
3,1,-0.739557,5,1,1.756144,1.542153,1.174482,False,-1.086947,30-34,26-35
4,1,-1.356343,3,0,-0.345692,0.096385,-0.033022,True,2.116844,25-29,26-35


In [33]:
dk = df.drop(columns=['Satisfaction_Level'], errors='ignore')

# Identify categorical and numerical features
categorical_features = ['Membership_Type', 'City']
numerical_features = ['Total_Spend', 'Items_Purchased', 'Average_Rating', 'Days_Since_Last_Purchase']

# Fix Age column if it contains categorical values (like '20-24')
if df['Age'].dtype == 'object':  
    dk['Age'] = pd.Categorical(df['Age']).codes  # Convert to numeric codes

# Convert other categorical variables to numeric codes
for col in categorical_features:
    dk[col] = pd.Categorical(dk[col]).codes

# Standardize numerical features
scaler = StandardScaler()
dk[numerical_features] = scaler.fit_transform(dk[numerical_features])

# Initialize lists to store inertia and silhouette scores
inertia = []
silhouette_scores = []
k_range = range(2, 8)  # Testing different cluster numbers

# Train K-Means with different cluster sizes
for k in k_range:
    print(f"Training KMeans with {k} clusters")
    k_means = KMeans(n_clusters=k, random_state=42, n_init=10)
    k_means.fit(dk)
    inertia_ = k_means.inertia_
    silhouette_scores_ = silhouette_score(dk, k_means.labels_)
    inertia.append(inertia_)
    silhouette_scores.append(silhouette_scores_)
    print("Inertia:", inertia_)
    print("Silhouette Score:", silhouette_scores_)
    print("")

Training KMeans with 2 clusters


ValueError: Cannot cast object dtype to float64