In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# --- Load, clean and enrich the customer table ---------------------------------
# The CSV lives on Google Drive. Mount the Drive on demand so this cell works
# from a fresh runtime (Restart -> Run all) instead of relying on a separate
# mount cell having been executed by hand beforehand.
DATA_PATH = '/content/drive/MyDrive/Infobyte/Task3: Customer Segmentation Analysis/ifood_df.csv'

if not os.path.exists(DATA_PATH):
    try:
        from google.colab import drive
        drive.mount('/content/drive')
    except ImportError:
        # Not running on Colab: nothing to mount. pd.read_csv below will
        # report the missing file with the full path.
        pass

df = pd.read_csv(DATA_PATH)

# Quick look at the raw data and at how many values are missing per column.
print(df.head())
print(df.isnull().sum())

# Drop columns that contain no data at all, then fill the remaining gaps with
# the column mean. numeric_only=True restricts the means to numeric columns;
# without it pandas >= 2.0 raises TypeError if any column is non-numeric.
df = df.dropna(axis=1, how='all')
df = df.fillna(df.mean(numeric_only=True))

spend_columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
purchase_count_columns = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']

# NOTE(review): this is the row-wise mean over the six Mnt* columns (i.e. mean
# spend per product category), not spend divided by number of purchases.
# Confirm this is the intended meaning of "average purchase value".
average_purchase_value = df[spend_columns].mean(axis=1)
# Row-wise total of the four Num*Purchases columns.
frequency_of_purchases = df[purchase_count_columns].sum(axis=1)

df['AveragePurchaseValue'] = average_purchase_value
df['FrequencyOfPurchases'] = frequency_of_purchases

# Sum of the two marital_* columns (presumably 0/1 indicators, so the result
# flags customers who are Married or Together -- verify against the data).
df['In_relationship'] = df['marital_Married'] + df['marital_Together']

print(df.describe())

# Univariate views of the key numeric columns.
# Box plots first (spread and outliers) ...
for column in ('MntTotal', 'Income'):
    plt.figure(figsize=(10, 5))
    sns.boxplot(y=df[column])
    plt.title(f'Box plot of {column}')
    plt.show()

# ... then 20-bin histograms with a KDE overlay (distribution shape).
for column in ('Income', 'Age'):
    plt.figure(figsize=(10, 5))
    sns.histplot(df[column], bins=20, kde=True)
    plt.title(f'Histogram of {column}')
    plt.show()

# Pairwise correlations between columns, drawn as an annotated heat map.
# Select the numeric/boolean columns explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# instead, so a single text column in the CSV would break this cell.
plt.figure(figsize=(15, 10))
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

# Mean MntTotal per marital status.
# Marital status is spread over five marital_* columns. Grouping on all five
# at once labels every bar with a tuple such as (0, 1, 0, 0, 0), which is
# unreadable, so collapse the columns into a single text label first.
marital_columns = ['marital_Divorced', 'marital_Married', 'marital_Single', 'marital_Together', 'marital_Widow']
marital_flags = df[marital_columns]
marital_status = (
    marital_flags.idxmax(axis=1)
    .str.replace('marital_', '', regex=False)
    # A row whose flags do not sum to exactly 1 cannot be decoded
    # unambiguously; keep it visible in its own bucket instead of
    # silently assigning it to the first column.
    .where(marital_flags.sum(axis=1) == 1, 'Other')
)

plt.figure(figsize=(10, 5))
df.groupby(marital_status)['MntTotal'].mean().plot(kind='bar')
plt.xlabel('Marital status')
plt.ylabel('Mean MntTotal')
plt.title('MntTotal by Marital Status')
plt.show()

# Columns handed to the clustering model, grouped by theme.
feature_columns = [
    # income and household
    'Income', 'Kidhome', 'Teenhome',
    # spend and purchase activity
    'Recency', 'MntTotal',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
    'NumWebVisitsMonth',
    # age and Customer_Days
    'Age', 'Customer_Days',
    # marital_* columns
    'marital_Divorced', 'marital_Married', 'marital_Single', 'marital_Together', 'marital_Widow',
    # education_* columns
    'education_2n Cycle', 'education_Basic', 'education_Graduation', 'education_Master', 'education_PhD',
]
features = df[feature_columns]

# Standardise each column (StandardScaler: zero mean, unit variance) before
# the distance-based clustering that follows.
scaler = StandardScaler()
features_scaled = scaler.fit(features).transform(features)

# --- Choose the number of clusters and fit the final model -----------------
RANDOM_STATE = 42
# Pin n_init instead of relying on the library default: scikit-learn changed
# the default from 10 to 'auto' (a single run with the default k-means++
# init) in 1.4 and emits a FutureWarning on 1.2/1.3, so an unpinned value
# yields different clusterings depending on the installed version.
N_INIT = 10

# Fit KMeans for k = 2..10, recording inertia (elbow method) and the
# silhouette score for each k.
inertia = []
silhouette_scores = []
K = range(2, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, n_init=N_INIT, random_state=RANDOM_STATE)
    kmeans.fit(features_scaled)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(features_scaled, kmeans.labels_))

plt.figure(figsize=(10, 5))
plt.plot(K, inertia, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Inertia vs Number of Clusters')
plt.show()

plt.figure(figsize=(10, 5))
plt.plot(K, silhouette_scores, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs Number of Clusters')
plt.show()

# Print the k with the highest silhouette score so the hand-picked value
# below can be compared against what the sweep actually found.
best_silhouette_k = K[int(np.argmax(silhouette_scores))]
print(f'Highest silhouette score at k={best_silhouette_k}')

# NOTE(review): chosen by hand, not derived from the sweep above -- revisit
# if the printed silhouette-best k or the elbow plot disagree.
optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, n_init=N_INIT, random_state=RANDOM_STATE)
df['Cluster'] = kmeans.fit_predict(features_scaled)

# Reduce the scaled features to two principal components so the clusters can
# be drawn on a plane; the components are stored on df as PC1 / PC2.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(features_scaled)
# Transposing gives one row per component, which unpacks into the two columns.
df['PC1'], df['PC2'] = principal_components.T

plt.figure(figsize=(10, 7))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='viridis')
plt.title('Customer Segments Visualization')
plt.show()

# --- Per-cluster profile plots ---------------------------------------------
profile_figsize = (10, 5)

product_columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']

# One bar chart per Mnt* column: mean value within each cluster.
for product in product_columns:
    plt.figure(figsize=profile_figsize)
    sns.barplot(data=df, x='Cluster', y=product, estimator=np.mean)
    plt.title(f'Average {product} Consumption by Cluster')
    plt.show()

# Remaining profiles, all with Cluster on the x axis:
# (seaborn plot function, extra keyword arguments, figure title)
cluster_plots = [
    (sns.countplot, {}, 'Cluster Sizes'),
    (sns.boxplot, {'y': 'Income'}, 'Income by Cluster (Box Plot)'),
    (sns.scatterplot, {'y': 'Income'}, 'Income by Cluster (Scatter Plot)'),
    (sns.barplot, {'y': 'In_relationship', 'estimator': np.mean}, 'In_relationship by Cluster'),
]
for draw, extra_kwargs, title in cluster_plots:
    plt.figure(figsize=profile_figsize)
    draw(x='Cluster', data=df, **extra_kwargs)
    plt.title(title)
    plt.show()

# Print describe() statistics separately for each cluster label 0..optimal_k-1.
for cluster in range(optimal_k):
    cluster_rows = df.loc[df['Cluster'].eq(cluster)]
    print(f"Cluster {cluster}:")
    print(cluster_rows.describe())
    print("\n")



Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Mount Google Drive into the Colab runtime. The analysis cell reads its CSV
# from a path under /content/drive, so in a fresh runtime the Drive has to be
# mounted before that file is read.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)