<a href="https://colab.research.google.com/github/theduskcharm/CODSOFT/blob/main/Customer_Segmentation_for_E_commerce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Set plot style
sns.set(style='whitegrid')

# Load the transaction data
df = pd.read_csv('E-commerce Customer Behavior - Sheet1 (1).csv')

# Preview the data
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Summary statistics
print(df.describe())

# Select relevant features for segmentation (e.g., total amount spent, frequency of purchases)
features = ['Total_Spent', 'Frequency', 'Avg_Transaction_Value']

# Handle missing values if any (optional)
df.fillna(df.mean(), inplace=True)

# Standardize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[features])

# Convert the scaled features back to a DataFrame
scaled_df = pd.DataFrame(scaled_features, columns=features)

# Determine the optimal number of clusters using the elbow method
wcss = []  # Within-cluster sum of squares

for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(scaled_df)
    wcss.append(kmeans.inertia_)

# Plot the elbow graph
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Based on the elbow method, choose the optimal number of clusters
optimal_clusters = 4

# Apply K-means clustering
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=300, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(scaled_df)

# Check the distribution of customers across clusters
print(df['Cluster'].value_counts())

# Use PCA for dimensionality reduction to visualize the clusters
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_df)

# Create a DataFrame with the PCA components
pca_df = pd.DataFrame(data=pca_components, columns=['PCA1', 'PCA2'])
pca_df['Cluster'] = df['Cluster']

# Plot the clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=pca_df, palette='viridis', s=100)
plt.title('Customer Segments (K-means Clustering)')
plt.show()

# Analyze the characteristics of each cluster
cluster_summary = df.groupby('Cluster')[features].mean()
print(cluster_summary)

# Save the clustered data to a new CSV file
df.to_csv('segmented_customers.csv', index=False)



   Customer ID  Gender  Age           City Membership Type  Total Spend  \
0          101  Female   29       New York            Gold      1120.20   
1          102    Male   34    Los Angeles          Silver       780.50   
2          103  Female   43        Chicago          Bronze       510.75   
3          104    Male   30  San Francisco            Gold      1480.30   
4          105    Male   27          Miami          Silver       720.40   

   Items Purchased  Average Rating  Discount Applied  \
0               14             4.6              True   
1               11             4.1             False   
2                9             3.4              True   
3               19             4.7             False   
4               13             4.0              True   

   Days Since Last Purchase Satisfaction Level  
0                        25          Satisfied  
1                        18            Neutral  
2                        42        Unsatisfied  
3               

TypeError: can only concatenate str (not "int") to str

In [None]:
from google.colab import drive
drive.mount('/content/drive')