 # Data Cleaning — E-Commerce Customer Behavior

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Notebook-wide setup.
# Silence every warning category for the rest of the session. This keeps the
# output tidy, but it also hides deprecation and runtime warnings.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn's white grid theme and a 12x6 inch default figure.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})



In [2]:
# Load the customer data set into `df`, the DataFrame the later cells inspect.
# The path is relative to the notebook's working directory.
# NOTE(review): the captured dtypes output in this notebook lists an
# "Unnamed: 0" column, which suggests the CSV was written with its index.
# Confirm, then consider dropping that column or reading with index_col=0.
df = pd.read_csv(filepath_or_buffer="../data/ecommerece-backup.csv")

In [3]:
# Set the visualization style: seaborn white grid and a 12x6 inch default
# figure size (the same values the first cell already applies), then print
# a short status message.
sns.set_style(style="whitegrid")
plt.rc("figure", figsize=(12, 6))

print("libraries loaded ")

libraries loaded 


# Data Inspection 

In [5]:
# Data inspection: a read-only quality report on `df`.
# Nothing in this cell modifies the DataFrame; each step only prints findings.

# 1. Missing values
# Count nulls once and reuse the result for both the count and the percentage.
null_counts = df.isnull().sum()
missing_summary = pd.DataFrame({
    "column": df.columns,
    "Missings": null_counts,
    "percentages": (null_counts / len(df) * 100).round(2),
})
# Keep only the columns that actually have gaps, largest gap first.
missing_summary = (
    missing_summary[missing_summary["Missings"] > 0]
    .sort_values("Missings", ascending=False)
)

if len(missing_summary) > 0:
    print(missing_summary.to_string(index=False))
    # Report how many columns are affected; interpolating the DataFrame itself
    # would just print the whole table a second time.
    print(f" Total column with missing values {len(missing_summary)}")
else:
    print("No missing values found")
print()

# 2. Duplicate rows
duplicates = df.duplicated().sum()
print(f"Duplicated values found in row {duplicates}")
if duplicates > 0:
    # The format spec has to sit inside the braces to be applied to the value.
    print(f"percentage :{duplicates / len(df) * 100:.2f}%")
    print("\n first few duplicates")
    # keep=False marks every member of a duplicate group, not only the repeats.
    print(df[df.duplicated(keep=False)].head())
else:
    print("No duplicated found ")
print()

# 3. Data types
print(df.dtypes)
print()

# 4. Basic stats for numerical columns (to spot issues)
# Columns whose values should never be negative. Names are matched
# case-insensitively: the original list used capitalised names ('Age', ...)
# while this data set's columns are lowercase, so the check never fired.
NON_NEGATIVE_COLS = {
    "age",
    "price",
    "quantity",
    "total_spend",
    "items_purchased",
    "average_rating",
    "days_since_last_purchase",
}

numerical_cols = df.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    # Compute each statistic once; they are reused by the checks below.
    col_min = df[col].min()
    col_max = df[col].max()
    col_median = df[col].median()

    print(f"\n{col}:")
    print(f"Min:{col_min}")
    print(f"Max:{col_max}")
    print(f"Mean:{df[col].mean():.2f}")
    print(f"Median:{col_median:.2f}")

    # Check for potential issues
    if col_min < 0 and str(col).lower() in NON_NEGATIVE_COLS:
        print("WARNING: Negative values found (shouldn't be negative)")
    # Heuristic: a maximum above 1000 alongside a median below 100 points to
    # a few extreme values far from the bulk of the data.
    if col_max > 1000 and col_median < 100:
        print("NOTICE: Very large outliers detected")

print()


No missing values found

Duplicated values found in row 0
No duplicated found 

Unnamed: 0                    int64
customer_id                   int64
gender                       object
age                           int64
city                         object
membership_type              object
total_spend                 float64
items_purchased               int64
average_rating              float64
discount_applied               bool
days_since_last_purchase      int64
satisfaction_level           object
dtype: object


Unnamed: 0:
Min:0
Max:349
Mean:174.89
Median:175.50

customer_id:
Min:101
Max:450
Mean:275.89
Median:276.50

age:
Min:26
Max:43
Mean:33.58
Median:32.00

total_spend:
Min:410.8
Max:1520.1
Mean:847.79
Median:780.20

items_purchased:
Min:7
Max:21
Mean:12.63
Median:12.00

average_rating:
Min:3.0
Max:4.9
Mean:4.02
Median:4.10

days_since_last_purchase:
Min:9
Max:63
Mean:26.61
Median:23.00



# Fill Missing Values with Mean/Median

In [None]:
# Imputation advice for numerical columns.
# For every numerical column that contains nulls, print its missing count,
# mean and median, then recommend mean or median as the fill value.
# This cell only reports; it does not fill anything in `df`.

# Relative gap between mean and median above which the column is treated as
# skewed (0.1 == more than 10% difference).
SKEW_THRESHOLD = 0.1

numerical_cols = df.select_dtypes(include=[np.number]).columns
missing_numerical = [col for col in numerical_cols if df[col].isnull().any()]

if missing_numerical:
    print(f"Found {len(missing_numerical)} numerical column(s) with missing values:")
    print()

    for col in missing_numerical:
        missing_count = df[col].isnull().sum()
        mean_val = df[col].mean()
        median_val = df[col].median()

        print(f"{col}:")
        print(f"Missing:{missing_count} ({(missing_count/len(df)*100):.1f}%)")
        print(f"Mean:{mean_val:.2f}")
        print(f" Median:{median_val:.2f}")

        # Compare against the magnitude of the median. Dividing by a signed
        # median makes the ratio negative for negative medians (so it could
        # never exceed the threshold), and a zero median cannot be divided by.
        scale = abs(median_val)
        if scale == 0:
            is_skewed = mean_val != 0
        else:
            is_skewed = abs(mean_val - median_val) / scale > SKEW_THRESHOLD

        # Recommendation
        if is_skewed:
            print("Recommendation: Use MEDIAN (data is skewed)")
            recommended = "median"
        else:
            print("Recommendation: Use MEAN (data is normal)")
            recommended = "mean"
        # Blank line after every column, whichever branch was taken.
        print()
else:
    print("No missing values in numerical columns!")