<h2>Create a Small Dataset (with Clear Outliers)</h2>

In [19]:
import pandas as pd
import numpy as np

# Build a small synthetic dataset: 20 customers with one monthly-spend
# figure each. The last two figures are far larger than the rest so that
# the outlier-handling steps below have something to act on.
data = dict(
    Customer_ID=range(1, 21),
    Monthly_Spend=[
        # typical customers
        2000, 2200, 2500, 2700, 3000, 3200,
        3500, 3800, 4000, 4200, 4500, 4800,
        5000, 5200, 5500, 6000, 6500, 7000,
        # deliberate outliers
        30000, 50000,
    ],
)

df = pd.DataFrame(data)
df


Unnamed: 0,Customer_ID,Monthly_Spend
0,1,2000
1,2,2200
2,3,2500
3,4,2700
4,5,3000
5,6,3200
6,7,3500
7,8,3800
8,9,4000
9,10,4200


In [20]:
# Detect outliers using the IQR rule: a value is flagged when it lies more
# than 1.5 * IQR below the first quartile or above the third quartile.
Q1, Q3 = df["Monthly_Spend"].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

lower_bound, upper_bound


(np.float64(-562.5), np.float64(9337.5))

In [21]:
# Select the rows whose spend is strictly below the lower bound or strictly
# above the upper bound; values exactly on a bound are not flagged.
outliers = df.loc[
    df["Monthly_Spend"].lt(lower_bound) | df["Monthly_Spend"].gt(upper_bound)
]

outliers


Unnamed: 0,Customer_ID,Monthly_Spend
18,19,30000
19,20,50000


<h2>METHOD 1: Trimming (Removing Outliers)</h2>

In [22]:
# Trimming: keep only the rows whose spend lies inside the bounds.
# `between` is inclusive on both ends, so values exactly on a bound stay.
# The original row labels are preserved (the index is not reset).
df_trimmed = df[df["Monthly_Spend"].between(lower_bound, upper_bound)]

df_trimmed


Unnamed: 0,Customer_ID,Monthly_Spend
0,1,2000
1,2,2200
2,3,2500
3,4,2700
4,5,3000
5,6,3200
6,7,3500
7,8,3800
8,9,4000
9,10,4200


<h2>METHOD 2: Capping (Winsorization)</h2>

In [23]:
# Capping: every row is kept, but spend values outside the bounds are pulled
# back to the nearest bound. `Series.clip` expresses this directly and
# replaces the nested `np.where` calls; values inside the bounds (and any
# missing values) pass through unchanged.
df_capped = df.assign(
    # Cast to float first so the column dtype is float64 whether or not any
    # value is actually clipped, which matches what `np.where` with float
    # bounds produced.
    Monthly_Spend=df["Monthly_Spend"]
    .astype(float)
    .clip(lower=lower_bound, upper=upper_bound)
)

df_capped


Unnamed: 0,Customer_ID,Monthly_Spend
0,1,2000.0
1,2,2200.0
2,3,2500.0
3,4,2700.0
4,5,3000.0
5,6,3200.0
6,7,3500.0
7,8,3800.0
8,9,4000.0
9,10,4200.0


<h2>METHOD 3: Transformation (Log Transformation)</h2>

In [24]:
# Log transformation: keep every row and the original spend column, and add
# the natural logarithm of the spend as a new column. `assign` returns a new
# frame, so `df` itself is left untouched.
df_transformed = df.assign(
    Log_Monthly_Spend=lambda frame: np.log(frame["Monthly_Spend"])
)

df_transformed


Unnamed: 0,Customer_ID,Monthly_Spend,Log_Monthly_Spend
0,1,2000,7.600902
1,2,2200,7.696213
2,3,2500,7.824046
3,4,2700,7.901007
4,5,3000,8.006368
5,6,3200,8.070906
6,7,3500,8.160518
7,8,3800,8.242756
8,9,4000,8.29405
9,10,4200,8.34284


<h2>Compare Before & After (Mean Comparison)</h2>

In [25]:
# One-row summary table: the mean of the spend column under each treatment.
# NOTE: "Log Mean" is the mean of the Log_Monthly_Spend column, so it is in
# log units and is not directly comparable with the other three figures.
comparison = pd.DataFrame(
    [
        {
            "Original Mean": df["Monthly_Spend"].mean(),
            "Trimmed Mean": df_trimmed["Monthly_Spend"].mean(),
            "Capped Mean": df_capped["Monthly_Spend"].mean(),
            "Log Mean": df_transformed["Log_Monthly_Spend"].mean(),
        }
    ]
)

comparison


Unnamed: 0,Original Mean,Trimmed Mean,Capped Mean,Log Mean
0,7780.0,4200.0,4713.75,8.508769
