In [120]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import PowerTransformer

Preparation of Data

In [121]:
np.random.seed(42) #Same random numbers every run â†’ useful when demonstrating code

In [122]:
normal_data=np.random.normal(50, 10, 1000) #Generates 1000 data points. Follows a normal distribution. Mean = 50, Standard deviation = 10. Basically a bell-curve of values centered at 50.

In [123]:
normal_data

array([54.96714153, 48.61735699, 56.47688538, 65.23029856, 47.65846625,
       47.65863043, 65.79212816, 57.67434729, 45.30525614, 55.42560044,
       45.36582307, 45.34270246, 52.41962272, 30.86719755, 32.75082167,
       44.37712471, 39.8716888 , 53.14247333, 40.91975924, 35.87696299,
       64.65648769, 47.742237  , 50.67528205, 35.75251814, 44.55617275,
       51.1092259 , 38.49006423, 53.75698018, 43.9936131 , 47.0830625 ,
       43.98293388, 68.52278185, 49.86502775, 39.42289071, 58.22544912,
       37.7915635 , 52.08863595, 30.40329876, 36.71813951, 51.96861236,
       57.3846658 , 51.71368281, 48.84351718, 46.98896304, 35.2147801 ,
       42.80155792, 45.39361229, 60.57122226, 53.4361829 , 32.36959845,
       53.24083969, 46.1491772 , 43.23078   , 56.11676289, 60.30999522,
       59.31280119, 41.60782477, 46.90787624, 53.31263431, 59.75545127,
       45.20825762, 48.14341023, 38.93665026, 38.03793376, 58.12525822,
       63.56240029, 49.27989878, 60.03532898, 53.61636025, 43.54

In [124]:
outliers=np.array([150,200, -20])

In [125]:
data=np.concatenate([normal_data, outliers])

In [126]:
df=pd.DataFrame({'value': data})

In [127]:
df.head()

Unnamed: 0,value
0,54.967142
1,48.617357
2,56.476885
3,65.230299
4,47.658466


In [128]:
df.describe()

Unnamed: 0,value
count,1003.0
mean,50.372204
std,11.524898
min,-20.0
25%,43.513923
50%,50.260911
75%,56.494555
max,200.0


Detection of Outliers

In [129]:
#####Z-Score Method#######
df['z-score']=stats.zscore(df['value'])

In [130]:
outliers_by_zscore=df[df['z-score'].abs()>3]

In [131]:
outliers_by_zscore

Unnamed: 0,value,z-score
209,88.527315,3.31232
1000,150.0,8.648883
1001,200.0,12.989481
1002,-20.0,-6.109148


IQR Method

In [132]:
Q1=df['value'].quantile(0.25)

In [133]:
Q3=df['value'].quantile(0.75)

In [134]:
IQR=Q3-Q1

In [135]:
Boxplot_lower_whisker=Q1-1.5*IQR
Boxplot_upper_whisker=Q3+1.5*IQR

In [136]:
outliers_IQR=df[(df['value']<Boxplot_lower_whisker) | (df['value']>Boxplot_upper_whisker)]

In [157]:
outliers_IQR

Unnamed: 0,value,z-score
74,23.802549,-2.306564
179,77.201692,2.32912
209,88.527315,3.31232
262,17.587327,-2.846119
478,80.788808,2.640525
646,23.031134,-2.373532
668,23.490302,-2.33367
755,76.323821,2.25291
1000,150.0,8.648883
1001,200.0,12.989481


Compare the Two Methods
Z-score Outliers

Only these rows:

209 (â‰ˆ88.5)

1000 (150)

1001 (200)

1002 (-20)

These are extreme â€” many standard deviations away.
Z-score is clearly catching only the most insane values.

Because mean and std dev are heavily influenced by outliers:

The boundary threshold shifts outward

Mild outliers get hidden

IQR Outliers

Much bigger list:
Values around 17â€“24 and 76â€“88 also appear as outliers.

Why?

Because:

ðŸ”¥ IQR method detects values outside the central 50%
and it does NOT care about:

mean

standard deviation

So it catches more subtle data points that are unusual but not extreme.

ðŸ“Œ Interpretation
Method	What it catches	What it misses
Z-score	Extreme points only	Moderate but weird data
IQR	Both extreme + mild odd values	Sometimes too strict on normal tails

This difference tells us a lot:

ðŸ§  What does this tell us about your dataset?

âœ” The distribution is not perfectly normal
âœ” It has a longer right/left tail (skew)
âœ” Outliers affect mean and std strongly
âœ” Z-score is not reliable here (because skew breaks its assumptions)

Result?
â†’ IQR is currently a better tool than z-score

Removing  Outliers

In [138]:
df_removed=df[(df['value']<=Boxplot_upper_whisker) & (df['value']>=Boxplot_lower_whisker)]

In [139]:
df_removed

Unnamed: 0,value,z-score
0,54.967142,0.398895
1,48.617357,-0.152342
2,56.476885,0.529959
3,65.230299,1.289860
4,47.658466,-0.235585
...,...,...
995,47.188997,-0.276340
996,67.976865,1.528295
997,56.408429,0.524016
998,44.288210,-0.528163


In [140]:
df_removed.shape

(992, 2)

Winsorizing Outliers

In [141]:
lower_clip=Boxplot_lower_whisker
upper_clip=Boxplot_upper_whisker
#or we can do
#lower_clip=df['value'].quantile(0.01)
#upper_clip=df['value'].quantile(0.99)

In [142]:
df_winsor=df.copy()
df_winsor['value']=df['value'].clip(lower_clip, upper_clip)

In [143]:
df_winsor

Unnamed: 0,value,z-score
0,54.967142,0.398895
1,48.617357,-0.152342
2,56.476885,0.529959
3,65.230299,1.289860
4,47.658466,-0.235585
...,...,...
998,44.288210,-0.528163
999,55.725828,0.464759
1000,75.965504,8.648883
1001,75.965504,12.989481


In [144]:
df_winsor['value'].describe()

Unnamed: 0,value
count,1003.0
mean,50.207952
std,9.796656
min,24.042974
25%,43.513923
50%,50.260911
75%,56.494555
max,75.965504


Transformation

Winsorizing fixed the extremes
But the distribution likely still has skew.

The next professional step is:

ðŸ‘‰ Apply transformation
(Likely Yeo-Johnson using PowerTransformer)

This will:

Normalize skew

Stabilize variance

Make relationships linear â†’ essential for regression and NN

LOG TRANSFORM

In [145]:
df['log_value']=np.log(df['value']-df['value'].min()+1)

SQUAREROOT TRANSFORM

In [146]:
df['sqrt_value']=np.sqrt(df['value']-df['value'].min()+1)

BOX-COX TRANSFORM (works only when values>0)

In [148]:
positive_data=df['value']-df['value'].min()+1
df['boxcox'], lam=stats.boxcox(positive_data)

In [150]:
df.head()

Unnamed: 0,value,z-score,log_value,sqrt_value,boxcox
0,54.967142,0.398895,4.330301,8.715913,24.716911
1,48.617357,-0.152342,4.243014,8.343702,23.252983
2,56.476885,0.529959,4.34998,8.802096,25.058724
3,65.230299,1.28986,4.457022,9.286027,26.997518
4,47.658466,-0.235585,4.229144,8.28604,23.028013


YEO-JONHSON TRANSFORM

In [154]:
pt=PowerTransformer(method='yeo-johnson')
df['yeo_johnson_value'] = pt.fit_transform(df[['value']])  #used double[[]] because we have to pass a dataframe , not a pandas series to the fit_transform function

In [155]:
df.head()

Unnamed: 0,value,z-score,log_value,sqrt_value,boxcox,yeo_johnson_value
0,54.967142,0.398895,4.330301,8.715913,24.716911,0.408693
1,48.617357,-0.152342,4.243014,8.343702,23.252983,-0.143
2,56.476885,0.529959,4.34998,8.802096,25.058724,0.539049
3,65.230299,1.28986,4.457022,9.286027,26.997518,1.289281
4,47.658466,-0.235585,4.229144,8.28604,23.028013,-0.226825
