### Importing libraries

In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest

### 1) Continuous Data
Detection and handling of outliers in numerical columns.

In [2]:
# Example continuous data
df_cont = pd.DataFrame({'Age': [23, 25, 26, 24, 120, 28, 22]})

# IQR method for detecting outliers
Q1, Q3 = df_cont['Age'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR

# Flag outliers
df_cont['is_outlier'] = (df_cont['Age'] < lower) | (df_cont['Age'] > upper)
print(df_cont)

# Example handling: capping (Winsorization)
df_cont['Age_capped'] = df_cont['Age'].clip(lower, upper)
print("\nAfter capping outliers:\n", df_cont)


   Age  is_outlier
0   23       False
1   25       False
2   26       False
3   24       False
4  120        True
5   28       False
6   22       False

After capping outliers:
    Age  is_outlier  Age_capped
0   23       False       23.00
1   25       False       25.00
2   26       False       26.00
3   24       False       24.00
4  120        True       32.25
5   28       False       28.00
6   22       False       22.00


In [3]:
# Capping / Winsorization
# Replace extreme values with the nearest threshold (usually 1.5IQR or 3std deviation limits).

df = pd.DataFrame({'Age': [23, 25, 26, 24, 120, 28, 22]})
Q1, Q3 = df['Age'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR

# Cap the values
df['Age_capped'] = df['Age'].clip(lower, upper)
print(df)


   Age  Age_capped
0   23       23.00
1   25       25.00
2   26       26.00
3   24       24.00
4  120       32.25
5   28       28.00
6   22       22.00


Model based outlier replacement

In [4]:
from sklearn.impute import KNNImputer
df_knn = pd.DataFrame({'Age': [23, 25, 26, 24, 120, 28, 22], 'Salary': [4000, 4200, 4100, 4300, 15000, 4400, 4000]})

In [5]:
imputer = KNNImputer(n_neighbors=2)
df_knn_imputed = pd.DataFrame(imputer.fit_transform(df_knn), columns=df_knn.columns)
print(df_knn_imputed)

     Age   Salary
0   23.0   4000.0
1   25.0   4200.0
2   26.0   4100.0
3   24.0   4300.0
4  120.0  15000.0
5   28.0   4400.0
6   22.0   4000.0


### 2) Categorical Data
Detection of rare categories and handling.

In [6]:

# Example categorical data
df_cat = pd.DataFrame({'City': ['Colombo', 'Kandy', 'Galle', 'Colombo', 'Paris']})

# Detect rare categories (less than 10% frequency)
freq = df_cat['City'].value_counts(normalize=True)
rare_labels = freq[freq < 0.1].index

# Replace rare categories with 'Other'
df_cat['City_cleaned'] = df_cat['City'].replace(rare_labels, 'Other')
print(df_cat)


      City City_cleaned
0  Colombo      Colombo
1    Kandy        Kandy
2    Galle        Galle
3  Colombo      Colombo
4    Paris        Paris


### 3) Mixed-Type Data
Using Isolation Forest to detect outliers in datasets with numeric variables.

In [7]:

from sklearn.ensemble import IsolationForest

# Example mixed-type dataset (numeric columns only for outlier detection)
df_mixed = pd.DataFrame({
    'Age': [25, 27, 26, 29, 90],
    'Income': [50000, 52000, 51000, 48000, 200000]
})

# Isolation Forest to detect outliers
iso = IsolationForest(contamination=0.2, random_state=42)
df_mixed['outlier_flag'] = iso.fit_predict(df_mixed)

# In IsolationForest: -1 indicates outlier, 1 indicates inlier
print(df_mixed)

# Handling: replace outliers with NaN for later imputation
df_mixed_handled = df_mixed.copy()
df_mixed_handled.loc[df_mixed_handled['outlier_flag'] == -1, ['Age', 'Income']] = pd.NA
print("\nAfter replacing outliers with NaN:\n", df_mixed_handled)


   Age  Income  outlier_flag
0   25   50000             1
1   27   52000             1
2   26   51000             1
3   29   48000             1
4   90  200000            -1

After replacing outliers with NaN:
     Age   Income  outlier_flag
0  25.0  50000.0             1
1  27.0  52000.0             1
2  26.0  51000.0             1
3  29.0  48000.0             1
4   NaN      NaN            -1
