# Outlier Detection
This notebook conducts a comprehensive outlier detection and treatment within the House Price Prediction Challenge dataset, addressing one of the central research questions of this project: how different outlier handling strategies impact regression model performance. The primary objectives are:
- Statistical Outlier Detection
- Outlier Pattern Characterisation
- Comaparative Dataset Generation
- Treatment Impact Quantification
- Treatment Rationale Establishment

In [None]:
# Setup imports
import config
import pandas as pd
from IPython.display import display

In [2]:
# Load data from csv into dataframe
data_df = pd.read_csv(config.RAW_DATA_FILE)

In [None]:
def detect_outliers_iqr(data: pd.DataFrame, column_name: str) -> tuple[dict, pd.DataFrame]:
    """
    Detect outliers using the IQR method and return structured results.
    
    Parameters:
    data (DataFrame): Dataset containing the column
    column_name (str): Name of column to analyse
    
    Returns:
    tuple: (summary_dict, outliers_dataframe)
    """
    Q1 = data[column_name].quantile(0.25)
    Q3 = data[column_name].quantile(0.75)
    IQR = Q3 - Q1
    
    # Define outlier bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Identify outliers
    outliers = data[(data[column_name] < lower_bound) | (data[column_name] > upper_bound)]
    
    # Create summary dictionary with clean names
    summary = {
        'Feature': column_name,
        'Total Records': len(data),
        'Q1 (25th Percentile)': round(Q1, 2),
        'Q3 (75th Percentile)': round(Q3, 2),
        'IQR': round(IQR, 2),
        'Lower Threshold': round(lower_bound, 2),
        'Upper Threshold': round(upper_bound, 2),
        'Outliers Detected': len(outliers),
        'Outlier Percentage': round((len(outliers) / len(data)) * 100, 2),
        'Records Remaining': len(data) - len(outliers)
    }
    
    return summary, outliers

In [4]:
# Detect price outliers
price_summary, price_outliers = detect_outliers_iqr(data_df, config.TARGET_VARIABLE)
price_summary_df = pd.DataFrame([price_summary])

print("Outlier Detection Summary")
display(price_summary_df)

Outlier Detection Summary


Unnamed: 0,Feature,Total Records,Q1 (25th Percentile),Q3 (75th Percentile),IQR,Lower Threshold,Upper Threshold,Outliers Detected,Outlier Percentage,Records Remaining
0,TARGET(PRICE_IN_LACS),29451,38.0,100.0,62.0,-55.0,193.0,3084,10.47,26367


In [None]:
# Save outlier results to file
price_summary_df.to_csv(config.RESULTS_DIR / 'price_outlier_summary.csv', index=False)

In [None]:
# Separate normal properties from outliers
normal_properties = data_df[~data_df.index.isin(price_outliers.index)]

# Create comparison statistics
def compare_outliers_vs_normal(normal_df: pd.DataFrame, outlier_df: pd.DataFrame, features: list) -> pd.DataFrame:
    """Compare characteristics between normal and outlier properties"""
    
    comparison_data = []
    
    for feature in features:
        if feature in normal_df.columns and feature in outlier_df.columns:
            normal_stats = {
                'Feature': feature,
                'Group': 'Normal Properties',
                'Count': len(normal_df),
                'Mean': round(normal_df[feature].mean(), 2),
                'Median': round(normal_df[feature].median(), 2),
                'Std Dev': round(normal_df[feature].std(), 2),
                'Min': round(normal_df[feature].min(), 2),
                'Max': round(normal_df[feature].max(), 2)
            }
            
            outlier_stats = {
                'Feature': feature,
                'Group': 'Outlier Properties',
                'Count': len(outlier_df),
                'Mean': round(outlier_df[feature].mean(), 2),
                'Median': round(outlier_df[feature].median(), 2),
                'Std Dev': round(outlier_df[feature].std(), 2),
                'Min': round(outlier_df[feature].min(), 2),
                'Max': round(outlier_df[feature].max(), 2)
            }
            
            comparison_data.extend([normal_stats, outlier_stats])
    
    return pd.DataFrame(comparison_data)

# Features to compare
features_to_analyze = config.NUMERICAL_FEATURES + [config.TARGET_VARIABLE]

comparison_df = compare_outliers_vs_normal(normal_properties, price_outliers, features_to_analyze)

print("Outlier vs Normal Property Characteristics")
display(comparison_df)

Outlier vs Normal Property Characteristics


Unnamed: 0,Feature,Group,Count,Mean,Median,Std Dev,Min,Max
0,SQUARE_FT,Normal Properties,26367,1489.15,1132.13,27718.38,3.0,4300000.0
1,SQUARE_FT,Outlier Properties,3084,176371.33,2017.56,5873552.49,398.01,254545500.0
2,LONGITUDE,Normal Properties,26367,21.45,21.17,6.3,-37.71,59.91
3,LONGITUDE,Outlier Properties,3084,20.01,19.1,5.14,9.59,52.56
4,LATITUDE,Normal Properties,26367,77.07,77.35,10.46,-121.76,152.96
5,LATITUDE,Outlier Properties,3084,74.88,73.78,11.19,-87.68,136.0
6,TARGET(PRICE_IN_LACS),Normal Properties,26367,66.15,55.0,40.53,0.25,190.0
7,TARGET(PRICE_IN_LACS),Outlier Properties,3084,799.04,300.0,1904.39,200.0,30000.0


In [7]:
# Analyze categorical differences
print("Categorical Feature Distribution: Outliers vs Normal")

for feature in config.CATEGORICAL_FEATURES:
    if feature in data_df.columns:
        print(f"\n{feature}:")
        
        # Normal properties distribution
        normal_dist = normal_properties[feature].value_counts(normalize=True) * 100
        outlier_dist = price_outliers[feature].value_counts(normalize=True) * 100
        
        # Combine for comparison
        comparison = pd.DataFrame({
            'Normal Properties (%)': normal_dist,
            'Outlier Properties (%)': outlier_dist
        }).fillna(0).round(2)
        
        display(comparison)

Categorical Feature Distribution: Outliers vs Normal

POSTED_BY:


Unnamed: 0_level_0,Normal Properties (%),Outlier Properties (%)
POSTED_BY,Unnamed: 1_level_1,Unnamed: 2_level_1
Dealer,58.46,93.32
Owner,39.27,5.93
Builder,2.27,0.75



UNDER_CONSTRUCTION:


Unnamed: 0_level_0,Normal Properties (%),Outlier Properties (%)
UNDER_CONSTRUCTION,Unnamed: 1_level_1,Unnamed: 2_level_1
0,82.12,81.23
1,17.88,18.77



BHK_OR_RK:


Unnamed: 0_level_0,Normal Properties (%),Outlier Properties (%)
BHK_OR_RK,Unnamed: 1_level_1,Unnamed: 2_level_1
BHK,99.91,100.0
RK,0.09,0.0



RERA:


Unnamed: 0_level_0,Normal Properties (%),Outlier Properties (%)
RERA,Unnamed: 1_level_1,Unnamed: 2_level_1
0,69.33,58.63
1,30.67,41.37



READY_TO_MOVE:


Unnamed: 0_level_0,Normal Properties (%),Outlier Properties (%)
READY_TO_MOVE,Unnamed: 1_level_1,Unnamed: 2_level_1
1,82.12,81.23
0,17.88,18.77



RESALE:


Unnamed: 0_level_0,Normal Properties (%),Outlier Properties (%)
RESALE,Unnamed: 1_level_1,Unnamed: 2_level_1
1,93.39,89.27
0,6.61,10.73



ADDRESS:


Unnamed: 0_level_0,Normal Properties (%),Outlier Properties (%)
ADDRESS,Unnamed: 1_level_1,Unnamed: 2_level_1
", panvel,Mumbai",0.00,0.0
",Manoramaganj,Indore",0.00,0.0
"100 Feet Road,Anand",0.00,0.0
"101,yamuna Complex, Gokul Vihar Society,nh-8, Vapi,Vapi",0.00,0.0
"150 Feet Ring Road,Rajkot",0.01,0.0
...,...,...
"virar,Palghar",0.00,0.0
"vishakoderu,Bhimavaram",0.00,0.0
"walkeshwari nagari,Jamnagar",0.00,0.0
"west mambalam,Chennai",0.00,0.0


In [None]:
# Dataset 1: Keep all original data
dataset_original = data_df.copy()
dataset_original.to_csv(config.PROCESSED_DATA_DIR / 'dataset_original.csv', index=False)

# Dataset 2: Remove outliers completely
dataset_removed = data_df[~data_df.index.isin(price_outliers.index)].copy()
dataset_removed.to_csv(config.PROCESSED_DATA_DIR / 'dataset_removed.csv', index=False)

# Dataset 3: Cap outliers at 5th and 95th percentiles
dataset_capped = data_df.copy()

lower_cap = data_df['TARGET(PRICE_IN_LACS)'].quantile(0.05)
upper_cap = data_df['TARGET(PRICE_IN_LACS)'].quantile(0.95)

dataset_capped['TARGET(PRICE_IN_LACS)'] = dataset_capped['TARGET(PRICE_IN_LACS)'].clip(lower=lower_cap, upper=upper_cap)
dataset_capped.to_csv(config.PROCESSED_DATA_DIR / 'dataset_capped.csv', index=False)

In [9]:
# Create comparison summary
dataset_summary = pd.DataFrame({
    'Dataset': ['Original', 'Outliers Removed', 'Outliers Capped'],
    'Records': [len(dataset_original), len(dataset_removed), len(dataset_capped)],
    'Min Price': [dataset_original['TARGET(PRICE_IN_LACS)'].min(), 
                  dataset_removed['TARGET(PRICE_IN_LACS)'].min(),
                  dataset_capped['TARGET(PRICE_IN_LACS)'].min()],
    'Max Price': [dataset_original['TARGET(PRICE_IN_LACS)'].max(),
                  dataset_removed['TARGET(PRICE_IN_LACS)'].max(), 
                  dataset_capped['TARGET(PRICE_IN_LACS)'].max()],
    'Mean Price': [dataset_original['TARGET(PRICE_IN_LACS)'].mean(),
                   dataset_removed['TARGET(PRICE_IN_LACS)'].mean(),
                   dataset_capped['TARGET(PRICE_IN_LACS)'].mean()],
    'Std Dev': [dataset_original['TARGET(PRICE_IN_LACS)'].std(),
                dataset_removed['TARGET(PRICE_IN_LACS)'].std(),
                dataset_capped['TARGET(PRICE_IN_LACS)'].std()]
}).round(2)

print("Dataset Comparison Summary")
display(dataset_summary)  

Dataset Comparison Summary


Unnamed: 0,Dataset,Records,Min Price,Max Price,Mean Price,Std Dev
0,Original,29451,0.25,30000.0,142.9,656.88
1,Outliers Removed,26367,0.25,190.0,66.15,40.53
2,Outliers Capped,29451,19.5,300.0,87.7,73.9


In [None]:
dataset_summary.to_csv(config.RESULTS_DIR / 'dataset_comparison_summary.csv', index=False)