In [34]:
# Setup code
import config
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from IPython.display import display

In [23]:
# Load the processed datasets
dataset_original = pd.read_csv(config.PROCESSED_ORIGINAL_FILE)
dataset_removed = pd.read_csv(config.PROCESSED_REMOVED_FILE)
dataset_capped = pd.read_csv(config.PROCESSED_CAPPED_FILE)

# Display the first few rows of each dataset
display(dataset_original.head())
display(dataset_removed.head())
display(dataset_capped.head())

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


In [24]:
# Analyse Categorical Features Cardinality
cardinality_analysis = []
for feature in config.CATEGORICAL_FEATURES:
  if feature in dataset_original.columns:
    unique_count = dataset_original[feature].nunique()
    total_records = len(dataset_original)
    ratio = unique_count / total_records
    # Drop features with high cardinality or high ratio to avoid excessive dimensionality
    if unique_count > 100 or ratio > 0.5:
      strategy = 'DROP'
    else:
      strategy = 'ONE-HOT'

    cardinality_analysis.append({
      'Feature': feature,
      'Unique Values': unique_count,
      'Cardinality Ratio': round(ratio, 3),
      'Encoding Strategy': strategy
    })

cardinality_df = pd.DataFrame(cardinality_analysis)
print(f"Categorical Feature Cardinality Analysis:")
display(cardinality_df)

Categorical Feature Cardinality Analysis:


Unnamed: 0,Feature,Unique Values,Cardinality Ratio,Encoding Strategy
0,POSTED_BY,3,0.0,ONE-HOT
1,UNDER_CONSTRUCTION,2,0.0,ONE-HOT
2,BHK_OR_RK,2,0.0,ONE-HOT
3,RERA,2,0.0,ONE-HOT
4,READY_TO_MOVE,2,0.0,ONE-HOT
5,RESALE,2,0.0,ONE-HOT
6,ADDRESS,6899,0.234,DROP


In [25]:
# Droping features with high cardinality
# Features to drop from the table above have been added to the config file
# Clean the datasets by dropping specified features

dataset_original_clean = dataset_original.drop(columns=config.FEATURES_TO_DROP)
dataset_removed_clean = dataset_removed.drop(columns=config.FEATURES_TO_DROP)
dataset_capped_clean = dataset_capped.drop(columns=config.FEATURES_TO_DROP)

In [None]:
# Feature Engineering Function
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Applies a set of feature engineering steps to the dataset.

    This function creates new features from existing columns, such as ratios,
    categorical bins, and interaction terms, to enhance the predictive
    power of the data for modeling.

    Parameters:
        df (pd.DataFrame): The input DataFrame to which new features will be added.

    Returns:
        pd.DataFrame: A new DataFrame containing the original columns plus the
                      newly engineered features.
    """
    df_engineered = df.copy()
    
    # Create price per square foot
    df_engineered['PRICE_PER_SQFT'] = df_engineered['TARGET(PRICE_IN_LACS)'] / df_engineered['SQUARE_FT']
    
    # Create total rooms indicator (if BHK_NO exists)
    if 'BHK_NO' in df_engineered.columns:
        df_engineered['TOTAL_ROOMS'] = df_engineered['BHK_NO']
        df_engineered['LARGE_PROPERTY'] = (df_engineered['BHK_NO'] >= 4).astype(int)
    
    # Create size categories from continuous square footage
    df_engineered['SIZE_CATEGORY'] = pd.cut(df_engineered['SQUARE_FT'], 
                                          bins=[0, 500, 1000, 2000, float('inf')], 
                                          labels=['Small', 'Medium', 'Large', 'XLarge'])
    
    # Create a basic geographic interaction feature
    df_engineered['LAT_LONG_INTERACTION'] = df_engineered['LATITUDE'] * df_engineered['LONGITUDE']
    
    return df_engineered

# Apply feature engineering to the cleaned datasets
dataset_original_eng = engineer_features(dataset_original_clean)
dataset_removed_eng = engineer_features(dataset_removed_clean)
dataset_capped_eng = engineer_features(dataset_capped_clean)

In [None]:
# Cell 6: Handle Categorical Encoding
def encode_categorical_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Encodes all specified categorical features using one-hot encoding.

    This function combines the original and newly engineered categorical features
    defined in the config file, then applies one-hot encoding to prepare them
    for machine learning models.

    Parameters:
        df (pd.DataFrame): The input DataFrame with features to be encoded.

    Returns:
        pd.DataFrame: A new DataFrame with categorical columns replaced by their
                      one-hot encoded representations.
    """
    df_encoded = df.copy()
    
    all_categorical_to_encode = [
        col for col in config.CATEGORICAL_FEATURES if col not in config.FEATURES_TO_DROP
    ] + config.ENGINEERED_CATEGORICAL_FEATURES
    
    # One-hot encode categorical features
    for col in all_categorical_to_encode:
        if col in df_encoded.columns:
            # Get dummies and drop first to avoid multicollinearity
            dummies = pd.get_dummies(df_encoded[col], prefix=col, drop_first=True)
            df_encoded = pd.concat([df_encoded, dummies], axis=1)
            df_encoded.drop(col, axis=1, inplace=True)
    
    return df_encoded

# Apply encoding to all datasets
dataset_original_encoded = encode_categorical_features(dataset_original_eng)
dataset_removed_encoded = encode_categorical_features(dataset_removed_eng)
dataset_capped_encoded = encode_categorical_features(dataset_capped_eng)


In [None]:
# Feature Scaling Preparation
def prepare_features_for_scaling(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series, list]:
    """
    Separates the dataset into features (X) and the target variable (y).

    This function uses the master lists from the config file to identify
    the final set of numerical features that will require scaling before
    being passed to a model.

    Parameters:
        df (pd.DataFrame): The fully processed and encoded DataFrame.

    Returns:
        tuple: A tuple containing:
            - X (pd.DataFrame): The DataFrame of features for modeling.
            - y (pd.Series): The Series containing the target variable.
            - numerical_features (list): A list of column names in X that are numerical.
    """
    # Separate the target variable using the name from the config file
    y = df[config.TARGET_VARIABLE]
    X = df.drop(columns=[config.TARGET_VARIABLE])

    # Dynamically build the definitive list of all numerical features
    # by combining the original and engineered lists from the config file.
    all_numerical_features = config.NUMERICAL_FEATURES + config.ENGINEERED_NUMERICAL_FEATURES

    # Ensure we only select features that actually exist in the final DataFrame X
    final_numerical_to_scale = [
        col for col in all_numerical_features if col in X.columns
    ]

    return X, y, final_numerical_to_scale

# Prepare all datasets
X_original, y_original, num_features_original = prepare_features_for_scaling(dataset_original_encoded)
X_removed, y_removed, _ = prepare_features_for_scaling(dataset_removed_encoded)
X_capped, y_capped, _ = prepare_features_for_scaling(dataset_capped_encoded)

In [None]:
# Perform Train-Test Split for All Datasets

# Split the original dataset into training and testing sets
X_original_train, X_original_test, y_original_train, y_original_test = train_test_split(
    X_original, y_original, test_size=0.2, random_state=42
)

# Split the removed dataset into training and testing sets
X_removed_train, X_removed_test, y_removed_train, y_removed_test = train_test_split(
    X_removed, y_removed, test_size=0.2, random_state=42
)

# Split the capped dataset into training and testing sets
X_capped_train, X_capped_test, y_capped_train, y_capped_test = train_test_split(
    X_capped, y_capped, test_size=0.2, random_state=42
)

# Create a summary table of the splits
split_summary = pd.DataFrame({
    'Dataset': ['Original', 'Removed Outliers', 'Capped Outliers'],
    'Train Size': [len(X_original_train), len(X_removed_train), len(X_capped_train)],
    'Test Size': [len(X_original_test), len(X_removed_test), len(X_capped_test)]
})

display(split_summary)
split_summary.to_csv(config.RESULTS_DIR / 'split_summary.csv', index=False)

Unnamed: 0,Dataset,Train Size,Test Size
0,Original,23560,5891
1,Removed Outliers,21093,5274
2,Capped Outliers,23560,5891


In [None]:
# Summary of the feature engineering and encoding steps
def create_feature_summary(original_df: pd.DataFrame, final_df: pd.DataFrame) -> dict:
    """
    Creates a summary dictionary comparing a DataFrame before and after feature engineering.

    This utility function quantifies the changes made during the feature engineering
    process, providing key metrics suitable for reporting and analysis.

    Parameters:
        original_df (pd.DataFrame): The DataFrame before any feature engineering was applied
                                    (but after dropping high-cardinality columns).
        final_df (pd.DataFrame): The fully processed and encoded DataFrame, ready for modeling.

    Returns:
        dict: A dictionary containing summary statistics about the feature transformation.
    """
    # Calculate the number of new columns that are one-hot encoded dummies
    # This is a heuristic that counts columns containing an underscore
    encoded_dummies = len([
        col for col in final_df.columns 
        if '_' in col and col != config.TARGET_VARIABLE
    ])
    
    summary = {
        'Original Features': len(original_df.columns),
        'Final Features': len(final_df.columns),
        'Features Added': len(final_df.columns) - len(original_df.columns),
        'Encoded Dummy Features': encoded_dummies,
        'Final Numerical Features': len(final_df.select_dtypes(include=np.number).columns) - 1,  # -1 for target
        'Total Samples': len(final_df)
    }
    
    return summary

# Create summaries for each dataset
original_summary = create_feature_summary(dataset_original_clean, dataset_original_encoded)
removed_summary = create_feature_summary(dataset_removed_clean, dataset_removed_encoded)
capped_summary = create_feature_summary(dataset_capped_clean, dataset_capped_encoded)

# Combine summaries into a DataFrame for display
summary_df = pd.DataFrame([
    {'Dataset': 'Original', **original_summary},
    {'Dataset': 'Removed Outliers', **removed_summary},
    {'Dataset': 'Capped Outliers', **capped_summary}
])

# Display the summary DataFrame
display(summary_df)
summary_df.to_csv(config.RESULTS_DIR / 'feature_summary.csv', index=False)

Unnamed: 0,Dataset,Original Features,Final Features,Features Added,Encoded Dummy Features,Final Numerical Features,Total Samples
0,Original,11,17,6,14,6,29451
1,Removed Outliers,11,17,6,14,6,26367
2,Capped Outliers,11,17,6,14,6,29451


In [35]:
# Save all data splits and feature lists for modelling

# Save the original dataset splits
X_original_train.to_csv(config.PROCESSED_DATA_DIR / 'X_original_train.csv', index=False)
X_original_test.to_csv(config.PROCESSED_DATA_DIR / 'X_original_test.csv', index=False)
y_original_train.to_csv(config.PROCESSED_DATA_DIR / 'y_original_train.csv', index=False)
y_original_test.to_csv(config.PROCESSED_DATA_DIR / 'y_original_test.csv', index=False)

# Save the removed outliers dataset splits
X_removed_train.to_csv(config.PROCESSED_DATA_DIR / 'X_removed_train.csv', index=False)
X_removed_test.to_csv(config.PROCESSED_DATA_DIR / 'X_removed_test.csv', index=False)
y_removed_train.to_csv(config.PROCESSED_DATA_DIR / 'y_removed_train.csv', index=False)
y_removed_test.to_csv(config.PROCESSED_DATA_DIR / 'y_removed_test.csv', index=False)

# Save the capped outliers dataset splits
X_capped_train.to_csv(config.PROCESSED_DATA_DIR / 'X_capped_train.csv', index=False)
X_capped_test.to_csv(config.PROCESSED_DATA_DIR / 'X_capped_test.csv', index=False)
y_capped_train.to_csv(config.PROCESSED_DATA_DIR / 'y_capped_train.csv', index=False)
y_capped_test.to_csv(config.PROCESSED_DATA_DIR / 'y_capped_test.csv', index=False)

# Save the final feature lists
with open(config.PROCESSED_DATA_DIR / 'numerical_features_to_scale.json', 'w') as f:
  json.dump(num_features_original, f)