In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("./House_Rent_Dataset.csv")

In [4]:
df.head()

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,5/18/2022,2,10000.0,1100.0,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2.0,Contact Owner
1,5/13/2022,2,20000.0,,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1.0,Contact Owner
2,5/16/2022,2,17000.0,1000.0,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1.0,Contact Owner
3,7/4/2022,2,10000.0,,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1.0,Contact Owner
4,5/9/2022,2,7500.0,850.0,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1.0,Contact Owner


In [5]:
df.drop(columns=["Posted On", "Point of Contact", "Area Locality"], inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   BHK                4746 non-null   int64  
 1   Rent               4741 non-null   float64
 2   Size               4738 non-null   float64
 3   Floor              4746 non-null   object 
 4   Area Type          4739 non-null   object 
 5   City               4746 non-null   object 
 6   Furnishing Status  4746 non-null   object 
 7   Tenant Preferred   4746 non-null   object 
 8   Bathroom           4742 non-null   float64
dtypes: float64(3), int64(1), object(5)
memory usage: 333.8+ KB


In [7]:
df.describe()

Unnamed: 0,BHK,Rent,Size,Bathroom
count,4746.0,4741.0,4738.0,4742.0
mean,2.08386,35023.4,967.936049,1.965837
std,0.832256,78142.14,634.562635,0.884904
min,1.0,1200.0,10.0,1.0
25%,2.0,10000.0,550.0,1.0
50%,2.0,16000.0,850.0,2.0
75%,3.0,33000.0,1200.0,2.0
max,6.0,3500000.0,8000.0,10.0


In [8]:
df.isnull().sum()

BHK                  0
Rent                 5
Size                 8
Floor                0
Area Type            7
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             4
dtype: int64

In [9]:
df.dropna(inplace=True)

In [10]:
print("Number of duplicate rows:", df.duplicated().sum())
df.drop_duplicates(inplace=True)

Number of duplicate rows: 41


In [11]:
# Sanity check after cleaning: both totals are expected to be zero.
# `.isnull().sum().sum()` is the grand total over all columns, so the label says "total".
print("Total number of null values:", df.isnull().sum().sum())
print("Number of duplicate rows:", df.duplicated().sum())

Number of null values in each column: 0
Number of duplicate rows: 0


In [12]:
num_cols = df.select_dtypes(exclude=['object']).columns.tolist()
print("Length of numerical columns:", len(num_cols))
print("Numerical columns:", num_cols)

Length of numerical columns: 4
Numerical columns: ['BHK', 'Rent', 'Size', 'Bathroom']


In [13]:
def cap_outliers_iqr(df, cols, factor=1.5):
    """Cap (winsorise) outliers in the given columns using the IQR rule.

    For every column in ``cols`` the fences ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` are computed from that column, and values outside
    the fences are replaced by the nearest fence. Missing values are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is returned.
    cols : iterable of str
        Names of the numeric columns to cap.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns capped.
    """
    df_capped = df.copy()
    for col in cols:
        q1, q3 = df_capped[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        # Vectorised clip replaces the per-element Python lambda.
        df_capped[col] = df_capped[col].clip(
            lower=q1 - factor * iqr,
            upper=q3 + factor * iqr,
        )
    return df_capped

In [14]:
# Cap outliers in every numeric column and rebind `df` to the capped copy.
# NOTE(review): num_cols is ['BHK', 'Rent', 'Size', 'Bathroom'], so this also caps
# the target (Rent) and the discrete counts; the recorded describe() output shows
# fractional maxima (BHK 4.5, Bathroom 3.5) as a result — confirm this is intended.
df = cap_outliers_iqr(df, num_cols)

print(df.describe())

               BHK          Rent         Size     Bathroom
count  4685.000000   4685.000000  4685.000000  4685.000000
mean      2.082284  24677.977161   933.871078     1.924440
std       0.816021  19949.702181   508.073384     0.756378
min       1.000000   1200.000000    10.000000     1.000000
25%       2.000000  10000.000000   558.000000     1.000000
50%       2.000000  16000.000000   850.000000     2.000000
75%       3.000000  33000.000000  1200.000000     2.000000
max       4.500000  67500.000000  2163.000000     3.500000


In [15]:
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Length of categorical columns:", len(cat_cols))
print("Categorical columns:", cat_cols)

Length of categorical columns: 5
Categorical columns: ['Floor', 'Area Type', 'City', 'Furnishing Status', 'Tenant Preferred']


In [16]:
for col in cat_cols[:3]:
  print(df[col].value_counts())

Floor
1 out of 2         370
Ground out of 2    336
2 out of 3         309
2 out of 4         302
1 out of 3         289
                  ... 
1 out of 11          1
6 out of 29          1
28 out of 31         1
1 out of 15          1
2 out of 11          1
Name: count, Length: 480, dtype: int64
Area Type
Super Area     2413
Carpet Area    2270
Built Area        2
Name: count, dtype: int64
City
Mumbai       968
Chennai      880
Bangalore    874
Hyderabad    863
Delhi        599
Kolkata      501
Name: count, dtype: int64


In [17]:
for col in cat_cols[3:6]:
  print(df[col].value_counts())

Furnishing Status
Semi-Furnished    2227
Unfurnished       1786
Furnished          672
Name: count, dtype: int64
Tenant Preferred
Bachelors/Family    3396
Bachelors            819
Family               470
Name: count, dtype: int64


In [18]:
# Split "Floor" strings such as "1 out of 3" on the literal "out of":
# column 0 holds the text before it, column 1 the text after it.
split_floor = df["Floor"].str.split("out of", expand=True)

# Replace missing pieces with "" and strip surrounding whitespace from both columns
split_floor[0] = split_floor[0].fillna("").str.strip()
split_floor[1] = split_floor[1].fillna("").str.strip()

In [19]:
# Map textual floor labels to numeric strings so they can be parsed as numbers
# in the next step (ground floor -> 0, basement levels -> negative values).
split_floor[0] = split_floor[0].replace({
    "Ground": "0",
    "Upper Basement": "-1",
    "Basement": "-2",
    "Lower Basement": "-3"
})

In [20]:
# Parse both parts as nullable integers; anything that is not a number
# (including the "" placeholders) is coerced to <NA>.
df["Current Floor"] = pd.to_numeric(split_floor[0], errors="coerce").astype("Int64")
df["Total Floors"] = pd.to_numeric(split_floor[1], errors="coerce").astype("Int64")

In [21]:
# Drop rows that contain any missing value (e.g. floor values that were coerced
# to <NA> above), then discard the raw "Floor" text column.
df.dropna(inplace=True)
df.drop(columns=["Floor"], inplace=True)

In [22]:
# Remove the "Built Area" rows (only 2 rows in the value_counts output above).
df = df[df["Area Type"] != "Built Area"]

In [23]:
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Length of categorical columns:", len(cat_cols))
print("Categorical columns:", cat_cols)

Length of categorical columns: 4
Categorical columns: ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred']


In [24]:
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [25]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4679 entries, 0 to 4745
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   BHK                                4679 non-null   float64
 1   Rent                               4679 non-null   float64
 2   Size                               4679 non-null   float64
 3   Bathroom                           4679 non-null   float64
 4   Current Floor                      4679 non-null   Int64  
 5   Total Floors                       4679 non-null   Int64  
 6   Area Type_Super Area               4679 non-null   bool   
 7   City_Chennai                       4679 non-null   bool   
 8   City_Delhi                         4679 non-null   bool   
 9   City_Hyderabad                     4679 non-null   bool   
 10  City_Kolkata                       4679 non-null   bool   
 11  City_Mumbai                        4679 non-null   bool   
 1

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.base import clone

In [27]:
df_encoded.head()

Unnamed: 0,BHK,Rent,Size,Bathroom,Current Floor,Total Floors,Area Type_Super Area,City_Chennai,City_Delhi,City_Hyderabad,City_Kolkata,City_Mumbai,Furnishing Status_Semi-Furnished,Furnishing Status_Unfurnished,Tenant Preferred_Bachelors/Family,Tenant Preferred_Family
0,2.0,10000.0,1100.0,2.0,0,2,True,False,False,False,True,False,False,True,True,False
2,2.0,17000.0,1000.0,1.0,1,3,True,False,False,False,True,False,True,False,True,False
4,2.0,7500.0,850.0,1.0,1,2,False,False,False,False,True,False,False,True,False,False
5,2.0,7000.0,600.0,2.0,0,1,True,False,False,False,True,False,False,True,True,False
6,2.0,10000.0,700.0,2.0,0,4,True,False,False,False,True,False,False,True,False,False


In [28]:
# Standardise the numeric feature columns of df_encoded in place.
# NOTE(review): the scaler is fit on the whole of df_encoded, i.e. before the
# train/test split done in a later cell, so test-set statistics leak into the
# training features. Consider splitting first and fitting on the training part only.
# NOTE(review): regression_analysis() scales again inside its pipelines, so the
# features it receives have already been standardised once.
scaler = StandardScaler()
numerical_cols = ["BHK", "Size", "Bathroom", "Current Floor", "Total Floors"]
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

In [29]:
X = df_encoded.drop("Rent", axis=1)
y = df_encoded["Rent"]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
model_results = {}

In [32]:
# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

model_results["LinearRegression"] = {"R2": r2, "MSE": mse}
print(f"üîπ Linear Regression\nR2 Score: {r2:.4f}\nMSE: {mse:.2f}")

üîπ Linear Regression
R2 Score: 0.7408
MSE: 114431897.68


In [33]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("Train R2 Score:", r2_score(y_train, y_train_pred))
print("Test R2 Score:", r2_score(y_test, y_test_pred))

Train R2 Score: 0.7573679891824434
Test R2 Score: 0.7407955371233316


In [34]:
# SVR Regression
# Tune an RBF-kernel SVR over a small C/epsilon grid with 3-fold CV, then
# evaluate the refit best model on the held-out test split.
# NOTE(review): y is the raw Rent column (not scaled) and the recorded run reports
# a negative R2; presumably C <= 10 and epsilon <= 0.2 are far too small for the
# rent scale — consider scaling the target or widening the grid. TODO confirm.
param_grid = {
    "C": [1, 10],
    "epsilon": [0.1, 0.2],
    "kernel": ["rbf"]
}

grid = GridSearchCV(SVR(), param_grid, cv=3, scoring="r2", n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Store the test metrics for the comparison table built later.
model_results["SVR"] = {"R2": r2, "MSE": mse}
print(f"üîπ SVR\nBest Params: {grid.best_params_}\nR2 Score: {r2:.4f}\nMSE: {mse:.2f}")

üîπ SVR
Best Params: {'C': 10, 'epsilon': 0.2, 'kernel': 'rbf'}
R2 Score: -0.0365
MSE: 457592166.60


In [35]:
y_train_pred = grid.predict(X_train)
y_test_pred = grid.predict(X_test)

print("Train R2 Score:", r2_score(y_train, y_train_pred))
print("Test R2 Score:", r2_score(y_test, y_test_pred))

Train R2 Score: -0.006677325969553838
Test R2 Score: -0.036511096714409774


In [36]:
# KNN Regression
param_grid = {
    "n_neighbors": [3, 5, 7],
    "weights": ["uniform", "distance"]
}

grid = GridSearchCV(KNeighborsRegressor(), param_grid, cv=3, scoring="r2", n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

model_results["KNN"] = {"R2": r2, "MSE": mse}
print(f"üîπ KNN\nBest Params: {grid.best_params_}\nR2 Score: {r2:.4f}\nMSE: {mse:.2f}")

üîπ KNN
Best Params: {'n_neighbors': 7, 'weights': 'distance'}
R2 Score: 0.7811
MSE: 96623231.80


In [37]:
y_train_pred = grid.predict(X_train)
y_test_pred = grid.predict(X_test)

print("Train R2 Score:", r2_score(y_train, y_train_pred))
print("Test R2 Score:", r2_score(y_test, y_test_pred))

Train R2 Score: 0.9975874623090998
Test R2 Score: 0.781134688774282


In [38]:
# Decision Tree Regression
# Tune depth and minimum split size with 3-fold CV, then evaluate the refit
# best model on the held-out test split.
param_grid = {
    "max_depth": [5, 10, None],
    "min_samples_split": [2, 5]
}

# Fixed seed (same value as the train/test split) so that the selected
# parameters and the reported scores are reproducible across runs.
grid = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=3, scoring="r2", n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Store the test metrics for the comparison table built later.
model_results["DecisionTree"] = {"R2": r2, "MSE": mse}
print(f"üîπ Decision Tree\nBest Params: {grid.best_params_}\nR2 Score: {r2:.4f}\nMSE: {mse:.2f}")

üîπ Decision Tree
Best Params: {'max_depth': 5, 'min_samples_split': 2}
R2 Score: 0.7673
MSE: 102715338.51


In [39]:
y_train_pred = grid.predict(X_train)
y_test_pred = grid.predict(X_test)

print("Train R2 Score:", r2_score(y_train, y_train_pred))
print("Test R2 Score:", r2_score(y_test, y_test_pred))

Train R2 Score: 0.7911850862438496
Test R2 Score: 0.7673352038385599


In [40]:
# Random Forest Regression
# Tune forest size, depth and minimum split size with 3-fold CV, then evaluate
# the refit best model on the held-out test split.
param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [None, 10],
    "min_samples_split": [2, 5]
}

# Fixed seed (same value as the train/test split): bootstrap sampling and
# feature sub-sampling are random, so without it every run gives different
# best parameters and scores.
grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring="r2", n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Store the test metrics for the comparison table built later.
model_results["RandomForest"] = {"R2": r2, "MSE": mse}
print(f"üîπ Random Forest\nBest Params: {grid.best_params_}\nR2 Score: {r2:.4f}\nMSE: {mse:.2f}")

üîπ Random Forest
Best Params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
R2 Score: 0.8307
MSE: 74750119.15


In [41]:
y_train_pred = grid.predict(X_train)
y_test_pred = grid.predict(X_test)

print("Train R2 Score:", r2_score(y_train, y_train_pred))
print("Test R2 Score:", r2_score(y_test, y_test_pred))

Train R2 Score: 0.9203358003303774
Test R2 Score: 0.8306803882623927


In [42]:
print("üî∏ Compare all models:")
results_df = pd.DataFrame(model_results).T
print(results_df)

üî∏ Compare all models:
                        R2           MSE
LinearRegression  0.740796  1.144319e+08
SVR              -0.036511  4.575922e+08
KNN               0.781135  9.662323e+07
DecisionTree      0.767335  1.027153e+08
RandomForest      0.830680  7.475012e+07


In [52]:
def _seed_estimator(estimator, random_state):
    """Assign ``random_state`` to every seedable parameter of ``estimator``, nested ones included."""
    seedable = {
        name: random_state
        for name in estimator.get_params(deep=True)
        if name == "random_state" or name.endswith("__random_state")
    }
    if seedable:
        estimator.set_params(**seedable)


def _build_models_with_params(random_state=None):
    """Return ``{model name: (estimator, pipeline param grid)}`` for all compared models."""
    models_with_params = {
        "LinearRegression": (LinearRegression(), {}),
        "SVM Regression": (SVR(), {"model__C": [0.1, 1, 10]}),
        "KNN Regression": (KNeighborsRegressor(), {"model__n_neighbors": [3, 5, 7]}),
        "DecisionTree": (DecisionTreeRegressor(), {"model__max_depth": [None, 5, 10]}),
        "RandomForest": (RandomForestRegressor(), {"model__n_estimators": [50, 100]}),
        "Bagging": (BaggingRegressor(estimator=DecisionTreeRegressor(), n_estimators=50), {}),
        "Boosting": (GradientBoostingRegressor(), {"model__n_estimators": [50, 100], "model__learning_rate": [0.05, 0.1]}),
        "Voting": (
            VotingRegressor(estimators=[
                ('lr', LinearRegression()),
                ('svr', SVR()),
                ('knn', KNeighborsRegressor())
            ]), {}
        ),
        "Stacking": (
            StackingRegressor(
                estimators=[
                    ('svr', SVR()),
                    ('knn', KNeighborsRegressor()),
                    ('dt', DecisionTreeRegressor())
                ],
                final_estimator=LinearRegression()
            ), {}
        )
    }
    # With the default (None) the estimators are left exactly as constructed.
    if random_state is not None:
        for estimator, _ in models_with_params.values():
            _seed_estimator(estimator, random_state)
    return models_with_params


def _run_grid_searches(scalers, models_with_params, X_train, y_train, X_test, y_test):
    """Grid-search every scaler/model pipeline and return one row of test metrics per pair."""
    # k='all' keeps every feature, so this step currently acts as a pass-through.
    feature_selector = SelectKBest(score_func=f_regression, k='all')
    results = []

    for scaler_name, scaler in scalers.items():
        for model_name, (model, param_grid) in models_with_params.items():
            pipe = Pipeline([
                ('scaler', scaler),
                ('select', feature_selector),
                ('model', model)
            ])

            grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)
            grid.fit(X_train, y_train)

            best_model = grid.best_estimator_
            y_pred = best_model.predict(X_test)

            results.append({
                "Scaler": scaler_name,
                "Model": model_name,
                "Best Params": grid.best_params_,
                "R2 Score": r2_score(y_test, y_pred),
                "MSE": mean_squared_error(y_test, y_pred),
                "MAE": mean_absolute_error(y_test, y_pred)
            })

    return pd.DataFrame(results)


def _format_ranked_results(df_results):
    """Return a copy sorted by R2 (descending) with the metrics formatted for display."""
    ranked = df_results.sort_values(by="R2 Score", ascending=False)
    ranked["R2 Score"] = ranked["R2 Score"].apply(lambda x: round(x, 4))
    # MSE and MAE become strings from here on; they are for display only.
    ranked["MSE"] = ranked["MSE"].apply(lambda x: f"{x:.2e}")
    ranked["MAE"] = ranked["MAE"].apply(lambda x: f"{x:,.2f}")
    return ranked


def _print_metric_averages(df_results, group_col, header):
    """Print the mean R2/MSE/MAE per value of ``group_col``, best R2 first."""
    averages = df_results.groupby(group_col)[["R2 Score", "MSE", "MAE"]].mean().reset_index()
    averages = averages.sort_values(by="R2 Score", ascending=False)
    print(header)
    print(averages.to_string(index=False, formatters={
        "R2 Score": "{:.4f}".format,
        "MSE": "{:.2e}".format,
        "MAE": "{:,.2f}".format
    }))


def _print_scaler_extremes(df_results):
    """For each scaler, print the rows with the highest R2 and the lowest R2, MSE and MAE."""
    shown_cols = ["Model", "R2 Score", "MSE", "MAE"]
    lowest_sections = (
        ("\nüîª Lowest R¬≤:", "R2 Score"),
        ("\nüîª Lowest MSE:", "MSE"),
        ("\nüîª Lowest MAE:", "MAE"),
    )
    for scaler in df_results["Scaler"].unique():
        subset = df_results[df_results["Scaler"] == scaler]
        print(f"\nüìä Scaler Results: {scaler}\n")
        print("üî∫ Highest R¬≤:")
        print(subset.loc[subset["R2 Score"].idxmax()][shown_cols].to_string())
        for label, metric in lowest_sections:
            print(label)
            print(subset.loc[subset[metric].idxmin()][shown_cols].to_string())


def _print_train_vs_test(scalers, models_with_params, X_train, y_train, X_test, y_test):
    """Print train and test R2 for every scaler/model pair.

    Each model is refit here as a fresh clone with its constructor settings,
    not with the parameters selected by the grid search.
    """
    print("\nüß™ Train vs Test R¬≤ Score Comparison:\n")
    for scaler_name, scaler in scalers.items():
        print(f"\nüöÄ Scaler: {scaler_name}")
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        for model_name, (model, _) in models_with_params.items():
            model_clone = clone(model)
            model_clone.fit(X_train_scaled, y_train)
            train_score = model_clone.score(X_train_scaled, y_train)
            test_score = model_clone.score(X_test_scaled, y_test)
            print(f"üîπ {model_name:18} | Train R¬≤: {train_score:.4f} | Test R¬≤: {test_score:.4f}")


def _print_best_per_model(df_results):
    """Print, for each model, its best-scoring scaler configuration (highest test R2)."""
    best_per_model = df_results.loc[df_results.groupby("Model")["R2 Score"].idxmax()]
    best_per_model = best_per_model.sort_values(by="R2 Score", ascending=False)
    print("üìà Best result for each model:\n")
    for _, row in best_per_model.iterrows():
        print(f"üß† Model        : {row['Model']}")
        print(f"   üîß Scaler    : {row['Scaler']}")
        print(f"   üéØ R2 Score  : {row['R2 Score']:.4f}")
        print(f"   üß™ MSE       : {row['MSE']:.2f}")
        print(f"   üìâ MAE       : {row['MAE']:.2f}")
        print(f"   ‚öôÔ∏è Best Params: {row['Best Params']}\n")


def regression_analysis(X_train, y_train, X_test, y_test, random_state=None):
    """Compare several regressors under three feature scalers and print a report.

    Every scaler/model combination is wrapped in a ``scaler -> SelectKBest ->
    model`` pipeline, tuned with a 5-fold ``GridSearchCV`` (scoring: R2) on the
    training data and evaluated on the test data. The function then prints:
    the ranked results, average metrics per model and per scaler, the best and
    worst rows per scaler, a train-vs-test R2 comparison, and the best
    configuration for each model.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target used for all reported metrics.
    random_state : int or None, default None
        When given, it is set on every estimator (including nested ones) that
        exposes a ``random_state`` parameter, making the run reproducible.
        With ``None`` the estimators are left unseeded.

    Returns
    -------
    pandas.DataFrame
        One row per scaler/model pair, sorted by R2 descending. "R2 Score" is
        rounded to 4 decimals; "MSE" and "MAE" are formatted strings.
    """
    scalers = {
        "StandardScaler": StandardScaler(),
        "MinMaxScaler": MinMaxScaler(),
        "RobustScaler": RobustScaler()
    }
    models_with_params = _build_models_with_params(random_state)

    # Raw numeric metrics; used for every aggregate below.
    df_results = _run_grid_searches(scalers, models_with_params, X_train, y_train, X_test, y_test)

    # Ranked and display-formatted copy; this is also the return value.
    df_results_sorted = _format_ranked_results(df_results)
    df_display = df_results_sorted[["Model", "Scaler", "R2 Score", "MSE", "MAE", "Best Params"]]
    print("\nüìä üîù Top Results by R¬≤ Score:\n")
    print(df_display.to_string(index=False))

    # df_results is still numeric here, so no re-conversion is needed before averaging.
    _print_metric_averages(df_results, "Model", "\nüìå Average Performance per Model:\n")
    _print_metric_averages(df_results, "Scaler", "\nüìå Average Performance per Scaler:\n")

    _print_scaler_extremes(df_results)
    _print_train_vs_test(scalers, models_with_params, X_train, y_train, X_test, y_test)
    _print_best_per_model(df_results)

    return df_results_sorted

In [53]:
results = regression_analysis(X_train, y_train, X_test, y_test)


üìä üîù Top Results by R¬≤ Score:

           Model         Scaler  R2 Score      MSE       MAE                                               Best Params
         Bagging StandardScaler    0.8263 7.67e+07  5,556.57                                                        {}
    RandomForest   RobustScaler    0.8253 7.71e+07  5,543.38                               {'model__n_estimators': 50}
    RandomForest StandardScaler    0.8250 7.72e+07  5,562.24                              {'model__n_estimators': 100}
         Bagging   RobustScaler    0.8226 7.83e+07  5,600.76                                                        {}
        Boosting   MinMaxScaler    0.8219 7.86e+07  5,666.63 {'model__learning_rate': 0.1, 'model__n_estimators': 100}
        Boosting StandardScaler    0.8218 7.87e+07  5,666.78 {'model__learning_rate': 0.1, 'model__n_estimators': 100}
        Boosting   RobustScaler    0.8218 7.87e+07  5,667.69 {'model__learning_rate': 0.1, 'model__n_estimators': 100}
    Random