In [16]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Collects one R^2 score per model name; evaluate_model writes into it.
results = {}

# Read the rental listings from the CSV export (replace with your dataset path).
data = pd.read_csv('House_Rent_Dataset - House_Rent_Dataset(1).xlsx - House_Rent_Dataset - House_Rent.csv')

# Columns handed to the preprocessor, grouped by the transformation they get.
numerical_features = ['BHK', 'Size', 'Bathroom']
categorical_features = ['Area Type', 'City', 'Furnishing Status']

# The target is the 'Rent' column; the feature frame is everything else.
y = data['Rent']
X = data.drop(columns='Rent')

# Hold out 20% of the rows for the final evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the numeric columns and one-hot encode the categorical ones;
# categories not seen during fitting are ignored at transform time.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Function to evaluate and print the R^2 score
def evaluate_model(name, model, X_test, y_test):
    """Score a fitted model on held-out data, record the score, and print it.

    Args:
        name: Label used as the key in the module-level ``results`` dict and
            as the prefix of the printed line.
        model: Object exposing ``predict(X_test)``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values compared against the predictions.

    Side effects:
        Stores the R^2 score under ``results[name]`` (overwriting any earlier
        entry with the same name) and prints it with four decimal places.
    """
    score = r2_score(y_test, model.predict(X_test))
    results[name] = score
    print(f"{name} R^2 Score: {score:.4f}")

# 1. Linear Regression Model
# Baseline with no hyperparameter search: preprocessing followed directly by
# a linear regression, fitted on the training split and scored on the test split.
linear_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])
linear_pipeline.fit(X_train, y_train)
evaluate_model("Linear Regression", linear_pipeline, X_test, y_test)

# 2. Random Forest Regressor
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),
])
# Every list holds a single value, so the search cross-validates exactly one
# configuration. The 'model__' prefix routes each key to the 'model' step.
rf_params = dict(
    model__n_estimators=[200],
    model__max_depth=[10],
    model__min_samples_split=[10],
    model__min_samples_leaf=[5],
    model__random_state=[42],
)
# 5-fold cross-validation scored by R^2, using all available cores.
rf_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)
evaluate_model("Random Forest Regressor", rf_grid.best_estimator_, X_test, y_test)

# 3. Gradient Boosting Regressor
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42)),
])
# Single-value lists: only one configuration is cross-validated.
gb_params = dict(
    model__n_estimators=[150],
    model__learning_rate=[0.1],
    model__max_depth=[3],
)
# 5-fold cross-validation scored by R^2, using all available cores.
gb_grid = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=gb_params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
gb_grid.fit(X_train, y_train)
evaluate_model("Gradient Boosting Regressor", gb_grid.best_estimator_, X_test, y_test)

# 4. AdaBoost Regressor
ada_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', AdaBoostRegressor(random_state=42)),
])
# 3 x 3 grid: nine candidate configurations are compared.
ada_params = dict(
    model__n_estimators=[200, 300, 400],
    model__learning_rate=[0.01, 0.1, 1.0],
)
# 5-fold cross-validation scored by R^2, using all available cores.
ada_grid = GridSearchCV(
    estimator=ada_pipeline,
    param_grid=ada_params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
ada_grid.fit(X_train, y_train)
evaluate_model("AdaBoost Regressor", ada_grid.best_estimator_, X_test, y_test)

# 5. XGBoost Regressor
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42)),
])
# 3 x 3 x 3 grid: twenty-seven candidate configurations are compared.
xgb_params = dict(
    model__n_estimators=[100, 200, 300],
    model__learning_rate=[0.01, 0.1, 0.2],
    model__max_depth=[3, 5, 7],
)
# 5-fold cross-validation scored by R^2, using all available cores.
xgb_grid = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)
evaluate_model("XGBoost Regressor", xgb_grid.best_estimator_, X_test, y_test)

# 6. Ensemble Voting Regressor
# Each voting member is the 'model' step taken from the best pipeline found by
# the corresponding grid search, in the order rf, gb, ada, xgb.
ensemble_model = VotingRegressor(estimators=[
    (label, search.best_estimator_.named_steps['model'])
    for label, search in (
        ('rf', rf_grid),
        ('gb', gb_grid),
        ('ada', ada_grid),
        ('xgb', xgb_grid),
    )
])
# Wrap the ensemble with the same preprocessing step as the individual models,
# then fit on the training split and score on the test split.
ensemble_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ensemble_model),
])
ensemble_pipeline.fit(X_train, y_train)
evaluate_model("Ensemble Model (Voting Regressor)", ensemble_pipeline, X_test, y_test)

# Print a summary of every score recorded in `results`, one line per model,
# in the order the models were evaluated (dict insertion order).
print("\nModel R^2 Scores:")
for model_name, score in results.items():
    # Four decimal places, matching the per-model lines printed by evaluate_model.
    print(f"{model_name}: {score:.4f}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   BHK                4746 non-null   int64 
 1   Rent               4746 non-null   int64 
 2   Size               4746 non-null   int64 
 3   Area Type          4746 non-null   object
 4   Area Locality      4746 non-null   object
 5   City               4746 non-null   object
 6   Furnishing Status  4746 non-null   object
 7   Bathroom           4746 non-null   int64 
 8   Point of Contact   4746 non-null   object
dtypes: int64(4), object(5)
memory usage: 333.8+ KB

Missing values per column:
BHK                  0
Rent                 0
Size                 0
Area Type            0
Area Locality        0
City                 0
Furnishing Status    0
Bathroom             0
Point of Contact     0
dtype: int64

Pearson Correlation Coefficients with Rent:
Rent                 1.000000
Ba