In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import numpy as np

# Read the rental listings from the exported CSV into a DataFrame.
csv_path = 'House_Rent_Dataset - House_Rent_Dataset(1).xlsx - House_Rent_Dataset - House_Rent.csv'
data = pd.read_csv(csv_path)

# First look at the data: column names, dtypes and non-null counts.
data.info()

# Data cleaning: count the missing entries in each column and report them.
missing_per_column = data.isnull().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Keep only the rows in which every column is populated.
data = data.dropna()

# Replace every object-dtype column with integer codes. One fitted encoder is
# kept per column (keyed by column name) so codes can be mapped back to labels.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {col_name: LabelEncoder() for col_name in categorical_columns}
for col_name, encoder in label_encoders.items():
    data[col_name] = encoder.fit_transform(data[col_name])

# Correlation of every column with Rent (DataFrame.corr default method),
# listed from the largest coefficient to the smallest.
print("\nPearson Correlation Coefficients with Rent:")
correlation_matrix = data.corr()
pearson_corr = correlation_matrix['Rent'].sort_values(ascending=False)
print(pearson_corr)

# Feature columns used to predict Rent: every column of the dataset except the
# target itself and 'Point of Contact'.
selected_features = ['Bathroom', 'Size', 'BHK', 'City', 'Area Locality', 'Furnishing Status', 'Area Type']
X = data[selected_features]
y = data['Rent']

# Train-test split.
# The split happens BEFORE scaling so that the held-out rows never influence
# the scaler's mean/variance (fitting the scaler on all rows leaks test-set
# statistics into the training data).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: learn the statistics on the training rows only, then
# apply that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# NOTE: X itself is left unscaled. Code that consumes the full X (such as the
# cross-validation at the end of this cell) therefore sees raw feature values;
# the estimators used in this cell are tree-based ensembles, which are not
# sensitive to per-feature standardisation.

# Candidate ensemble regressors keyed by display name. Hyperparameters are set
# by hand and every estimator uses the same fixed seed.
models = {}
models['Random Forest'] = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
models['Gradient Boosting'] = GradientBoostingRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
models['AdaBoost'] = AdaBoostRegressor(
    n_estimators=200,
    learning_rate=0.1,
    random_state=42,
)
models['XGBoost'] = XGBRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

# Example hyperparameter search for a Random Forest: exhaustive grid over three
# settings, scored by R^2 using 3 cross-validation folds of the training split.
param_grid_rf = dict(
    n_estimators=[100, 150, 200],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5, 10],
)

rf = RandomForestRegressor(random_state=42)
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    scoring='r2',
    cv=3,
    n_jobs=-1,  # use all available cores
)
grid_search_rf.fit(X_train, y_train)

# The winning combination is only reported here; it is not written back into
# the `models` dictionary.
print(f"Best Parameters for Random Forest: {grid_search_rf.best_params_}")

# Model stacking: the entries of `models` become the base estimators and a
# gradient-boosting regressor combines their predictions.
# `models.items()` already yields the (name, estimator) pairs StackingRegressor
# expects, so it is materialised directly.
tuple_estimators = list(models.items())
stacked_model = StackingRegressor(
    estimators=tuple_estimators,
    # Seeded like every other estimator in this cell so repeated runs give the
    # same stacked model.
    final_estimator=GradientBoostingRegressor(random_state=42),
)
stacked_model.fit(X_train, y_train)

# Fit each base model on the training split and report its error on the test split.
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    r2 = r2_score(y_true=y_test, y_pred=y_pred)
    print(f"{name} Mean Squared Error: {mse:.2f}")
    print(f"{name} R^2 Score: {r2:.2f}")

# Same two metrics for the already-fitted stacked ensemble.
y_pred_stacked = stacked_model.predict(X_test)
mse_stacked = mean_squared_error(y_true=y_test, y_pred=y_pred_stacked)
r2_stacked = r2_score(y_true=y_test, y_pred=y_pred_stacked)
print(f"\nStacked Model Mean Squared Error: {mse_stacked:.2f}")
print(f"Stacked Model R^2 Score: {r2_stacked:.2f}")

# Score the stacked ensemble with 5 cross-validation folds over the full
# feature matrix and target, using R^2; print the per-fold scores and their mean.
cv_scores = cross_val_score(
    estimator=stacked_model,
    X=X,
    y=y,
    scoring='r2',
    cv=5,
)
print(f"\nStacked Model Cross-Validation R^2 Scores: {cv_scores}")
print(f"Mean R^2 Score: {np.mean(cv_scores):.2f}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   BHK                4746 non-null   int64 
 1   Rent               4746 non-null   int64 
 2   Size               4746 non-null   int64 
 3   Area Type          4746 non-null   object
 4   Area Locality      4746 non-null   object
 5   City               4746 non-null   object
 6   Furnishing Status  4746 non-null   object
 7   Bathroom           4746 non-null   int64 
 8   Point of Contact   4746 non-null   object
dtypes: int64(4), object(5)
memory usage: 333.8+ KB

Missing values per column:
BHK                  0
Rent                 0
Size                 0
Area Type            0
Area Locality        0
City                 0
Furnishing Status    0
Bathroom             0
Point of Contact     0
dtype: int64

Pearson Correlation Coefficients with Rent:
Rent                 1.000000
Ba