In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

In [45]:
# Load the cleaned Zillow rent CSV into a pandas DataFrame
rent_csv_path = 'zillow_rent_cleaned.csv'
df = pd.read_csv(rent_csv_path)

In [65]:
# Parse the textual Date column (full month name + 4-digit year);
# values that do not match the format are coerced to NaT.
date_parsed = pd.to_datetime(df['Date'], format='%B %Y', errors='coerce')

# Build the modelling frame without touching `df`: the raw Date column is
# removed and replaced by numeric Year / Month features appended at the end.
df_model = (
    df.drop(columns=['Date'])
    .assign(
        Year=date_parsed.dt.year,
        Month=date_parsed.dt.month,
    )
)

In [82]:
# Preview the first rows of the modelling frame (Year/Month present, Date removed)
df_model.head()

Unnamed: 0,City Code,City,Metro,County,State,Population Rank,RentPrice,Year,Month
0,6181,New York,New York,Queens,NY,1,1327.100458,2010,11
1,12447,Los Angeles,Los Angeles,Los Angeles,CA,2,2184.0,2010,11
2,17426,Chicago,Chicago,Cook,IL,3,1563.0,2010,11
3,39051,Houston,Houston,Harris,TX,4,1198.0,2010,11
4,13271,Philadelphia,Philadelphia,Philadelphia,PA,5,1092.0,2010,11


In [4]:
# Separate the regression target from the predictor columns
target_column = 'RentPrice'
y = df_model[target_column]
X = df_model.drop(columns=[target_column])

In [81]:
import joblib
from sklearn.linear_model import LinearRegression

# Restore the preprocessed train/test splits that were saved with joblib.
# The files are read in the same order as the names on the left-hand side.
(
    X_train_processed,
    X_test_processed,
    y_train,
    y_test,
) = (
    joblib.load(artifact)
    for artifact in (
        "X_train_processed.pkl",
        "X_test_processed.pkl",
        "y_train.pkl",
        "y_test.pkl",
    )
)

# Preprocessor object, loaded in case new data needs to be transformed
preprocessor = joblib.load("preprocessor.pkl")

In [84]:
# Fit the linear model on the preprocessed training data
# (`fit` returns the estimator itself, so construction and fitting can be chained)
linreg = LinearRegression().fit(X_train_processed, y_train)

# Generate predictions for the preprocessed test set
y_pred = linreg.predict(X_test_processed)

In [75]:
# Display the linear-regression test predictions (NumPy abbreviates long arrays with "...")
y_pred

array([1863.58034127, 2276.35587917, 1567.22895141, ..., 1457.27511348,
       1402.77829779, 1995.83929284])

In [76]:
# Report test-set error metrics for the linear-regression model.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the explicit square root works on both old and new versions.
print("Linear Regression")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R²:", r2_score(y_test, y_pred))

Linear Regression
MAE: 150.36831721652246
RMSE: 269.65185527537665
R²: 0.8866883403155227


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [12]:
# Tune on a subset to keep the search fast: the first 10,000 training rows.
n_tune_rows = 10000
X_sub = X_train_processed[:n_tune_rows]
y_sub = y_train[:n_tune_rows]

rf = RandomForestRegressor(random_state=42, n_jobs=1)

# 2 * 3 * 2 = 12 candidate settings; the randomized search samples 5 of them.
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

rf_search = RandomizedSearchCV(
    rf,
    param_distributions=param_grid_rf,
    n_iter=5,
    cv=2,
    scoring='neg_mean_absolute_error',
    n_jobs=1,
    random_state=42,  # fix which candidates are sampled so reruns are reproducible
)

# NOTE(review): with the default refit=True the best estimator is refit on
# X_sub / y_sub only, so its test metrics come from a model trained on the
# subset rather than on the full training data.
rf_search.fit(X_sub, y_sub)

In [13]:
# Predict on the preprocessed test set via the fitted search object
y_pred_rf = rf_search.predict(X_test_processed)

In [14]:
# Display the random-forest test predictions (NumPy abbreviates long arrays with "...")
y_pred_rf

array([1457.45593671, 2104.99596199, 1510.07930952, ..., 1376.79488375,
       1470.26708981, 2228.25689514])

In [15]:
# Report test-set error metrics for the tuned random forest.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the explicit square root works on both old and new versions.
print("Random Forest (Tuned)")
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print("R²:", r2_score(y_test, y_pred_rf))

Random Forest (Tuned)
MAE: 239.19169801779998
RMSE: 519.0841350772798
R²: 0.5801025022799855


In [16]:
# Gradient Boosting Regressor

from sklearn.ensemble import GradientBoostingRegressor

In [32]:
# Search space for gradient boosting: only the number of estimators varies;
# learning rate and tree depth each have a single value.
param_grid_gbr = dict(
    n_estimators=[30, 50],
    learning_rate=[0.1],
    max_depth=[3],
)

# Base estimator with a fixed seed
gbr = GradientBoostingRegressor(random_state=42)

In [33]:
# Number of distinct settings in the grid (every entry is a finite list).
# RandomizedSearchCV warns when n_iter exceeds this, so the request is capped.
n_candidates = int(np.prod([len(values) for values in param_grid_gbr.values()]))

gbr_search = RandomizedSearchCV(
    gbr,
    param_distributions=param_grid_gbr,
    n_iter=min(5, n_candidates),
    cv=2,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42,  # fix which candidates are sampled so reruns are reproducible
)

# Search on the full preprocessed training data, then predict on the test set
# through the fitted search object.
gbr_search.fit(X_train_processed, y_train)
y_pred_gbr = gbr_search.predict(X_test_processed)



In [34]:
# Display the gradient-boosting test predictions (NumPy abbreviates long arrays with "...")
y_pred_gbr

array([2031.84163694, 2132.40087542, 1429.96812096, ..., 1268.18573413,
       1457.7285671 , 2002.77494019])

In [35]:
# Report test-set error metrics for the tuned gradient-boosting model.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the explicit square root works on both old and new versions.
print("Gradient Boosting (Tuned)")
print("MAE:", mean_absolute_error(y_test, y_pred_gbr))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_gbr)))
print("R²:", r2_score(y_test, y_pred_gbr))

Gradient Boosting (Tuned)
MAE: 342.92163531204926
RMSE: 617.7504737013016
R²: 0.40530546223690167


In [78]:
# Last observed (year, month) in the dataset.
# The month must be taken from rows of the last year only: the maximum of the
# Month column over all years is not the final month unless the data happens
# to end in December, and would shift the forecast horizon.
last_year = int(df_model['Year'].max())
last_month = int(df_model.loc[df_model['Year'] == last_year, 'Month'].max())

# Build the next 12 calendar (year, month) pairs after the last observation
future_years = []
future_months = []

year, month = last_year, last_month

for _ in range(12):
    month += 1
    if month > 12:
        # roll over from December to January of the following year
        month = 1
        year += 1
    future_years.append(year)
    future_months.append(month)

In [79]:
# Forecast inputs for Los Angeles: one row per future (Year, Month) pair
la_rows = df_model['City'] == 'Los Angeles'
la_population_rank = df_model.loc[la_rows, 'Population Rank'].median()

# NOTE(review): this frame omits City Code, City and County, which exist in
# df_model; confirm the loaded preprocessor does not expect those columns.
future_df = pd.DataFrame({
    'Year': future_years,
    'Month': future_months,
    'Population Rank': [la_population_rank] * 12,
    'State': ['CA'] * 12,
    'Metro': ['Los Angeles'] * 12,
})