In [30]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [23]:
# Read the rent data CSV. The path is relative, so it resolves against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='zillow_rent_cleaned.csv')

In [27]:
# Parse the 'Date' strings (full month name + 4-digit year, e.g. "November 2010").
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising, so malformed dates surface later as missing Year/Month.
parsed_dates = pd.to_datetime(df['Date'], format='%B %Y', errors='coerce')

# Build the modelling frame without touching `df`: drop the raw Date column
# (it is replaced by the derived features) and append Year / Month.
df_model = df.drop(columns=['Date']).assign(
    Year=parsed_dates.dt.year,
    Month=parsed_dates.dt.month,
)

In [28]:
# Preview the first five rows to check that Year and Month replaced Date.
df_model.head(n=5)

Unnamed: 0,City Code,City,Metro,County,State,Population Rank,RentPrice,Year,Month
0,6181,New York,New York,Queens,NY,1,1327.100458,2010,11
1,12447,Los Angeles,Los Angeles,Los Angeles,CA,2,2184.0,2010,11
2,17426,Chicago,Chicago,Cook,IL,3,1563.0,2010,11
3,39051,Houston,Houston,Harris,TX,4,1198.0,2010,11
4,13271,Philadelphia,Philadelphia,Philadelphia,PA,5,1092.0,2010,11


In [6]:
# Split the frame into the target column (RentPrice) and the feature matrix
# (every remaining column).
y = df_model.loc[:, 'RentPrice']
X = df_model.drop('RentPrice', axis=1)

In [7]:
# Hold out the last 20% of rows as the test set. With shuffle=False the rows
# keep their existing order, so the first 80% become the training set.
# NOTE(review): this is a chronological split only if df_model is already
# sorted by date — TODO confirm the row order of the source CSV.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [8]:
# Partition the feature columns by dtype so each group can be routed to its
# own preprocessing branch.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Select every numeric dtype, not just int64/float64. The `.dt.year` and
# `.dt.month` accessors return int32 on pandas >= 2.0, so an explicit
# ['int64', 'float64'] filter would leave Year and Month out of this list,
# and a ColumnTransformer would then drop them without warning.
numeric_cols = X_train.select_dtypes(include='number').columns.tolist()

# Display both lists as the cell output for a quick visual check.
categorical_cols, numeric_cols

(['City', 'Metro', 'County', 'State'],
 ['City Code', 'Population Rank', 'Year', 'Month'])

In [12]:
import joblib

In [44]:
# Numeric branch: replace missing values with the column mean, then
# standardize each column.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode into a sparse matrix. handle_unknown='ignore' keeps
# transform() from raising on categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=True, handle_unknown='ignore')),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Fit on the training split only, then apply the fitted transformer to the
# test split so no test-set statistics leak into preprocessing.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Persist the fitted preprocessor, the transformed feature matrices, and the
# targets (as NumPy arrays) to the working directory.
joblib.dump(value=preprocessor, filename="preprocessor.pkl")
joblib.dump(value=X_train_processed, filename="X_train_processed.pkl")
joblib.dump(value=X_test_processed, filename="X_test_processed.pkl")
joblib.dump(value=y_train.to_numpy(), filename="y_train.pkl")
# Kept as the trailing expression so the cell still shows the written path.
joblib.dump(value=y_test.to_numpy(), filename="y_test.pkl")



['y_test.pkl']

Categorical variables were converted into indicator variables using one-hot encoding.
Numeric features were standardized using StandardScaler to ensure comparable magnitudes across features.
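The dataset was split 80/20 into training and testing sets without shuffling, so that, provided the rows are in chronological order, the test set holds the most recent observations and the temporal structure is preserved.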