In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import datetime
import joblib

# Location of the raw listings CSV. Replace with your actual file path
# before running the notebook in a different environment.
DATA_PATH = '/content/realtor-data.zip.csv'

# Read the whole file into a DataFrame; later cells narrow it down to the
# columns used for modelling.
data = pd.read_csv(DATA_PATH)

In [None]:
# Define columns
# Only these fields are kept for modelling; every other column of the loaded
# file is discarded.
columns = ['price', 'bed', 'bath', 'acre_lot', 'street', 'city', 'state', 'zip_code', 'house_size', 'prev_sold_date']

# Keep rows that have a target value and restrict to the modelling columns in
# a single selection. The explicit .copy() makes `data` an independent frame,
# so the column assignments below write to it directly instead of to a slice
# of the loaded frame (which is what triggers pandas' SettingWithCopyWarning).
data = data.loc[data['price'].notna(), columns].copy()

# Data Preprocessing
# Unparseable or missing dates become NaT (errors='coerce'); their year is
# then NaN and is replaced by 0, so sold_year == 0 means "no known prior sale".
data['prev_sold_date'] = pd.to_datetime(data['prev_sold_date'], errors='coerce')
data['sold_year'] = data['prev_sold_date'].dt.year.fillna(0).astype(int)

# Encode categorical variables
# Each column gets its own fitted encoder, stored by column name so the same
# string -> integer mapping can be reused later.
# NOTE(review): the encoders are fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook.
# NOTE(review): if zip_code was parsed as a float, astype(str) yields values
# such as '12345.0' -- confirm this is the intended category label.
label_encoders = {}
for col in ['street', 'city', 'state', 'zip_code']:
    le = LabelEncoder()
    # Missing values are mapped to the literal category 'Unknown'.
    data[col] = data[col].fillna('Unknown').astype(str)
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [None]:
# Handle missing numerical values
# Every numeric feature is imputed the same way: gaps are filled with that
# column's own median.
for numeric_col in ('bed', 'bath', 'acre_lot', 'house_size'):
    column_median = data[numeric_col].median()
    data[numeric_col] = data[numeric_col].fillna(column_median)

# Features and target
# The target is the price column. The feature matrix is everything else except
# the raw date column, which is dropped here (its year is already available as
# sold_year).
y = data['price']
X = data.drop(columns=['price', 'prev_sold_date'])

In [None]:
# Split data
# 20% of the rows are held out for the final evaluation; the seed is fixed so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define XGBoost model
# Base estimator; the grid search below overrides the tuned hyperparameters.
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter tuning
# 2 * 3 * 2 * 2 * 2 = 48 parameter combinations, each scored with 3-fold CV.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Candidates are ranked by negative mean squared error.
# NOTE(review): n_jobs=-1 parallelises the search while the estimator's own
# thread setting is left at its default -- check for thread contention.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best model
# Keep the winning estimator for evaluation and persistence in later cells.
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}


In [None]:
# Evaluate
# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):,.2f}")

# Feature importance
# Importances are paired with feature names by position in X.columns.
for feature, importance in zip(X.columns, best_model.feature_importances_):
    print(f"{feature}: {importance:.4f}")

# Predict current and future prices
# Calendar year at run time. The name `current_date` is kept for compatibility
# even though it holds a year, not a date.
current_date = datetime.datetime.now().year
future_year = 2030

# Rows whose sold_year is 0 have no known prior sale (0 is the fill value used
# during preprocessing); they keep that value in both scenarios.
known_sale = X_test['sold_year'] > 0

# "Current" scenario: previously this predicted on the unmodified test
# features (historical sold_year values) while labelling the result with a
# hard-coded year. The year is now actually applied and the label is derived
# from the same variable.
current_data = X_test.copy()
current_data.loc[known_sale, 'sold_year'] = current_date
current_predictions = best_model.predict(current_data)
print(f"Average predicted price for {current_date}: ${np.mean(current_predictions):,.2f}")

# "Future" scenario: same features with sold_year moved to future_year.
# NOTE(review): a tree ensemble cannot extrapolate beyond the sold_year values
# seen in training, so the two averages may be nearly identical -- treat this
# as a what-if comparison rather than a forecast.
future_data = X_test.copy()
future_data.loc[known_sale, 'sold_year'] = future_year
future_predictions = best_model.predict(future_data)
print(f"Average predicted price for {future_year}: ${np.mean(future_predictions):,.2f}")

Root Mean Squared Error: 1114254.0283854913
R² Score: 0.3547
Mean Absolute Error: 328,482.19
bed: 0.1124
bath: 0.0414
acre_lot: 0.0093
street: 0.0570
city: 0.0208
state: 0.7212
zip_code: 0.0126
house_size: 0.0238
sold_year: 0.0016
Average predicted price for 2025: $565,203.38
Average predicted price for 2030: $565,198.19


In [None]:
# Save model and label encoders
# Collect every artifact under its output filename first (model, then one
# encoder per categorical column), and write them all in a single pass.
artifacts = {'real_estate_price_xgb_model.pkl': best_model}
artifacts.update(
    {f'{col}_label_encoder.pkl': le for col, le in label_encoders.items()}
)

for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)