In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

# Path to your CSV file
data_path = 'airbnb_Chicago_cleaned.csv'

# Columns to use
cols = [
    "Airbnb Host ID", "Airbnb Property ID", "superhost_period_all", "Scraped Date",
    "host_is_superhost_in_period", "prev_host_is_superhost_in_period", "Superhost",
    "prev_host_is_superhost", "superhost_change", "superhost_change_lose_superhost",
    "superhost_change_gain_superhost", "rating_ave_pastYear", "numReviews_pastYear",
    "numCancel_pastYear", "num_5_star_Rev_pastYear", "prop_5_StarReviews_pastYear",
    "prev_rating_ave_pastYear", "prev_numReviews_pastYear", "prev_numCancel_pastYear",
    "prev_num_5_star_Rev_pastYear", "prev_prop_5_StarReviews_pastYear", "numReservedDays_pastYear",
    "numReserv_pastYear", "prev_numReservedDays_pastYear", "prev_numReserv_pastYear",
    "available_days", "available_days_aveListedPrice", "booked_days", "booked_days_avePrice",
    "prev_available_days", "prev_available_days_aveListedPrice", "prev_booked_days",
    "prev_booked_days_avePrice", "Property Type", "Listing Type", "Created Date", "Zipcode",
    "Bedrooms", "Bathrooms", "Neighborhood", "Max Guests", "Cleaning Fee (USD)", "Minimum Stay",
    "Number of Photos", "Latitude", "Longitude", "Pets Allowed", "Instantbook Enabled",
    "prev_Instantbook Enabled", "Nightly Rate", "prev_Nightly Rate", "Number of Reviews",
    "prev_Number of Reviews", "Rating Overall", "prev_Rating Overall", "revenue", "occupancy_rate",
    "prev_revenue", "prev_occupancy_rate", "census_tract", "tract_total_pop", "tract_white_perc",
    "tract_black_perc", "tract_asian_perc", "tract_housing_units", "zip_total_population",
    "zip_hispanic_or_latino_anyrace", "zip_white_nothispanic", "zip_black_nothispanic",
    "zip_asian_nothispanic", "tract_count_obs", "tract_unique_prices", "Nightly Rate_tractQuartile",
    "prev_Nightly Rate_tractQuartile", "available_days_aveListedPrice_tractQuartile",
    "prev_available_days_aveListedPrice_tractQuartile", "tract_superhosts", "tract_prev_superhosts",
    "tract_price_variance", "tractQuartilePrice_variance", "prev_host_is_superhost1",
    "prev_year_superhosts", "booked_days_period_city", "revenue_period_city",
    "booked_days_period_tract", "revenue_period_tract", "tract_booking_share", "tract_revenue_share"
]

df = pd.read_csv(data_path, usecols=cols)

# Extract seasonal factors
date_col = 'Scraped_Date' if 'Scraped_Date' in df.columns else 'Created_Date'
if date_col in df.columns:
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    df[date_col+'_month'] = df[date_col].dt.month
    df[date_col+'_day'] = df[date_col].dt.day
    df.drop(date_col, axis=1, inplace=True)

# Rename columns to avoid spaces and parentheses
df.columns = df.columns.str.replace(' ', '_').str.replace('(', '', regex=False).str.replace(')', '', regex=False)

target = 'revenue'
feature_candidates = [
    'Nightly_Rate', 
    'Nightly_Rate_tractQuartile', 
    'tract_price_variance', 
    'available_days', 
    'Property_Type', 
    'Neighborhood',
    'Rating_Overall' if 'Rating_Overall' in df.columns else 'rating_ave_pastYear'
]

if 'Scraped_Date_month' in df.columns:
    feature_candidates.append('Scraped_Date_month')
if 'Scraped_Date_day' in df.columns:
    feature_candidates.append('Scraped_Date_day')

feature_candidates = [f for f in feature_candidates if f in df.columns and f != target]

# Drop rows with missing target
df = df.dropna(subset=[target])

# Handle outliers for target and Nightly_Rate
for col in [target, 'Nightly_Rate']:
    if col in df.columns:
        cap = df[col].quantile(0.99)
        df.loc[df[col] > cap, col] = cap

# Apply log1p transformation
if target in df.columns:
    df[target] = np.log1p(df[target])
if 'Nightly_Rate' in df.columns:
    df['Nightly_Rate'] = np.log1p(df['Nightly_Rate'])

X = df[feature_candidates].copy()
y = df[target].copy()

# Identify categorical and numeric features
cat_cols = [c for c in X.columns if X[c].dtype == 'object']
num_cols = [c for c in X.columns if X[c].dtype != 'object']

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Simplified parameter grid for faster search
param_grid = {
    'model__n_estimators': [100],   # fewer options
    'model__max_depth': [10, None]  # only two options
}

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

search = GridSearchCV(model_pipeline, param_grid, cv=3, n_jobs=-1, scoring='r2')
search.fit(X_train, y_train)

best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

test_r2 = r2_score(y_test, y_pred)
print(f"Best Params: {search.best_params_}")
print(f"Test R²: {test_r2:.2f}")

final_model = best_model.named_steps['model']
encoded_cat_columns = best_model.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(cat_cols)
all_feature_names = np.concatenate([num_cols, encoded_cat_columns])
importances = pd.Series(final_model.feature_importances_, index=all_feature_names).sort_values(ascending=False)

print("\nFeature Importances:")
print(importances.head(20))


Best Params: {'model__max_depth': 10, 'model__n_estimators': 100}
Test R²: 0.26

Feature Importances:
Nightly_Rate                            0.506160
available_days                          0.273732
tract_price_variance                    0.075774
Rating_Overall                          0.039128
Nightly_Rate_tractQuartile              0.014971
Neighborhood_Near West Side             0.007085
Neighborhood_Loop                       0.004037
Neighborhood_Near North Side            0.003576
Neighborhood_Bronzeville                0.003338
Property_Type_Apartment                 0.003175
Neighborhood_Englewood                  0.002945
Property_Type_House                     0.002857
Neighborhood_Lakeview                   0.002524
Property_Type_Room in boutique hotel    0.002206
Property_Type_Serviced apartment        0.001850
Property_Type_Condominium               0.001621
Neighborhood_West Ridge                 0.001406
Neighborhood_Rogers Park                0.001406
Neighborhood_Wes