import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Read the dataset
airbnbdata = pd.read_csv("/Users/rox/Desktop/sta160 project/airbnbvars.csv", low_memory=False)

# Only coerce the "Enroll" columns to numeric (fixes mixed types with minimal impact)
for c in [c for c in airbnbdata.columns if "Enroll" in c]:
    airbnbdata[c] = pd.to_numeric(airbnbdata[c], errors="coerce")

# Step 2: List of cities
cities = [
    'Los Angeles', 'Oakland', 'Pacific Grove',
    'San Diego', 'San Francisco', 'San Mateo',
    'Santa Clara', 'Santa Cruz'
]

# Step 3: Split into train/test sets for each city and create separate variables
print(f"{'City':<15} {'Train Var':<25} {'Train Rows':<10} {'Test Var':<25} {'Test Rows':<10}")
print("-" * 90)

for city in cities:
    # Match names correctly (ignore spaces, tabs)
    city_data = airbnbdata[
        airbnbdata['city'].str.strip().str.replace(r'\s+', ' ', regex=True) == city
    ]

    if len(city_data) == 0:
        print(f"{city:<15} {'(skipped)':<25} {0:<10} {'(skipped)':<25} {0:<10}")
        continue

    # Split
    X_train, X_test = train_test_split(city_data, test_size=0.2, random_state=42)

    # Use underscores in variable names
    var_base = city.replace(' ', '_')
    train_varname = f"{var_base}_train"
    test_varname = f"{var_base}_test"

    # Create dynamic global variables
    globals()[train_varname] = X_train.reset_index(drop=True)
    globals()[test_varname] = X_test.reset_index(drop=True)

    print(f"{city:<15} {train_varname:<25} {X_train.shape[0]:<10} {test_varname:<25} {X_test.shape[0]:<10}")


In [7]:
list(airbnbdata.columns)


['neighborhood',
 'city',
 'room_type',
 'bathrooms',
 'bedrooms',
 'beds',
 'price',
 'review_scores_location',
 'shared_bathroom',
 'air conditioning',
 'bathtub',
 'dishwasher',
 'dryer',
 'freezer',
 'heating',
 'hot water',
 'oven',
 'parking',
 'refrigerator',
 'stove',
 'tv',
 'washer',
 'wifi',
 'county',
 'crime_rate_percent',
 'percent_bike',
 'percent_car',
 'percent_carpool',
 'percent_home',
 'percent_publictr',
 'percent_total',
 'percent_walk',
 'Population',
 'num_priv_schools',
 'Total Enroll',
 'K Enroll',
 'G1 Enroll',
 'G2 Enroll',
 'G3 Enroll',
 'G4 Enroll',
 'G5 Enroll',
 'G6 Enroll',
 'G7 Enroll',
 'G8 Enroll',
 'G9 Enroll',
 'G10 Enroll',
 'G11 Enroll',
 'G12 Enroll']

In [3]:
#Drop problematic columns for SF
sf_train = globals()['San_Francisco_train'].drop(
    columns=['review_scores_location', 'neighborhood', 'room_type'],
    errors='ignore'
)
sf_test  = globals()['San_Francisco_test'].drop(
    columns=['review_scores_location', 'neighborhood', 'room_type'],
    errors='ignore'
)

for _df in (sf_train, sf_test):
    num_cols = _df.select_dtypes(include='number').columns
    _df[num_cols] = _df[num_cols].fillna(_df[num_cols].median())

globals()['San_Francisco_train'] = sf_train
globals()['San_Francisco_test']  = sf_test

# -----------------------------
# 4) OTHER CITIES: fill numeric NA (replace prior dropna)
# -----------------------------
all_cities = ['Los_Angeles', 'Oakland', 'Pacific_Grove', 
              'San_Diego', 'San_Francisco', 'San_Mateo', 
              'Santa_Clara', 'Santa_Cruz']

for city in all_cities:
    if city != 'San_Francisco':
        for nm in (f"{city}_train", f"{city}_test"):
            _df = globals()[nm].copy()
            num_cols = _df.select_dtypes(include='number').columns
            _df[num_cols] = _df[num_cols].fillna(_df[num_cols].median())
            globals()[nm] = _df

In [4]:
from sklearn.metrics import mean_squared_error, r2_score

def build_and_evaluate_model_safe(model, train_df, test_df, target='price', drop_columns=['city']):
    y_train = train_df[target]
    y_test  = test_df[target]

    # Drop target and unwanted columns
    X_train = train_df.drop(columns=[target] + drop_columns, errors='ignore')
    X_test  = test_df[X_train.columns]

    # Convert all columns to numeric if possible (coerce errors)
    X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
    X_test  = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

    if X_train.shape[1] == 0:
        raise ValueError("No numeric features available after processing.")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    metrics = {'RMSE': mean_squared_error(y_test, y_pred),
               'R2': r2_score(y_test, y_pred)}

    return model, metrics

In [5]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

# -------------------------------------------
# Per-city stacked ensemble: XGB + RF -> LinearRegression meta-model
# -------------------------------------------

target = 'price'
drop_cols = ['city']  # already split by city

city_metrics = {}
city_models = {}

for city in all_cities:
    train_name = f"{city}_train"
    test_name  = f"{city}_test"

    if train_name not in globals() or test_name not in globals():
        continue

    train_df = globals()[train_name].copy()
    test_df  = globals()[test_name].copy()

    # Skip very small cities
    if len(train_df) < 50 or len(test_df) < 10:
        print(f"Skipping {city} (too few samples)")
        continue

    # 1) Light winsorization of price to reduce outliers
    q1, q99 = np.percentile(train_df[target], [1, 99])
    train_df[target] = train_df[target].clip(q1, q99)
    test_df[target]  = test_df[target].clip(q1, q99)

    # 2) Encode categoricals with LabelEncoder (per city, train+test together)
    for col in train_df.select_dtypes(include='object').columns:
        le = LabelEncoder()
        combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
        le.fit(combined)
        train_df[col] = le.transform(train_df[col].astype(str))
        test_df[col]  = le.transform(test_df[col].astype(str))

    # 3) Build feature matrices
    X_train = train_df.drop(columns=[target] + drop_cols, errors='ignore')
    X_test  = test_df[X_train.columns]
    y_train = np.log1p(train_df[target])   # log-price
    y_test  = np.log1p(test_df[target])

    # Drop zero-variance features
    keep = X_train.columns[X_train.nunique() > 1]
    X_train = X_train[keep]
    X_test  = X_test[keep]

    # Final NA guard
    X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(X_train.median())
    X_test  = X_test.apply(pd.to_numeric, errors='coerce').fillna(X_train.median())

    n_samples = len(train_df)

    # 4) Define base models
    # XGBoost: main nonlinear model
    if n_samples < 500:      # smaller cities
        n_est = 400
        max_depth = 4
    else:
        n_est = 800
        max_depth = 6

    base_models = [
        ("xgb", XGBRegressor(
            n_estimators=n_est,
            max_depth=max_depth,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            min_child_weight=3,
            gamma=0.0,
            reg_alpha=4.0,
            reg_lambda=4.0,
            tree_method="hist",
            eval_metric="rmse",
            random_state=42,
        )),
        ("rf", RandomForestRegressor(
            n_estimators=300,
            max_depth=None,
            min_samples_leaf=3,
            n_jobs=-1,
            random_state=42,
        ))
    ]

    # 5) Stacking: get out-of-fold predictions for each base model
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    n_models = len(base_models)

    oof_preds = np.zeros((X_train.shape[0], n_models))

    for fold_idx, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
        X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        for m_idx, (name, model_template) in enumerate(base_models):
            # Re-instantiate model each fold
            if name == "xgb":
                model = XGBRegressor(
                    n_estimators=n_est,
                    max_depth=max_depth,
                    learning_rate=0.05,
                    subsample=0.9,
                    colsample_bytree=0.9,
                    min_child_weight=3,
                    gamma=0.0,
                    reg_alpha=4.0,
                    reg_lambda=4.0,
                    tree_method="hist",
                    eval_metric="rmse",
                    random_state=42 + fold_idx,  # tiny tweak per fold
                )
            else:  # rf
                model = RandomForestRegressor(
                    n_estimators=300,
                    max_depth=None,
                    min_samples_leaf=3,
                    n_jobs=-1,
                    random_state=42 + fold_idx,
                )

            model.fit(X_tr, y_tr)
            oof_preds[val_idx, m_idx] = model.predict(X_val)

    # 6) Fit meta-model on OOF predictions
    meta_model = LinearRegression()
    meta_model.fit(oof_preds, y_train)

    # 7) Fit base models on full training data for final predictions
    fitted_models = {}
    for name, model_template in base_models:
        if name == "xgb":
            model = XGBRegressor(
                n_estimators=n_est,
                max_depth=max_depth,
                learning_rate=0.05,
                subsample=0.9,
                colsample_bytree=0.9,
                min_child_weight=3,
                gamma=0.0,
                reg_alpha=4.0,
                reg_lambda=4.0,
                tree_method="hist",
                eval_metric="rmse",
                random_state=42,
            )
        else:
            model = RandomForestRegressor(
                n_estimators=300,
                max_depth=None,
                min_samples_leaf=3,
                n_jobs=-1,
                random_state=42,
            )
        model.fit(X_train, y_train)
        fitted_models[name] = model

    # 8) Test-time stacked predictions
    test_base_preds = np.column_stack([
        fitted_models["xgb"].predict(X_test),
        fitted_models["rf"].predict(X_test),
    ])
    y_pred = meta_model.predict(test_base_preds)

    # 9) Metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2   = r2_score(y_test, y_pred)

    city_metrics[city] = {'RMSE': rmse, 'R2': r2}
    city_models[city] = {
        'xgb': fitted_models["xgb"],
        'rf': fitted_models["rf"],
        'meta': meta_model
    }

    print(f"{city}: RMSE = {rmse:.2f}, R2 = {r2:.4f}")

print("\n--- Per-city stacked ensemble performance ---")
for c, m in city_metrics.items():
    print(f"{c}: RMSE = {m['RMSE']:.2f}, R2 = {m['R2']:.4f}")


Los_Angeles: RMSE = 0.40, R2 = 0.7804
Oakland: RMSE = 0.35, R2 = 0.6708
Pacific_Grove: RMSE = 0.45, R2 = 0.1887
San_Diego: RMSE = 0.40, R2 = 0.7662
San_Francisco: RMSE = 0.46, R2 = 0.6149
San_Mateo: RMSE = 0.41, R2 = 0.7662
Santa_Clara: RMSE = 0.39, R2 = 0.7421
Santa_Cruz: RMSE = 0.43, R2 = 0.6631

--- Per-city stacked ensemble performance ---
Los_Angeles: RMSE = 0.40, R2 = 0.7804
Oakland: RMSE = 0.35, R2 = 0.6708
Pacific_Grove: RMSE = 0.45, R2 = 0.1887
San_Diego: RMSE = 0.40, R2 = 0.7662
San_Francisco: RMSE = 0.46, R2 = 0.6149
San_Mateo: RMSE = 0.41, R2 = 0.7662
Santa_Clara: RMSE = 0.39, R2 = 0.7421
Santa_Cruz: RMSE = 0.43, R2 = 0.6631


In [6]:
import joblib

# These dicts will store per-city info
feature_columns = {}
feature_medians = {}

# Re-run the per-city loop but ONLY to capture X_train & medians
# (If you still have X_train/X_test in memory per city, you can reuse that logic here.)

all_cities = ['Los_Angeles', 'Oakland', 'Pacific_Grove', 
              'San_Diego', 'San_Francisco', 'San_Mateo', 
              'Santa_Clara', 'Santa_Cruz']

target = 'price'
drop_cols = ['city']

for city in all_cities:
    train_name = f"{city}_train"
    if train_name not in globals():
        continue

    train_df = globals()[train_name].copy()

    # repeat the same preprocessing steps **up to** where you defined X_train:

    # 1) Winsorize price
    q1, q99 = np.percentile(train_df[target], [1, 99])
    train_df[target] = train_df[target].clip(q1, q99)

    # 2) Label encode object columns (same as training code)
    for col in train_df.select_dtypes(include='object').columns:
        # Be careful: here we only have train_df; thatâ€™s fine for medians/columns
        le = LabelEncoder()
        train_df[col] = train_df[col].astype(str)
        le.fit(train_df[col])
        train_df[col] = le.transform(train_df[col])

    # 3) Build feature matrix (same as training code)
    X_train = train_df.drop(columns=[target] + drop_cols, errors='ignore')
    y_train = np.log1p(train_df[target])

    # Drop zero-variance
    keep = X_train.columns[X_train.nunique() > 1]
    X_train = X_train[keep]

    # Final NA guard
    X_train = X_train.apply(pd.to_numeric, errors='coerce')
    X_train = X_train.fillna(X_train.median())

    # Save feature columns + medians
    feature_columns[city] = X_train.columns.tolist()
    feature_medians[city] = X_train.median()

# Now pack everything into a single artifact dict
artifacts = {
    "city_models": city_models,          # from your training step
    "feature_columns": feature_columns,  # list of columns per city
    "feature_medians": feature_medians   # Series per city
}

# Save to disk
joblib.dump(artifacts, "airbnb_ensemble_artifacts.pkl")
print("Saved artifacts to airbnb_ensemble_artifacts.pkl")


Saved artifacts to airbnb_ensemble_artifacts.pkl
