In [12]:
import os
import pandas as pd
import numpy as np
import ast
import xgboost as xgb
from rapidfuzz import process, fuzz
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score

In [2]:
df = pd.read_csv(os.path.join(os.getcwd(), "airbnbListingsData.csv"))

In [3]:
dropping_col = [ 'host_name',
 'host_location',
 'host_about',
 'host_response_rate',
 'host_acceptance_rate',
 'host_total_listings_count',
 'host_has_profile_pic',
 'host_identity_verified',
 'n_host_verifications',
 'calculated_host_listings_count_entire_homes',
 'calculated_host_listings_count_private_rooms',
 'calculated_host_listings_count_shared_rooms', 'neighborhood_overview', 'name', 'description']
df.drop(columns=dropping_col, errors='ignore', inplace=True)

In [4]:
# ─────────── Missing Values ───────────
#   object→'Unknown', numeric→mean
for c in df.columns:
    if df[c].dtype == 'object':
        df[c].fillna('Unknown', inplace=True)
    else:
        df[c].fillna(df[c].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna('Unknown', inplace=True)


In [5]:
# ─────────── Price & Log-Transform ───────────
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
df['log_price'] = np.log1p(df['price'])

In [6]:
# ─────────── Amenities Normalize & Prune ───────────
df['amenities'] = df['amenities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
standard_amenities = {
    'wifi': ['wifi', 'fast wifi'],
    'tv': ['tv', 'hdtv', 'flat screen'],
    'streaming_services': ['netflix', 'hbo max', 'amazon prime video', 'apple tv', 'chromecast', 'roku'],
    'body_soap': ['body soap', 'bar soap', 'body wash'],
    'shampoo': ['shampoo'],
    'conditioner': ['conditioner'],
    'sound_system': ['sound system', 'bluetooth sound system', 'speaker'],
    'oven': ['oven', 'air fryer'],
    'stove': ['stove', 'gas stove', 'electric stove'],
    'workspace': ['workspace', 'monitor', 'desk', 'office chair'],
    'refrigerator': ['refrigerator', 'fridge', 'mini fridge'],
    'parking': ['parking', 'garage', 'driveway'],
    'children_amenities': ['children', 'books and toys', 'crib', 'baby bath'],
    'gym': ['gym', 'fitness'],
    'pool': ['pool', 'rooftop pool', 'heated pool']
}

def normalize(a):
    a = a.lower()
    for cat, kws in standard_amenities.items():
        match, score, _ = process.extractOne(a, kws, scorer=fuzz.partial_ratio) or (None, 0, None)
        if score > 80:
            return cat
    return None

df['norm_amenities'] = df['amenities'].apply(lambda L: {normalize(a) for a in L if normalize(a)})

# Build binary columns for the 20 most common normalized amenities
all_norms = set().union(*df['norm_amenities'])
amen_df = pd.DataFrame([{n: int(n in norms) for n in all_norms} for norms in df['norm_amenities']])
freq    = amen_df.sum().sort_values(ascending=False)
top20   = freq.index[:20]
amen_df = amen_df[top20]                            # keep just top 20
df['amenity_count'] = amen_df.sum(axis=1)
df = pd.concat([df, amen_df], axis=1)

# ─────────── Final Cleanup ───────────
# drop raw columns no longer needed
df.drop(columns=['amenities','norm_amenities','price'], inplace=True)

In [7]:
# ─────────── One-Hot & Boolean Encode ───────────
df = pd.get_dummies(df, columns=['neighbourhood_group_cleansed','room_type'])
bools = df.select_dtypes(include='bool').columns
for b in bools:
    df[b] = LabelEncoder().fit_transform(df[b])

In [36]:
# ─────────── Train/Test Split & Model ───────────
X = df.drop(columns='log_price')
y = df['log_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [11]:
rf = RandomForestRegressor(n_estimators=200, random_state=1234)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)

print("MSE:", mean_squared_error(y_test, preds))
print("RMSE (log):", root_mean_squared_error(y_test, preds))
print("R² (log):",  r2_score(y_test, preds))

Training RandomForest…
MSE: 0.14025548779124986
RMSE (log): 0.37450699298043805
R² (log): 0.7089545968191524


In [41]:
# model = xgb.XGBRegressor(
#             objective='reg:squarederror',
#             n_estimators=700,
#             learning_rate=0.01,
#             max_depth=6,
#             reg_alpha=0.1,
#             reg_lambda=1.0,
#         )

model = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=2500,
            learning_rate=0.01,
            subsample=0.8,
            max_depth=7,
            reg_alpha=0.2,
            reg_lambda=0.8,
        )

model.scaler = StandardScaler()
model.feature_selector = None

model.fit(X_train, y_train)
xgb_preds = model.predict(X_test)

print("MSE:", mean_squared_error(y_test, xgb_preds))
print("RMSE (log):", root_mean_squared_error(y_test, xgb_preds))
print("R² (log):",  r2_score(y_test, xgb_preds))

MSE: 0.13368560249702366
RMSE (log): 0.36563041790450596
R² (log): 0.7185300228469576


In [30]:
param_grid = {
    'max_depth': [7],
    'learning_rate': [0.01],
    'n_estimators': [1300, 1500, 1700],
    'subsample': [0.8],
    'reg_alpha': [0.2],
    'reg_lambda': [0.8],
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

{'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 1700, 'reg_alpha': 0.2, 'reg_lambda': 0.8, 'subsample': 0.8}
