In [1]:
import pandas as pd

# Load each file
paris = pd.read_csv("data/listings_Paris.csv")
london = pd.read_csv("data/listings_London.csv")
nyc = pd.read_csv("data/listings_nyc.csv")

# Add city column
paris["city"] = "Paris"
london["city"] = "London"
nyc["city"] = "New York"

# Align columns (take common ones only)
common_cols = list(set(paris.columns) & set(london.columns) & set(nyc.columns))
paris = paris[common_cols]
aqnyc = nyc[common_cols]

# Combine all
df = pd.concat([paris, london, nyc], ignore_index=True)

# Save combined dataset
df.to_csv("data/airbnb_listings_combined.csv", index=False)
print(f"‚úÖ Combined dataset created: {df.shape[0]} rows and {df.shape[1]} columns")




‚úÖ Combined dataset created: 214615 rows and 80 columns


In [2]:
import pandas as pd
df = pd.read_csv("data/airbnb_listings_combined.csv", low_memory=False)
print(df.shape)
df.head(2)


(214615, 80)


Unnamed: 0,review_scores_rating,bathrooms_text,property_type,host_listings_count,number_of_reviews_l30d,picture_url,review_scores_communication,reviews_per_month,maximum_minimum_nights,last_scraped,...,availability_60,estimated_revenue_l365d,host_picture_url,calendar_updated,neighbourhood,host_identity_verified,first_review,host_has_profile_pic,accommodates,calculated_host_listings_count_entire_homes
0,4.78,1 bath,Entire rental unit,1.0,0,https://a0.muscache.com/pictures/295786e7-116c...,4.94,0.57,2.0,2025-09-12,...,0,,https://a0.muscache.com/im/pictures/user/242c2...,,Neighborhood highlights,t,2014-05-17,t,2,1
1,4.86,1 bath,Entire rental unit,1.0,1,https://a0.muscache.com/pictures/de55313f-9b4b...,4.95,0.66,5.0,2025-09-12,...,3,,https://a0.muscache.com/im/users/13925876/prof...,,Neighborhood highlights,t,2014-10-29,t,2,1


In [3]:
use = [
  "price","city","neighbourhood_group_cleansed","room_type",
  "minimum_nights","number_of_reviews","reviews_per_month",
  "availability_365","calculated_host_listings_count"
]
df = df[use].copy()


In [4]:
import pandas as pd

raw = pd.read_csv("data/airbnb_listings_combined.csv", low_memory=False)
print("Cities before cleaning:")
print(raw["city"].value_counts())


Cities before cleaning:
city
London      96651
Paris       81853
New York    36111
Name: count, dtype: int64


In [5]:

df["price"] = (df["price"].astype(str).str.replace(r"[\$,]","", regex=True).astype(float))
# keep reasonable nightly prices for standard listings
df = df[df["price"].between(10, 1000)]


In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv("data/airbnb_listings_combined.csv", low_memory=False)

df["price"] = (df["price"].astype(str)
                         .str.replace(r"[\$,]", "", regex=True)
                         .astype(float))
df = df[df["price"].between(10, 1000)]

df["reviews_per_month"] = df["reviews_per_month"].fillna(0.0)
df["number_of_reviews"] = df["number_of_reviews"].fillna(0.0)
df["availability_365"]  = df["availability_365"].fillna(0.0)
df["calculated_host_listings_count"] = df["calculated_host_listings_count"].fillna(1.0)

# --- Clip outliers ---
df["minimum_nights"] = df["minimum_nights"].clip(1, 30)
df["availability_365"] = df["availability_365"].clip(0, 365)

# --- üîπ Fix for missing neighbourhood_group_cleansed ---
# For cities that lack this info, fill with 'Unknown' or fallback to 'city' itself
df["neighbourhood_group_cleansed"] = df["neighbourhood_group_cleansed"].fillna(df["city"])

# --- Drop rows missing truly critical info ---
df = df.dropna(subset=["city", "room_type", "price"])

# --- Verify results ---
print("‚úÖ Cleaned dataset shape:", df.shape)
print("\nCity distribution:")
print(df["city"].value_counts())
print("\nUnique neighbourhood_group_cleansed examples:")
print(df["neighbourhood_group_cleansed"].unique()[:10])


‚úÖ Cleaned dataset shape: (82642, 80)

City distribution:
city
London      61887
New York    20755
Name: count, dtype: int64

Unique neighbourhood_group_cleansed examples:
['London' 'Queens' 'Manhattan' 'Brooklyn' 'Bronx' 'Staten Island']


In [7]:
from sklearn.model_selection import train_test_split

target = "price"
feature = ["city","neighbourhood_group_cleansed","room_type",
  "minimum_nights","number_of_reviews","reviews_per_month",
  "availability_365","calculated_host_listings_count"
]

X = df[feature]
y = df[target]

X_train,X_test,y_train,y_test = train_test_split(
    X,y, test_size=0.2, random_state=42)


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

numeric_cols = [
  "minimum_nights","number_of_reviews","reviews_per_month",
  "availability_365","calculated_host_listings_count"
]
cat_cols = ["city","neighbourhood_group_cleansed","room_type"]

# Optional: light numeric transforms for skewed counts
def numeric_fix(X: pd.DataFrame) -> pd.DataFrame:
    out = X.copy()
    # log1p for skew; keep non-negative guarantees
    out["number_of_reviews"] = np.log1p(out["number_of_reviews"])
    out["reviews_per_month"] = np.log1p(out["reviews_per_month"])
    out["calculated_host_listings_count"] = np.log1p(out["calculated_host_listings_count"])
    return out

numeric_pre = Pipeline(steps=[
    ("fix", FunctionTransformer(numeric_fix, feature_names_out="one-to-one"))
])

cat_pre = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pre, numeric_cols),
        ("cat", cat_pre, cat_cols)
    ],
    remainder="drop"
)


In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=[
    ("pre", preprocess),
    ("model", RandomForestRegressor(
        n_estimators=400,
        random_state=42,
        n_jobs=-1
    ))
])


In [13]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Define cross-validation
cv = KFold(n_splits=2, shuffle=True, random_state=42)

# Compute cross-validation RMSE
rmse_cv = -cross_val_score(
    pipe, X_train, y_train,
    scoring="neg_root_mean_squared_error",
    cv=cv, n_jobs=-1
)

print(f"CV RMSE mean={rmse_cv.mean():.2f} ¬± {rmse_cv.std():.2f}")

# Fit model on full train data
pipe.fit(X_train, y_train)

# Evaluate on test data
y_pred = pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test RMSE={rmse:.2f} | MAE={mae:.2f} | R¬≤={r2:.3f}")


CV RMSE mean=124.86 ¬± 0.74
Test RMSE=14866.78 | MAE=77.42 | R¬≤=0.373


In [None]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
  "model__n_estimators": [100, 200, 300],
  "model__max_depth": [None, 10, 11, 16],
  "model__min_samples_split": [2, 5, 10],
  "model__min_samples_leaf": [1, 2, 4]
}

search = RandomizedSearchCV(
    pipe, param_distributions=param_dist,
    n_iter=10, cv=2, scoring="neg_root_mean_squared_error",
    random_state=42, n_jobs=2, verbose=1
)
search.fit(X_train, y_train)
best_pipe = search.best_estimator_

y_pred = best_pipe.predict(X_test)
print("Tuned Test RMSE:", mean_squared_error(y_test, y_pred))


Fitting 2 folds for each of 10 candidates, totalling 20 fits


In [None]:
import joblib
joblib.dump(best_pipe if 'best_pipe' in locals() else pipe,
            "models/airbnb_pipeline.pkl")
print("‚úÖ Saved models/airbnb_pipeline.pkl")


In [None]:
# src/api.py
from fastapi import FastAPI
import joblib, pandas as pd

app = FastAPI(title="Airbnb Price Prediction API")
pipe = joblib.load("models/airbnb_pipeline.pkl")

@app.get("/")
def home():
    return {"message": "Airbnb Price Prediction API is running üè†"}

@app.post("/predict")
def predict(payload: dict):
    # payload must contain the same feature keys as training
    df = pd.DataFrame([payload])
    pred = pipe.predict(df)[0]
    return {"predicted_price": round(float(pred), 2)}
