Loading datasets into a dataframe

In [0]:
# Storage account and container names that are interpolated into the ABFS URLs below.
# Both values are placeholders and have to be replaced before the notebook is run.
storage_account = "replace_by_storage_account"  
container = "replace_by_container"

Reading Airbnb data from Parquet.

In [0]:
import os

# A token supplied through the AIRBNB_SAS_TOKEN environment variable takes precedence, so the
# secret does not have to be saved inside the notebook; otherwise the inline placeholder
# (which must then be replaced by hand) is used.
sas_token = os.environ.get("AIRBNB_SAS_TOKEN") or "replace_with_your_sas_token"
# Drop any leading "?" characters from the token.
sas_token = sas_token.lstrip('?')

# Configure fixed-SAS-token authentication for this storage account's DFS endpoint.
account_host = f"{storage_account}.dfs.core.windows.net"
spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "SAS")
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{account_host}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
spark.conf.set(f"fs.azure.sas.fixed.token.{account_host}", sas_token)

In [0]:
# ABFS location of the Airbnb parquet dataset inside the configured container.
path = "abfss://{}@{}.dfs.core.windows.net/airbnb_1_12_parquet".format(container, storage_account)

# Read the dataset and preview its first five rows.
airbnb = spark.read.parquet(path)
display(airbnb.limit(5))

# Linear Regression

In [0]:
# Setup & Config

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import math
import numpy as np
import pandas as pd
import statsmodels.api as sm
import os

# =============================
# GLOBAL CONFIGURATION / HYPERPARAMETERS
# =============================

# -------- Data filtering --------
# Minimum number of reviews required for a listing to be included in the analysis.
# Higher values increase reliability but reduce sample size.
MIN_REVIEWS = 10

# -------- Ridge regression --------
# Candidate regularization strengths for RidgeCV.
# Higher values = stronger regularization (more bias, less variance).
RIDGE_ALPHAS = [0.1, 1.0, 10.0, 50.0]

# -------- Bootstrap confidence --------
# Number of bootstrap resamples used to estimate confidence.
# Higher = more stable confidence estimates, slower runtime.
N_BOOTSTRAP = 200
# The bootstrap summary uses sample variance/std with ddof=1, which needs >= 2 resamples.
assert N_BOOTSTRAP >= 2, "N_BOOTSTRAP must be at least 2"

# Random seed for reproducible bootstrap confidence scores.
BOOTSTRAP_RANDOM_STATE = 42

# -------- Recommendation limits --------
# Number of single-amenity upgrade recommendations per category.
TOP_K_SINGLE = 3

# Number of interaction-based upgrade recommendations.
TOP_K_INTERACTION = 3

# -------- Scoring weights --------
# Relative importance of price vs rating in OVERALL recommendations.
# Must sum to 1.0 (enforced below so that a bad edit fails immediately).
WEIGHT_PRICE = 0.7
WEIGHT_RATING = 0.3
assert abs(WEIGHT_PRICE + WEIGHT_RATING - 1.0) < 1e-9, (
    "WEIGHT_PRICE and WEIGHT_RATING must sum to 1.0"
)

# -------- Rating constraints --------
# Upper bound for ratings (used to avoid impossible values like >5.0).
MAX_RATING = 5.0

# -------- Expected rating model config --------
# If True, expected-rating model will explicitly depend on price (log_price),
# reflecting that guests form higher expectations for more expensive listings.
USE_PRICE_IN_EXPECTED_RATING = True

# Name of the nightly-price column used for filtering and modeling.
PRICE_COL = "price_per_night"

# =============================
# Spark session
# =============================
# getOrCreate() returns the already-active session when there is one; otherwise a new
# session is built with the application name and local master requested here.
_app_name = "AmenityUpgradeAdvisor-LinearRegression"
spark = SparkSession.builder.appName(_app_name).master("local[*]").getOrCreate()

# =============================
# 0) Prepare the base DataFrame
# =============================
# Start from the already-loaded `airbnb` DataFrame, without its top-level "price" column
# when that column exists.
df = airbnb.drop("price") if "price" in airbnb.columns else airbnb

# Prefer property_id as the primary identifier
ID_COL_CANDIDATES = ["property_id", "listing_id", "id"]
id_col = next((c for c in ID_COL_CANDIDATES if c in df.columns), None)

# Later cells reference the identifier by the literal name "property_id", so that column is
# guaranteed to exist here: generated when no candidate is present, copied from the
# fallback candidate otherwise.
if id_col is None:
    df = df.withColumn("property_id", F.monotonically_increasing_id())
elif id_col != "property_id":
    df = df.withColumn("property_id", F.col(id_col))
id_col = "property_id"

In [0]:
# Parse JSON + amenity_name_value

# =============================
# 1) Parse JSON columns
# =============================
# Amenities: an array of groups, each with a group name and an array of (name, value) items.
amenities_schema = ArrayType(
    StructType()
    .add("group_name", StringType(), True)
    .add(
        "items",
        ArrayType(
            StructType()
            .add("name", StringType(), True)
            .add("value", StringType(), True)
        ),
        True,
    )
)

# Seller info: only the id and the URL are extracted.
seller_schema = (
    StructType()
    .add("seller_id", StringType(), True)
    .add("url", StringType(), True)
)

# Pricing details. NOTE(review): "aairbnb_service_fee" looks like a misspelling of
# "airbnb_service_fee" -- confirm whether the raw JSON really contains this key.
pricing_schema = (
    StructType()
    .add("airbnb_service_fee", DoubleType(), True)
    .add("aairbnb_service_fee", DoubleType(), True)
    .add("cleaning_fee", DoubleType(), True)
    .add("initial_price_per_night", DoubleType(), True)
    .add("num_of_nights", IntegerType(), True)
    .add("price_per_night", DoubleType(), True)
    .add("price_without_fees", DoubleType(), True)
    .add("special_offer", DoubleType(), True)
    .add("taxes", DoubleType(), True)
)

# Ratings become doubles via try_cast.
df = df.withColumn("ratings", F.expr("try_cast(ratings as double)"))

# Pricing JSON -> struct, then expose the nightly price as a top-level column.
df = df.withColumn("pricing_parsed", F.from_json(F.col("pricing_details"), pricing_schema))
df = df.withColumn("price_per_night", F.col("pricing_parsed.price_per_night"))

# Amenities JSON -> array of groups.
df = df.withColumn("amenities_parsed", F.from_json(F.col("amenities"), amenities_schema))

# Reviews JSON -> array of strings; its length is the review count.
df = df.withColumn("reviews_parsed", F.from_json(F.col("reviews"), ArrayType(StringType())))
df = df.withColumn("num_reviews", F.size(F.col("reviews_parsed")))

# Seller JSON -> struct; the seller id serves as the host id.
df = df.withColumn("seller_parsed", F.from_json(F.col("seller_info"), seller_schema))
df = df.withColumn("host_id", F.col("seller_parsed.seller_id"))

# Collapse all amenity groups into one array of lower-cased (name, value) structs.
df = df.withColumn(
    "amenity_name_value",
    F.expr("""
        transform(
            flatten(
                transform(amenities_parsed, g -> g.items)
            ),
            x -> struct(
                lower(x.name) as name,
                lower(x.value) as value
            )
        )
    """)
)

# Checkpoint the parsed DataFrame so later cells can reload it.
df.write.mode("overwrite").parquet("dbfs:/airbnb/df_base")

In [0]:
# Reload the parsed base DataFrame from its parquet checkpoint.
df = spark.read.format("parquet").load("dbfs:/airbnb/df_base")

In [0]:
# Amenity Inventory

# One row per (property, amenity entry): explode the flattened amenity array, keep the name.
amenities_exploded = df.select(
    "property_id",
    F.explode("amenity_name_value").alias("a"),
).select(
    "property_id",
    F.col("a.name").alias("amenity_name"),
)

# Number of distinct properties per amenity name, most frequent first.
amenity_inventory = (
    amenities_exploded
    .groupBy("amenity_name")
    .agg(F.countDistinct("property_id").alias("n_properties"))
    .orderBy(F.col("n_properties").desc())
)

display(amenity_inventory)
amenity_inventory.write.mode("overwrite").parquet("dbfs:/airbnb/amenity_inventory")


In [0]:
# Load Amenity Inventory + select amenities
amenity_inventory = spark.read.parquet("dbfs:/airbnb/amenity_inventory")

# An amenity is kept only when at least this many properties list it.
MIN_PROPERTIES = 2000

selected_amenities_df = amenity_inventory.where(F.col("n_properties") >= MIN_PROPERTIES)

# Bring the selected amenity names to the driver as a plain Python list.
amenities_to_test = [row["amenity_name"] for row in selected_amenities_df.collect()]

print(f"{len(amenities_to_test)} amenities selected.")

In [0]:
# Add Amenity Flags

def col_name(a: str) -> str:
    """Build the flag-column name for an amenity.

    The amenity is lower-cased, every non-alphanumeric character is replaced
    by "_", and the result is prefixed with "a_".
    """
    sanitized = [ch if ch.isalnum() else "_" for ch in a.lower()]
    return "a_" + "".join(sanitized)

def add_amenity_flag(df_in, amenity: str):
    """Return `df_in` with a 0/1 flag column for `amenity`.

    The column is named by `col_name` and is derived from `amenity_name_value`:
      * 0 when some entry whose name contains the amenity has a value matching "no_",
      * otherwise 1 when some entry whose name contains the amenity has a value that
        does not match "no_",
      * otherwise 0.
    Names are matched by substring on the lower-cased amenity.
    """
    needle = amenity.lower()
    flag_col = col_name(needle)
    entries = F.col("amenity_name_value")

    def _marked_missing(entry):
        # Entry names this amenity and its value matches "no_".
        return (
            F.contains(entry["name"], F.lit(needle)) &
            F.regexp_like(entry["value"], F.lit("no_"))
        )

    def _marked_present(entry):
        # Entry names this amenity and its value does not match "no_".
        return (
            F.contains(entry["name"], F.lit(needle)) &
            (~F.regexp_like(entry["value"], F.lit("no_")))
        )

    # A "missing" entry is checked first, so it wins over a "present" entry.
    flag = (
        F.when(F.exists(entries, _marked_missing), F.lit(0))
        .when(F.exists(entries, _marked_present), F.lit(1))
        .otherwise(F.lit(0))
    )
    return df_in.withColumn(flag_col, flag)

# Add one flag column per selected amenity, then checkpoint the result to parquet.
for amenity_name in amenities_to_test:
    df = add_amenity_flag(df, amenity_name)

(
    df.write
    .mode("overwrite")
    .parquet("dbfs:/airbnb/df_with_amenities")
)

In [0]:
# Reload the DataFrame that carries the amenity flag columns.
df = spark.read.format("parquet").load("dbfs:/airbnb/df_with_amenities")

In [0]:
# Filter + Feature Engineering
# Amenity flag columns are recognised by their "a_" prefix.
amenity_cols_existing = [name for name in df.columns if name.startswith("a_")]
amenity_set = set(amenity_cols_existing)


# =============================
# 3) Filter for reliable analysis + print breakdown (ONCE)
# =============================

# Successive filters: valid price -> rating present -> enough reviews.
df_price_ok = df.where(F.col(PRICE_COL).isNotNull()).where(F.col(PRICE_COL) > 0)
df_rating_ok = df_price_ok.where(F.col("ratings").isNotNull())
df_reviews_ok = df_rating_ok.where(F.col("num_reviews") >= MIN_REVIEWS)

# Row counts after each stage, used only for the breakdown printed below.
total_raw = df.count()
n_price_ok = df_price_ok.count()
n_rating_ok = df_rating_ok.count()
n_reviews_ok = df_reviews_ok.count()

# Add the log-price target and cache the filtered DataFrame (used many times later).
df_f = df_reviews_ok.withColumn("log_price", F.log(F.col(PRICE_COL))).cache()
df_f.count()  # materialize cache

print("\n=== Analysis Sample Explanation ===")
print(f"Total listings in raw dataset: {total_raw}")
print(f"Listings used for statistical analysis: {n_reviews_ok}")
print("\nListings were INCLUDED in the analysis only if they satisfy ALL of the following:")
print(f"  • After price_per_night filter (price_per_night > 0): {n_price_ok} (removed {total_raw - n_price_ok})")
print(f"  • After rating filter (rating available): {n_rating_ok} (removed {n_price_ok - n_rating_ok})")
print(f"  • After review-count filter (≥ {MIN_REVIEWS} reviews): {n_reviews_ok} (removed {n_rating_ok - n_reviews_ok})")

# =============================
# 3.5) Control features (non-actionable)
# =============================
# Guest capacity as an integer.
df_f = df_f.withColumn("guests", F.expr("try_cast(guests as int)"))

# Bed count: occurrences of "bed" in the lower-cased arrangement text, 0 when the text is
# NULL. NOTE(review): this also counts words that merely contain "bed" (e.g. "bedroom") --
# confirm that is intended.
df_f = df_f.withColumn(
    "num_beds",
    F.when(F.col("arrangement_details").isNull(), 0)
    .otherwise(F.regexp_count(F.lower(F.col("arrangement_details")), F.lit("bed"))),
)

# Superhost indicator as 0/1; the source column name is spelled "is_supperhost".
df_f = df_f.withColumn("is_superhost", F.expr("CASE WHEN is_supperhost = TRUE THEN 1 ELSE 0 END"))

# Host rating and host review count as numeric columns.
df_f = df_f.withColumn("host_rating", F.expr("try_cast(host_rating as double)"))
df_f = df_f.withColumn("host_reviews", F.expr("try_cast(host_number_of_reviews as int)"))

# =============================
# Amenity interaction features
# =============================
# Candidate amenity pairs, written as flag-column names and grouped by theme.
INTERACTION_PAIRS = [
    # Work / business
    ("a_wifi", "a_dedicated_workspace"),
    ("a_wifi", "a_self_check_in"),
    # Kitchen completeness
    ("a_kitchen", "a_cooking_basics"),
    ("a_kitchen", "a_dishwasher"),
    ("a_oven", "a_baking_sheet"),
    ("a_coffee", "a_kettle"),
    # Outdoor / leisure
    ("a_patio_or_balcony", "a_outdoor_furniture"),
    ("a_bbq_grill", "a_outdoor_dining_area"),
    ("a_backyard", "a_bbq_grill"),
    ("a_private_backyard___fully_fenced", "a_pets_allowed"),
    # Views
    ("a_sea_view", "a_patio_or_balcony"),
    ("a_city_skyline_view", "a_patio_or_balcony"),
    ("a_garden_view", "a_outdoor_furniture"),
    # Long stay
    ("a_washing_machine", "a_drying_rack_for_clothing"),
    ("a_washing_machine", "a_iron"),
    # Family
    ("a_crib", "a_high_chair"),
    ("a_children_s_books_and_toys", "a_children_s_dinnerware"),
    # Comfort
    ("a_bed_linens", "a_extra_pillows_and_blankets"),
    ("a_room_darkening_shades", "a_ceiling_fan"),
    # Safety / trust
    ("a_smoke_alarm", "a_fire_extinguisher"),
    ("a_first_aid_kit", "a_smoke_alarm"),
]

# An interaction column (product of the two flags) is created only when both flag columns
# exist in the DataFrame; its name joins the two column names with "__x__".
valid_interactions = []

for left, right in INTERACTION_PAIRS:
    if not (left in amenity_set and right in amenity_set):
        continue
    inter_col = f"{left}__x__{right}"
    df_f = df_f.withColumn(inter_col, F.col(left) * F.col(right))
    valid_interactions.append(inter_col)

print("Created interaction features")

# =============================
# 4) Build Pandas dataset for regression
# =============================
# Flag-column name of every selected amenity.
amenity_features = list(map(col_name, amenities_to_test))

# Amenities that are excluded from upgrade recommendations (not actionable).
NON_ACTIONABLE = {"a_sea_view", "a_city_skyline_view", "a_garden_view", "a_elevator"}

# Interaction features are exactly the interaction columns created above.
interaction_features = valid_interactions

control_features = ["guests", "num_beds", "is_superhost", "host_rating", "host_reviews"]

# Keep only the candidate features that exist as columns of df_f.
feature_cols = list(
    filter(
        lambda name: name in df_f.columns,
        amenity_features + interaction_features + control_features,
    )
)

# Text-derived columns: description length and number of parsed reviews (0 when NULL).
df_f = df_f.withColumn("desc_len", F.length(F.coalesce(F.col("description"), F.lit(""))))
df_f = df_f.withColumn("num_reviews_text", F.coalesce(F.size(F.col("reviews_parsed")), F.lit(0)))


# Checkpoint the feature DataFrame.
OUTPUT_PATH = "dbfs:/airbnb/df_features_v1"

df_f.write.mode("overwrite").parquet(OUTPUT_PATH)

In [0]:
# Load Features
df_f = spark.read.parquet("dbfs:/airbnb/df_features_v1")


# Add the text-derived features to the feature list.
# The list is rebuilt with an order-preserving de-duplication instead of being extended in
# place, so re-running this cell does not append the same names again, and a name that
# occurs twice (e.g. two amenities whose names sanitise to the same column) is modeled once
# rather than as two identical columns.
feature_cols = list(dict.fromkeys(feature_cols + ["desc_len", "num_reviews_text"]))

# Define rating feature list (includes log_price)
feature_cols_rating = feature_cols.copy()
if "log_price" not in feature_cols_rating:
    feature_cols_rating.append("log_price")

# --- Train on a Spark sample only (NOT full dataset) ---
MODEL_SAMPLE_FRACTION = 0.1  # 10% (tune if you want)

# Seeded sample without replacement, reduced to the target and the model features.
df_model = (
    df_f
    .sample(withReplacement=False, fraction=MODEL_SAMPLE_FRACTION, seed=42)
    .select(["ratings"] + feature_cols_rating)
)

pdf_model = df_model.toPandas()
# Safeguard: keep only the first occurrence of any repeated column label.
pdf_model = pdf_model.loc[:, ~pdf_model.columns.duplicated()]

# Training subsets: rows with a known target for the price and the rating model.
pdf_price = pdf_model.dropna(subset=["log_price"]).copy()
pdf_rating = pdf_model.dropna(subset=["ratings"]).copy()


# =============================
# Helper functions for upgrade improvement calculations
# =============================
def price_improvement(current_price_per_night, price_multiplier):
    """Apply a price multiplier and report the resulting change.

    Returns a tuple (new_price_per_night, pct) where pct is the percentage
    change relative to the current price, or None when the current price is
    not positive.
    """
    new_price_per_night = current_price_per_night * price_multiplier
    if current_price_per_night > 0:
        pct = (new_price_per_night / current_price_per_night - 1.0) * 100.0
    else:
        pct = None
    return new_price_per_night, pct

def rating_improvement(current_rating, delta):
    """Add a rating delta, capped at MAX_RATING, and report the change.

    Returns a tuple (new_rating, pct) where pct is the percentage change
    relative to the current rating, or None when the current rating is not
    positive.
    """
    uncapped = current_rating + delta
    new_rating = MAX_RATING if uncapped > MAX_RATING else uncapped
    if current_rating > 0:
        pct = ((new_rating - current_rating) / current_rating) * 100.0
    else:
        pct = None
    return new_rating, pct

# (property_id is now always present earlier)

# ===== Rating-model feature list: the price-model features plus log_price =====
# NOTE(review): log_price is added unconditionally; USE_PRICE_IN_EXPECTED_RATING is not
# consulted here -- confirm whether it should be.
feature_cols_rating = feature_cols + [
    extra for extra in ("log_price",) if extra not in feature_cols
]

In [0]:
# Modeling + Bootstrap
# =============================
# 5) Ridge Regression - Price model: log(price)
# =============================
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils import resample

# Column order used for the price model's design matrix.
price_feature_order = list(feature_cols)

# Design matrix (missing values replaced by 0.0) and the log-price target.
X_price = pdf_price.loc[:, price_feature_order].fillna(0.0)
y_price = pdf_price["log_price"]

# Standardize the features, then fit ridge regression; RidgeCV chooses the
# regularization strength among RIDGE_ALPHAS.
ridge = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("ridge", RidgeCV(alphas=RIDGE_ALPHAS)),
    ]
)
ridge.fit(X_price, y_price)

# =============================
# Bootstrap-based confidence helper
# =============================
def bootstrap_variance_ridge(
    X, y, feature_names, base_pipeline,
    n_boot=N_BOOTSTRAP,
    random_state=BOOTSTRAP_RANDOM_STATE
):
    """Summarise the bootstrap distribution of ridge coefficients.

    For each of `n_boot` resamples (drawn with replacement via `resample`), a
    StandardScaler + Ridge pipeline is refit using the regularization strength already
    selected by `base_pipeline`'s "ridge" step, and the fitted coefficients are recorded.

    Parameters
    ----------
    X, y : feature matrix and target passed to `resample` and `fit`.
    feature_names : names of the columns of `X`, in column order.
    base_pipeline : fitted pipeline whose "ridge" step exposes `alpha_`.
    n_boot : number of bootstrap resamples.
    random_state : seed of the generator that produces the per-resample seeds.

    Returns
    -------
    dict mapping each feature name to a dict with keys "mean", "var", "std"
    (both with ddof=1), "ci_low" and "ci_high" (2.5th / 97.5th percentiles).

    Raises
    ------
    ValueError
        If the number of fitted coefficients differs from `len(feature_names)`
        (previously the surplus was silently dropped by `zip`).
    """
    feature_names = list(feature_names)
    rng = np.random.RandomState(random_state)

    # The regularization strength is fixed across resamples, so look it up once.
    alpha = base_pipeline.named_steps["ridge"].alpha_

    coef_samples = {f: [] for f in feature_names}

    for _ in range(n_boot):
        # Integer bound: randint is specified for integer arguments (was the float 1e9).
        X_b, y_b = resample(X, y, random_state=rng.randint(10**9))

        model = Pipeline([
            ("scaler", StandardScaler()),
            ("ridge", Ridge(alpha=alpha))
        ])
        model.fit(X_b, y_b)

        coefs = model.named_steps["ridge"].coef_.ravel()
        if len(coefs) != len(feature_names):
            raise ValueError(
                f"Got {len(coefs)} coefficients for {len(feature_names)} feature names"
            )

        for f, c in zip(feature_names, coefs):
            coef_samples[f].append(c)

    stats = {}
    for f, values in coef_samples.items():
        v = np.array(values)
        stats[f] = {
            "mean": v.mean(),
            "var": v.var(ddof=1),
            "std": v.std(ddof=1),
            "ci_low": np.percentile(v, 2.5),
            "ci_high": np.percentile(v, 97.5)
        }

    return stats

# Bootstrap summary statistics for every price-model coefficient.
price_stats = bootstrap_variance_ridge(X_price.values, y_price.values, feature_cols, ridge)

# Coefficients of the fitted ridge step, flattened to one value per feature.
# NOTE(review): the ridge step is fit on StandardScaler output, so these coefficients are
# per standard deviation of each feature; exp(coef) below is therefore the multiplier for a
# one-standard-deviation change, not for switching a 0/1 flag on -- confirm this is the
# intended interpretation.
coef_price = ridge.named_steps["ridge"].coef_.reshape(-1)

price_table = pd.DataFrame({
    "feature": X_price.columns.tolist(),
    "coef_log_price": coef_price,
})

# Derived multiplier plus the bootstrap statistics looked up by feature name.
price_table = price_table.assign(
    price_multiplier=np.exp(price_table["coef_log_price"]),
    coef_std=price_table["feature"].map(lambda name: price_stats[name]["std"]),
    coef_var=price_table["feature"].map(lambda name: price_stats[name]["var"]),
    ci_low=price_table["feature"].map(lambda name: price_stats[name]["ci_low"]),
    ci_high=price_table["feature"].map(lambda name: price_stats[name]["ci_high"]),
)

# =============================
# 6) Ridge Regression - Rating model
# =============================
# Column order used for the rating model's design matrix.
rating_feature_order = list(feature_cols_rating)

# Design matrix (missing values replaced by 0.0) and the rating target.
X_rating = pdf_rating.loc[:, rating_feature_order].fillna(0.0)
y_rating = pdf_rating["ratings"]

# Same pipeline shape as the price model: standardize, then RidgeCV over RIDGE_ALPHAS.
ridge_rating = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("ridge", RidgeCV(alphas=RIDGE_ALPHAS)),
    ]
)
ridge_rating.fit(X_rating, y_rating)

# Print verification for log_price in rating model
print(f"Rating model includes log_price: {'log_price' in feature_cols_rating}")

# Bootstrap summary statistics for every rating-model coefficient.
rating_stats = bootstrap_variance_ridge(
    X_rating.values, y_rating.values, feature_cols_rating, ridge_rating
)

# Coefficients of the fitted ridge step (fit on standardized features), one per feature.
coef_rating = ridge_rating.named_steps["ridge"].coef_.reshape(-1)

rating_table = pd.DataFrame({
    "feature": X_rating.columns.tolist(),
    "coef_rating": coef_rating,
})

# Bootstrap statistics looked up by feature name.
rating_table = rating_table.assign(
    coef_std=rating_table["feature"].map(lambda name: rating_stats[name]["std"]),
    coef_var=rating_table["feature"].map(lambda name: rating_stats[name]["var"]),
    ci_low=rating_table["feature"].map(lambda name: rating_stats[name]["ci_low"]),
    ci_high=rating_table["feature"].map(lambda name: rating_stats[name]["ci_high"]),
)

# =============================
# 7) Concise conclusions table (amenities only)
# =============================
# Left-join the rating coefficients onto the price coefficients; columns present in both
# tables get "_price" / "_rating" suffixes. Only single-amenity features are kept, the
# feature column is renamed to "amenity", and missing values become 0.0.
final_conclusions = (
    pd.merge(
        price_table,
        rating_table,
        how="left",
        on="feature",
        suffixes=("_price", "_rating"),
    )
    .loc[lambda table: table["feature"].isin(amenity_features)]
    .rename(columns={"feature": "amenity"})
    .fillna(0.0)
    .reset_index(drop=True)
)


print("\n=== Final Amenity Impact Conclusions (Ridge Regression, Controlled) ===")
print("Interpretation: multiplicative price effect & additive rating effect, controlling for size, host, and text quality.")
print(final_conclusions.to_string(index=False))

# =============================
# 7.1) Interaction effect tables (explicit output)
# =============================
# Interaction rows of the price table, largest multiplier first.
interaction_price_table = (
    price_table
    .loc[price_table["feature"].isin(interaction_features)]
    .sort_values("price_multiplier", ascending=False)
    .reset_index(drop=True)
)

# Interaction rows of the rating table, largest coefficient first.
interaction_rating_table = (
    rating_table
    .loc[rating_table["feature"].isin(interaction_features)]
    .sort_values("coef_rating", ascending=False)
    .reset_index(drop=True)
)


print("\n=== Amenity Interaction Effects – PRICE ===")
print("Interpretation: joint effect beyond individual amenities.")
print(
    "No interaction effects available."
    if interaction_price_table.empty
    else interaction_price_table.to_string(index=False)
)

print("\n=== Amenity Interaction Effects – RATING ===")
print("Interpretation: joint effect on guest satisfaction.")
print(
    "No interaction effects available."
    if interaction_rating_table.empty
    else interaction_rating_table.to_string(index=False)
)


def _pandas_to_spark(pdf):
    """Convert a pandas table to a Spark DataFrame, tolerating zero rows.

    Spark cannot infer a schema from an empty dataset, and the interaction tables are
    empty whenever no interaction feature was created. In that case the schema is built
    from the pandas dtypes: numeric columns become doubles, everything else strings.
    """
    if not pdf.empty:
        return spark.createDataFrame(pdf)
    fields = [
        StructField(
            name,
            DoubleType() if pd.api.types.is_numeric_dtype(pdf[name]) else StringType(),
            True,
        )
        for name in pdf.columns
    ]
    return spark.createDataFrame([], StructType(fields))


spark_final_conclusions = _pandas_to_spark(final_conclusions)
spark_interaction_price = _pandas_to_spark(interaction_price_table)
spark_interaction_rating = _pandas_to_spark(interaction_rating_table)

# Persist the summary tables.
spark_final_conclusions.write.mode("overwrite").parquet("dbfs:/airbnb/final_conclusions")
spark_interaction_price.write.mode("overwrite").parquet("dbfs:/airbnb/interaction_effects_price")
spark_interaction_rating.write.mode("overwrite").parquet("dbfs:/airbnb/interaction_effects_rating")

# Persist the full coefficient tables (reloaded by the recommendation cell).
_pandas_to_spark(price_table).write.mode("overwrite").parquet("dbfs:/airbnb/price_table")
_pandas_to_spark(rating_table).write.mode("overwrite").parquet("dbfs:/airbnb/rating_table")

print("Analysis tables saved to DBFS")

In [0]:
# Reload the coefficient tables written by the modeling cell, as pandas DataFrames.
price_table = spark.read.format("parquet").load("dbfs:/airbnb/price_table").toPandas()
rating_table = spark.read.format("parquet").load("dbfs:/airbnb/rating_table").toPandas()

# Property-level Recommendations

# =============================
# 9) PROPERTY-LEVEL UPGRADE RECOMMENDATIONS (SPARK-SAFE)
# =============================

# --------------------------------
# SAFETY: ensure unique feature rows
# --------------------------------
# One row per feature: groupby(...).first() keeps, per column, the first non-null value.
price_table = price_table.groupby("feature", as_index=False).first()
rating_table = rating_table.groupby("feature", as_index=False).first()

# Optional sanity check
assert price_table["feature"].is_unique
assert rating_table["feature"].is_unique

# --------------------------------
# Prepare Python-side lookup tables
# --------------------------------
# feature -> {column: value} dictionaries for the per-row Python function.
price_table_py = price_table.set_index("feature").to_dict(orient="index")
rating_table_py = rating_table.set_index("feature").to_dict(orient="index")

# --------------------------------
# Broadcast everything needed
# --------------------------------
# price_stats / rating_stats are the in-memory bootstrap results of the modeling cell;
# unlike the two tables above they are not reloaded from parquet.
price_stats_bc = spark.sparkContext.broadcast(price_stats)
rating_stats_bc = spark.sparkContext.broadcast(rating_stats)
price_table_bc = spark.sparkContext.broadcast(price_table_py)
rating_table_bc = spark.sparkContext.broadcast(rating_table_py)
amenity_features_bc = spark.sparkContext.broadcast(set(amenity_features))
interaction_features_bc = spark.sparkContext.broadcast(set(interaction_features))
non_actionable_bc = spark.sparkContext.broadcast(set(NON_ACTIONABLE))

# --------------------------------
# Core computation per property
# --------------------------------
def _stat_fields(stat):
    """Return one coefficient's bootstrap statistics as plain floats, keyed for output."""
    return {
        "coef_mean": float(stat["mean"]),
        "coef_std": float(stat["std"]),
        "coef_var": float(stat["var"]),
        "ci_low": float(stat["ci_low"]),
        "ci_high": float(stat["ci_high"]),
    }


def _upgrade_candidates(row, coef_table, coef_key):
    """List the features with a positive coefficient that the listing could still add.

    Single amenities come first and interaction pairs second, each group in the
    iteration order of `coef_table`. A single amenity qualifies when it is actionable
    and its flag in `row` is 0 (or absent); an interaction qualifies when both of its
    amenities are actionable and both flags are 0 (or absent).
    """
    amenity_names = amenity_features_bc.value
    interaction_names = interaction_features_bc.value
    blocked = non_actionable_bc.value

    singles = [
        f for f, r in coef_table.items()
        if f in amenity_names
        and f not in blocked
        and row.get(f, 0) == 0
        and r[coef_key] > 0
    ]

    pairs = []
    for f, r in coef_table.items():
        if f in interaction_names and r[coef_key] > 0:
            a, b = f.split("__x__")
            if (
                row.get(a, 0) == 0 and row.get(b, 0) == 0
                and a not in blocked
                and b not in blocked
            ):
                pairs.append(f)

    return singles + pairs


def compute_recommendations_for_row(row):
    """Build the price and rating upgrade lists for one property row.

    Parameters
    ----------
    row : Spark Row carrying host_id, property_id, the price column, ratings and the
        amenity flag columns.

    Returns
    -------
    dict with "host_id", "property_id", "price_upgrades_all" and "rating_upgrades_all".
    Each upgrade entry holds the feature name, the estimated delta and the bootstrap
    statistics of its coefficient; both lists are sorted by delta, largest first, and
    are empty when the row has no price or no rating.

    The price delta is current_price * (exp(coef) - 1); the rating delta is the
    coefficient capped so that the rating does not exceed MAX_RATING.
    NOTE(review): for interaction features only the interaction coefficient enters the
    delta, not the two single-amenity coefficients -- confirm this is intended.
    """
    row = row.asDict()

    pid = int(row["property_id"]) if row.get("property_id") is not None else None
    host_id = row.get("host_id")

    current_price_per_night = row.get(PRICE_COL)
    current_rating = row.get("ratings")

    if current_price_per_night is None or current_rating is None:
        return {
            "host_id": host_id,
            "property_id": pid,
            "price_upgrades_all": [],
            "rating_upgrades_all": [],
        }

    # ---------- PRICE: single amenities, then interactions ----------
    price_table_py = price_table_bc.value
    price_upgrades = []
    for f in _upgrade_candidates(row, price_table_py, "coef_log_price"):
        coef = price_table_py[f]["coef_log_price"]
        price_delta = current_price_per_night * (math.exp(coef) - 1)
        price_upgrades.append({
            "feature": f,
            "price_delta_usd": float(price_delta),
            **_stat_fields(price_stats_bc.value[f]),
        })

    # ---------- RATING: single amenities, then interactions ----------
    rating_table_py = rating_table_bc.value
    rating_upgrades = []
    for f in _upgrade_candidates(row, rating_table_py, "coef_rating"):
        rating_delta = min(rating_table_py[f]["coef_rating"], MAX_RATING - current_rating)
        rating_upgrades.append({
            "feature": f,
            "rating_delta": float(rating_delta),
            **_stat_fields(rating_stats_bc.value[f]),
        })

    return {
        "host_id": host_id,
        "property_id": pid,
        "price_upgrades_all": sorted(
            price_upgrades,
            key=lambda x: x["price_delta_usd"],
            reverse=True
        ),
        "rating_upgrades_all": sorted(
            rating_upgrades,
            key=lambda x: x["rating_delta"],
            reverse=True
        ),
    }

# --------------------------------
# Apply to ALL properties (Spark)
# --------------------------------
# Only the identifiers, price, rating and amenity flags are passed to the row function.
_reco_input_cols = ["host_id", "property_id", PRICE_COL, "ratings", *amenity_features]
spark_recommendations = df_f.select(*_reco_input_cols).rdd.map(compute_recommendations_for_row)


# --------------------------------
# Schema
# --------------------------------
def _upgrade_entry_type(delta_field):
    """Struct type of one upgrade entry; `delta_field` names its delta column."""
    stat_fields = ("coef_mean", "coef_std", "coef_var", "ci_low", "ci_high")
    return StructType(
        [
            StructField("feature", StringType()),
            StructField(delta_field, DoubleType()),
        ]
        + [StructField(stat_name, DoubleType()) for stat_name in stat_fields]
    )


recommendation_schema = StructType([
    StructField("host_id", StringType(), True),
    StructField("property_id", LongType(), True),
    StructField("price_upgrades_all", ArrayType(_upgrade_entry_type("price_delta_usd"))),
    StructField("rating_upgrades_all", ArrayType(_upgrade_entry_type("rating_delta"))),
])

# --------------------------------
# Create Spark DataFrame
# --------------------------------
spark_with_recos = spark.createDataFrame(spark_recommendations, schema=recommendation_schema)

# --------------------------------
# SAVE (PARQUET ONLY)
# --------------------------------
(
    spark_with_recos.write
    .mode("overwrite")
    .parquet("dbfs:/airbnb/property_upgrade_recommendations")
)

In [0]:
# =============================
# DEBUG PRINT: 3 SAMPLE HOSTS
# =============================

def _pretty_feature(feature):
    """Drop the leading "a_" marker of each amenity in a feature name, for display.

    Only the prefix is removed. The previous `.replace('a_', '')` also deleted "a_"
    inside names (e.g. "a_sea_view" was shown as "seview").
    """
    return "__x__".join(
        part[2:] if part.startswith("a_") else part
        for part in feature.split("__x__")
    )


# 1) Pick 3 distinct host_ids (cheap action)
sample_host_ids = [
    r["host_id"]
    for r in (
        spark_with_recos
        .select("host_id")
        .where(F.col("host_id").isNotNull())
        .distinct()
        .limit(3)
        .collect()
    )
]

print("\n=== DEBUG: SAMPLE HOST UPGRADE RECOMMENDATIONS ===")

# 2) Print recommendations for those hosts only
for hid in sample_host_ids:
    print("\n==============================")
    print(f"HOST ID: {hid}")
    print("==============================")

    host_rows = (
        spark_with_recos
        .where(F.col("host_id") == hid)
        .limit(3)   # limit properties per host to keep output readable
        .collect()
    )

    for r in host_rows:
        print(f"\nProperty ID: {r['property_id']}")

        print("  PRICE upgrades (top 3 shown):")
        for u in (r["price_upgrades_all"] or [])[:3]:
            print(
                f"   - {_pretty_feature(u['feature'])}: "
                f"+${u['price_delta_usd']:.2f} per night "
                f"| std={u['coef_std']:.4f} | var={u['coef_var']:.4f} "
                f"| CI=[{u['ci_low']:.4f}, {u['ci_high']:.4f}]"
            )

        print("  RATING upgrades (top 3 shown):")
        for u in (r["rating_upgrades_all"] or [])[:3]:
            print(
                f"   - {_pretty_feature(u['feature'])}: "
                f"+{u['rating_delta']:.3f} rating | std={u['coef_std']:.4f} | var={u['coef_var']:.4f} | CI=[{u['ci_low']:.4f}, {u['ci_high']:.4f}]"
            )

In [0]:
# Load final conclusions from DBFS
final_conclusions = (
    spark.read
    .parquet("dbfs:/airbnb/final_conclusions")
    .toPandas()
)

# Optional: clean amenity names for plotting.
# Only the leading "a_" marker is stripped; a plain substring replace would also delete
# "a_" inside names (e.g. "a_sea_view" -> "seview"). Remaining underscores become spaces.
final_conclusions["amenity_clean"] = (
    final_conclusions["amenity"]
    .str.replace(r"^a_", "", regex=True)
    .str.replace("_", " ", regex=False)
)

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt

# --- Modern poster theme ---
sns.set_theme(
    style="white",
    font_scale=1.05,
    rc={
        "axes.grid": False,
        "axes.spines.right": False,
        "axes.spines.top": False,
    }
)

# -----------------------------
# Select top + bottom amenities
# -----------------------------
top_pos = final_conclusions.sort_values("coef_log_price", ascending=False).head(5)
top_neg = final_conclusions.sort_values("coef_log_price", ascending=True).head(5)

# With fewer than ten amenities the two selections overlap. Repeated amenities are dropped
# so that every row maps to exactly one bar; the error bars and labels below are placed
# by row position and would otherwise be misaligned.
plot_df = (
    pd.concat([top_neg, top_pos])
    .drop_duplicates(subset="amenity_clean")
    .sort_values("coef_log_price")
)

# -----------------------------
# Color by sign
# -----------------------------
plot_df["bar_color"] = plot_df["coef_log_price"].apply(
    lambda x: "#4C72B0" if x > 0 else "#DD8452"  # blue / orange
)

# -----------------------------
# Plot
# -----------------------------
plt.figure(figsize=(7,4))

ax = sns.barplot(
    data=plot_df,
    x="coef_log_price",
    y="amenity_clean",
    palette=plot_df["bar_color"].tolist(),
    alpha=0.85
)

# --- Subtle vertical grid ---
ax.xaxis.grid(True, color="#E6E6E6", linewidth=0.8)
ax.yaxis.grid(False)

# -----------------------------
# Error bars (bootstrap std of the price coefficient) and
# coefficient annotations (anchored to zero, on the side opposite the bar)
# -----------------------------
for i, row in enumerate(plot_df.itertuples()):
    coef = row.coef_log_price

    ax.errorbar(
        x=coef,
        y=i,
        xerr=row.coef_std_price,
        fmt="none",
        ecolor="#555555",
        capsize=2,
        linewidth=0.9,
        alpha=0.8
    )

    if coef > 0:
        x_text, ha = -0.004, "right"
    else:
        x_text, ha = 0.004, "left"

    ax.text(
        x_text,
        i,
        f"{coef:+.3f}",
        va="center",
        ha=ha,
        fontsize=9.5,
        fontweight="semibold",
        color="#2F2F2F"
    )

# --- Zero line ---
ax.axvline(0, color="#444444", linewidth=1.2)

# -----------------------------
# Labels & title
# -----------------------------
ax.set_xlabel("Effect on price", fontsize=11)
ax.set_ylabel("")
ax.set_title(
    "Top Positive and Negative Amenity Effects",
    fontsize=13,
    weight="semibold",
    pad=8
)

sns.despine(left=True, bottom=True)
plt.tight_layout()
plt.show()

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt

# Poster theme: white background, fonts scaled by 1.05, no default grid and
# no top/right spines.
poster_rc = {
    "axes.grid": False,
    "axes.spines.right": False,
    "axes.spines.top": False,
}
sns.set_theme(style="white", font_scale=1.05, rc=poster_rc)

# Number of amenities shown in the scatter plot.
TOP_K = 12  # adjust if needed (6–10 works best for posters)

# Combined impact = |price coefficient| + |rating coefficient|.
# No significance filter is applied here; every row of final_conclusions is ranked.
tradeoff_df = final_conclusions.assign(
    impact=lambda d: d["coef_log_price"].abs() + d["coef_rating"].abs()
)

# Keep the TOP_K rows with the highest combined impact.
ranked_by_impact = tradeoff_df.sort_values("impact", ascending=False)
plot_df = ranked_by_impact.head(TOP_K)

# Scatter: price coefficient on x, rating coefficient on y.
fig, ax = plt.subplots(figsize=(6.5, 4.5))
sns.scatterplot(
    data=plot_df,
    x="coef_log_price",
    y="coef_rating",
    s=90,
    color="#4C72B0",
    alpha=0.9,
    ax=ax,
)

# Reference lines through the origin split the plane into four quadrants.
ax.axvline(0, color="#444444", linewidth=1.2)
ax.axhline(0, color="#444444", linewidth=1.2)

# Light grid on both axes.
for axis in (ax.xaxis, ax.yaxis):
    axis.grid(True, color="#E6E6E6", linewidth=0.8)

# Label every plotted point with its amenity name, just below the marker.
point_columns = zip(
    plot_df["coef_log_price"],
    plot_df["coef_rating"],
    plot_df["amenity_clean"],
)
for price_coef, rating_coef, amenity_label in point_columns:
    ax.text(
        price_coef,
        rating_coef - 0.0002,  # slight vertical offset downward
        amenity_label,
        fontsize=9.5,
        fontweight="semibold",
        ha="center",
        va="top",
        alpha=0.95,
    )

# Axis labels and title.
ax.set_xlabel("Effect on price", fontsize=11)
ax.set_ylabel("Effect on rating", fontsize=11)
ax.set_title(
    "Price vs Rating Tradeoff",
    fontsize=13,
    weight="semibold",
    pad=8,
)

sns.despine()
fig.tight_layout()
plt.show()

# EDA

In [0]:
from pyspark.sql import functions as F

# -----------------------------
# Dataset summary statistics
# -----------------------------

# Orders = number of rows
orders = airbnb.count()

# Hosts: distinct seller_id values parsed from the seller_info JSON string.
# Rows whose JSON is missing or does not yield a seller_id are dropped so that
# a null id is not counted as one extra host.
hosts = (
    airbnb
    .where(F.col("seller_info").isNotNull())
    .select(F.from_json("seller_info", "struct<seller_id:string>").alias("s"))
    .select("s.seller_id")
    .where(F.col("seller_id").isNotNull())
    .distinct()
    .count()
)

# Properties (prefer property_id / listing_id if exists)
property_id_candidates = ["property_id", "listing_id", "id"]
property_id_col = next(
    (c for c in property_id_candidates if c in airbnb.columns),
    None
)

# Fail with a readable message instead of an obscure error from select(None).
if property_id_col is None:
    raise ValueError(
        "Cannot count properties: none of the columns "
        f"{property_id_candidates} exist in the airbnb dataframe."
    )

properties = (
    airbnb
    .select(property_id_col)
    .distinct()
    .count()
)

# Amenities: distinct lower-cased item names across all amenity groups.
amenities_schema = "array<struct<group_name:string, items:array<struct<name:string,value:string>>>>"

amenities_count = (
    airbnb
    .withColumn("amenities_parsed", F.from_json("amenities", amenities_schema))
    .withColumn("amenity", F.explode(F.expr("flatten(transform(amenities_parsed, g -> g.items))")))
    .select(F.lower("amenity.name").alias("amenity_name"))
    # Items without a name would otherwise be counted as one more amenity.
    .where(F.col("amenity_name").isNotNull())
    .distinct()
    .count()
)

# Parse reviews array.
# size() yields -1 (legacy setting) or null for a null array; clamping with
# greatest(..., 0) makes listings without a parsable reviews value count as
# zero reviews instead of dragging the total and the average down.
reviews_df = (
    airbnb
    .withColumn("reviews_parsed", F.from_json("reviews", "array<string>"))
    .withColumn("num_reviews", F.greatest(F.size("reviews_parsed"), F.lit(0)))
)

# Total and average reviews per row, computed in a single Spark job.
review_stats = (
    reviews_df
    .agg(
        F.sum("num_reviews").alias("total"),
        F.avg("num_reviews").alias("avg"),
    )
    .first()
)

# Both aggregates are null on an empty dataframe; fall back to zero so the
# formatted report below still prints.
total_reviews = review_stats["total"] or 0
avg_reviews_per_property = review_stats["avg"] or 0.0

print("Airbnb Data Statistics")
print("----------------------")
print(f"Orders: {orders:,}")
print(f"Hosts: {hosts:,}")
print(f"Properties: {properties:,}")
print(f"Amenities: {amenities_count:,}")
print(f"Reviews: {total_reviews:,}")
print(f"Reviews per Property (avg): {avg_reviews_per_property:.2f}")

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import functions as F

# Poster look for the EDA figures: white background, fonts scaled by 1.05,
# no default grid, and no top/right spines.
poster_rc = {
    "axes.grid": False,
    "axes.spines.right": False,
    "axes.spines.top": False,
}
sns.set_theme(style="white", font_scale=1.05, rc=poster_rc)

In [0]:
# Collect the non-null ratings (cast to double) into pandas for plotting.
rating_as_double = F.col("ratings").cast("double").alias("rating")
ratings_df = (
    airbnb
    .select(rating_as_double)
    .filter(F.col("rating").isNotNull())
    .toPandas()
)

# Histogram of ratings with 20 equal-width bins.
fig, ax = plt.subplots(figsize=(4.5, 3))
sns.histplot(
    data=ratings_df,
    x="rating",
    bins=20,
    color="#4C72B0",
    edgecolor="white",
    ax=ax,
)

ax.set_xlabel("Rating")
ax.set_ylabel("Number of Listings")
ax.set_title("Rating Distribution")

sns.despine()
fig.tight_layout()
plt.show()

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Price bins are left-closed / right-open ([0, 50), [50, 100), ...).
# The last edge doubles as the price cap so the filter and the bins cannot
# drift apart. The final label spells out its real range: the data is capped
# below 500, so an open-ended "300+" label would misdescribe the bar.
bins = [0, 50, 100, 150, 200, 300, 500]
labels = ["0–50", "50–100", "100–150", "150–200", "200–300", "300–500"]
price_cap = bins[-1]

# Prepare price data: positive prices strictly below the cap.
price_df = (
    df
    .select(F.col("price_per_night").cast("double").alias("price"))
    .where((F.col("price") > 0) & (F.col("price") < price_cap))
    .toPandas()
)

# Assign every listing to its price range.
price_df["price_range"] = pd.cut(
    price_df["price"],
    bins=bins,
    labels=labels,
    right=False
)

# Plot: number of listings per price range.
fig, ax = plt.subplots(figsize=(5, 3.5))

sns.countplot(
    data=price_df,
    x="price_range",
    color="#4C72B0",
    ax=ax
)

# Horizontal guide lines only.
ax.yaxis.grid(True, color="#E6E6E6", linewidth=0.8)
ax.xaxis.grid(False)

ax.set_xlabel("Price per Night ($)")
ax.set_ylabel("Number of Listings")
ax.set_title("Price Distribution by Range")

# Rotate price range labels. Vertical labels are centered on their tick;
# right-aligning them at 90 degrees would shift each label off its bar.
plt.setp(ax.get_xticklabels(), rotation=90, ha="center")

sns.despine(left=True, bottom=True)
fig.tight_layout()
plt.show()

In [0]:
# Layout of the JSON stored in the `amenities` column: a list of groups, each
# carrying a list of name/value items.
amenities_schema = """
array<struct<
    group_name:string,
    items:array<struct<name:string,value:string>>
>>
"""

# Column expressions: parsed groups, and all items of all groups in one array.
parsed_groups = F.from_json("amenities", amenities_schema)
all_items = F.expr("flatten(transform(amenities_parsed, g -> g.items))")

# One row per amenity item, counted by lower-cased name; keep the 10 most frequent.
amenity_counts = (
    airbnb
    .withColumn("amenities_parsed", parsed_groups)
    .withColumn("amenity", F.explode(all_items))
    .select(F.lower("amenity.name").alias("amenity"))
    .groupBy("amenity")
    .count()
    .orderBy(F.col("count").desc())
    .limit(10)
)
top_amenities = amenity_counts.toPandas()

# Horizontal bar chart of the counts.
fig, ax = plt.subplots(figsize=(5, 3.5))
sns.barplot(
    data=top_amenities,
    x="count",
    y="amenity",
    color="#4C72B0",
    alpha=0.85,
    ax=ax,
)

ax.set_xlabel("Number of Listings")
ax.set_ylabel("")
ax.set_title("Top 10 Most Common Amenities")

sns.despine()
fig.tight_layout()
plt.show()

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import functions as F
import numpy as np

# -----------------------------
# Poster theme
# -----------------------------
sns.set_theme(
    style="white",
    font_scale=1.05,
    rc={
        "axes.grid": False,
        "axes.spines.right": False,
        "axes.spines.top": False,
    }
)

# -----------------------------
# Prepare data
# -----------------------------
# Price bins are left-closed / right-open. The last edge is also the price
# cap, so the filter and the bins always agree. Filtering with <= 500 while
# binning up to 1000 would leave a "500+" bin that could only ever hold
# listings priced at exactly 500.
bins = [0, 50, 100, 150, 200, 300, 500]
labels = ["0–50", "50–100", "100–150", "150–200", "200–300", "300–500"]
price_cap = bins[-1]

# Keep positive prices below the cap and ratings inside the 1–5 scale.
eda_df = (
    df
    .select(
        F.col("price_per_night").cast("double").alias("price"),
        F.col("ratings").cast("double").alias("rating")
    )
    .where(
        (F.col("price") > 0) &
        (F.col("price") < price_cap) &
        (F.col("rating").between(1, 5))
    )
    .toPandas()
)

# Price ranges
eda_df["price_range"] = pd.cut(
    eda_df["price"],
    bins=bins,
    labels=labels,
    right=False
)

# Aggregate: mean + CI.
# observed=True keeps only price ranges that contain listings, so no NaN
# means (and no "nan" annotations) reach the plot.
summary = (
    eda_df
    .groupby("price_range", observed=True)
    .agg(
        mean_rating=("rating", "mean"),
        std_rating=("rating", "std"),
        n=("rating", "count")
    )
    .reset_index()
)

# Half-width of a normal-approximation 95% interval for the mean.
summary["ci"] = 1.96 * summary["std_rating"] / np.sqrt(summary["n"])

# -----------------------------
# Plot
# -----------------------------
fig, ax = plt.subplots(figsize=(5.2, 3.4))

# Plain strings give matplotlib an ordinary categorical axis (positions 0..n-1
# in row order, which follows the bin order).
x_labels = summary["price_range"].astype(str)

# Points + confidence intervals
ax.errorbar(
    x=x_labels,
    y=summary["mean_rating"],
    yerr=summary["ci"],
    fmt="o",
    color="#4C72B0",
    ecolor="#4C72B0",
    elinewidth=1.2,
    capsize=3,
    markersize=6
)

# Subtle connecting trend line
ax.plot(
    x_labels,
    summary["mean_rating"],
    color="#4C72B0",
    linewidth=1,
    alpha=0.6
)

# Subtle horizontal grid
ax.yaxis.grid(True, color="#E6E6E6", linewidth=0.8)

# Annotate only the lowest and highest price ranges present.
# The set avoids a double label when there is a single range; the emptiness
# check avoids indexing into an empty summary.
if not summary.empty:
    for pos in sorted({0, len(summary) - 1}):
        mean_value = summary["mean_rating"].iloc[pos]
        ax.text(
            pos,
            mean_value + 0.01,
            f"{mean_value:.2f}",
            ha="center",
            va="bottom",
            fontsize=9,
            fontweight="semibold"
        )

# Labels & title
ax.set_xlabel("Price per Night ($)")
ax.set_ylabel("Average Rating")
ax.set_title(
    "Average Rating by Price Range",
    fontsize=13,
    weight="semibold",
    pad=8
)

ax.set_ylim(4.5, 5.05)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

sns.despine()
fig.tight_layout()
plt.show()