<a href="https://colab.research.google.com/github/torbenbillow/CBS-AML-PROJECT/blob/main/notebooks/00_welcome.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Import all the relevant libraries below to process, explore, and model data.*

In [140]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, Polygon

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, SVC

from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score,
                             accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report,
                             roc_auc_score, roc_curve, max_error, mean_absolute_percentage_error)

from google.colab import drive
pd.options.mode.copy_on_write = True
import os
os.chdir("/content/drive/MyDrive/AML_Exam")


In [296]:
# Load data and features
listings_raw = pd.read_csv("listings.csv")
feature_selection = pd.read_excel("features.xlsx")

# DATA WRANGLING AND FEATURE ENGINEERING

In [297]:
# Load data as df
df = listings_raw.copy()

# Remove listings without reviews in 2025 and no occupancy in the last year. Listing should also be available now
df = df.query(
    "last_review >= '2025-01-01' and estimated_occupancy_l365d > 0 and has_availability == 't'"
)

In [298]:
# Check sizes of data and filtered data
print("Before filtering:",listings_raw.shape, "\n After filtering:",df.shape)

Before filtering: (22684, 79) 
 After filtering: (10124, 79)


## Price

In [299]:
# Remove $, commas, and spaces, then convert to numeric
price_clean = (
    df["price"]
    .astype(str)                             # handle existing ints / NA
    .str.replace(r'[\$,]', '', regex=True)   # remove $ and commas
    .str.strip()
)

# Convert to numeric, coercing bad values (like '<NA>') to NaN
price_numeric = pd.to_numeric(price_clean, errors="coerce")

# Store back as nullable integer
df["price"] = price_numeric.astype("Int64")

## Boolean columns

In [300]:
# Some cols have boolean values "f" and "t", we change to 0 or 1

bool_cols = [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "has_availability",
    "instant_bookable"
]

for col in bool_cols:
    df[col] = (
        df[col]
        .map({"t": 1, "f": 0, "true": 1, "false": 0, True: 1, False: 0})
        .astype("Int64")
    )

# fill missing superhost values
df["host_is_superhost"] = df["host_is_superhost"].fillna(0).astype("Int64")


In [335]:
# 257 listings have unknown verification status, profile picture and listing count
df[["host_listings_count",
"host_has_profile_pic",
"host_identity_verified"]].isna().sum()

Unnamed: 0,0
host_listings_count,0
host_has_profile_pic,0
host_identity_verified,0


In [328]:
# A quick check of the pages with unknown verification status shows that most do indeed have both profile pictures and verification.
# Verification and profile pic rates are very high in general
# For listings, we can safely assume one since we have at least one property listed from these users
# Thus, we set all these features to 1

In [332]:
df["host_has_profile_pic"].agg("mean")

np.float64(0.9702890522711249)

In [333]:
df["host_identity_verified"].agg("mean")

np.float64(0.9114714472756572)

In [334]:
for col in ["host_listings_count","host_has_profile_pic","host_identity_verified"]:
  df[col] = df[col].fillna(1)

In [336]:
df[["host_listings_count",
"host_has_profile_pic",
"host_identity_verified"]].isna().sum()

Unnamed: 0,0
host_listings_count,0
host_has_profile_pic,0
host_identity_verified,0


## Missing bed and bedroom measures

In [337]:
# There are three measures for how many people a property fits:
# 1. accommodates = max number of guests
# 2. bedrooms = number of bedrooms
# 3. beds = number of beds

df[['accommodates','bedrooms','beds']].isna().sum()

# accommodates has no missingness, while the others do, so we use combinations of them to impute values
# We will use all for the training

Unnamed: 0,0
accommodates,0
bedrooms,0
beds,0


In [338]:
# Only use rows where all three variables are valid (non-missing, non-zero)
valid = df[
    (df['accommodates'] > 0) &
    (df['beds'] > 0) &
    (df['bedrooms'] > 0)
]

# Compute ratios
valid['guests_per_bed'] = valid['accommodates'] / valid['beds']
valid['beds_per_bedroom'] = valid['beds'] / valid['bedrooms']
valid['guests_per_bedroom'] = valid['accommodates'] / valid['bedrooms']

# Get averages and medians
summary = valid[['guests_per_bed', 'beds_per_bedroom', 'guests_per_bedroom']].agg(['mean', 'median'])
print(summary)

# Use medians for integers
guests_per_bed = summary.loc['median', 'guests_per_bed']
beds_per_bedroom = summary.loc['median', 'beds_per_bedroom']
guests_per_bedroom = summary.loc['median', 'guests_per_bedroom']

# Impute median values where missing
# For missing beds but nonmissing bedrooms, impute median beds per bedroom
df.loc[df['beds'].isna() & df['bedrooms'].notna(), 'beds'] = df['bedrooms'] * beds_per_bedroom

# For remaining missing beds, divide mazx guest count by median guests per bed
df.loc[df['beds'].isna(), 'beds'] = df['accommodates'] / guests_per_bed

# For missing bedrooms but nonmissing bedrooms, divide beds by beds per bedroom
df.loc[df['bedrooms'].isna() & df['beds'].notna(), 'bedrooms'] = df['beds'] / beds_per_bedroom

# For remaining missing bedrooms, divide max guest count by median guests per bedroom
df.loc[df['bedrooms'].isna(), 'bedrooms'] = df['accommodates'] / guests_per_bedroom

# Re-check missingness
df[['accommodates','bedrooms','beds']].isna().sum()

        guests_per_bed  beds_per_bedroom  guests_per_bedroom
mean          2.033974          1.172938            2.246526
median        2.000000          1.000000            2.000000


Unnamed: 0,0
accommodates,0
bedrooms,0
beds,0


## Bathrooms

In [339]:
# Some properties are missing a number of bathrooms

df.loc[df["bathrooms"].isna(),["bathrooms","bathrooms_text"]].head(10)

Unnamed: 0,bathrooms,bathrooms_text


In [340]:
# Extract the numeric part from bathrooms_text
bathrooms_from_text = (
    df["bathrooms_text"]
    .astype(str)
    .str.extract(r'(\d+(\.\d+)?)')[0]   # capture integers or decimals
    .astype(float)
)

# Fill missing values in bathrooms with extracted numbers
df["bathrooms"] = df["bathrooms"].fillna(bathrooms_from_text)

In [341]:
df["bathrooms"].isna().sum()

np.int64(0)

In [342]:
# Fill rest of missing values with median bathrooms per guest

baths = df[["bathrooms", "accommodates"]]
baths["baths_per_guests"] = baths["bathrooms"] / baths["accommodates"]

median_baths_per_guests = baths["baths_per_guests"].agg("median")

df.loc[df['bathrooms'].isna(), 'bathrooms'] = df['accommodates'] * median_baths_per_guests

df["bathrooms"] = df["bathrooms"].round(0)

## Description

In [343]:
# Description has some missing values
df["description"].isna().sum()

np.int64(181)

In [344]:
# Make flag for missing description
df['description_missing'] = df['description'].isna().astype(int)

# Add description length
df['description_length'] = df['description'].fillna('').str.len()

# The description feature itself will be dropped later as we are not doing any text analysis

## Host tenure

In [345]:
# Data has a host_since feature. Let's use it to create a tenure

df['host_since'] = pd.to_datetime(df['host_since'], errors='coerce')
latest_scrape = pd.to_datetime(df['last_scraped']).max()
df['host_tenure_days'] = (latest_scrape - df['host_since']).dt.days

In [346]:
df["host_tenure_days"].isna().sum()

np.int64(257)

In [347]:
# For missing tenure, we assume 0
df['host_tenure_days'] = df['host_tenure_days'].fillna(0)

# Flag for missing host_since
df['host_since_missing'] = df['host_since'].isna().astype(int)

## Host description

In [348]:
# Make flag for host about
df['host_about_missing'] = df['host_about'].isna().astype(int)

# Host about length
df['host_about_length'] = df['host_about'].fillna('').str.len()

# The host description feature itself will be dropped later as we are not doing any text analysis

## Response and acceptance rate

In [349]:
# Format as float, add missing flag, fill with median

for col in ["host_response_rate", "host_acceptance_rate"]:
    temp = df[col].astype(str).str.strip().str.rstrip('%').replace('', np.nan)
    df[col] = pd.to_numeric(temp, errors='coerce')
    df[f"{col}_missing"] = df[col].isna().astype(int)
    df[col] = df[col].fillna(df[col].median())

## Response time

In [350]:
# Response time is a category
# Fill unknown or missing response time with category "unknown"

df['host_response_time'] = df['host_response_time'].fillna('unknown')

## Location

In [136]:
# Neighborhood data is not very granular
# Long and lat used to create hexbins

# --- Imports ---
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt

# --- 1. Create GeoDataFrame from lat / lon ---

# df is your original pandas DataFrame with 'latitude' and 'longitude'
gdf = gpd.GeoDataFrame(
    df.copy(),
    geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
    crs="EPSG:4326"   # WGS84 (lat/lon)
)

# Project to a metric CRS (UTM zone – here: 32N, good for Denmark/southern Sweden)
gdf = gdf.to_crs(epsg=32632)


# --- 2. Helper: build a single regular hexagon around a center ---

def make_hexagon(cx, cy, radius):
    """
    Create a pointy-top regular hexagon centered at (cx, cy)
    with given radius (distance from center to each vertex).
    """
    # Pointy-top: start at 30° and step by 60°
    angles = np.deg2rad(np.arange(0, 360, 60) + 30)
    coords = [(cx + radius * np.cos(a), cy + radius * np.sin(a)) for a in angles]
    return Polygon(coords)


# --- 3. Build a hex grid over the extent of gdf ---

def make_hex_grid(gdf, radius):
    """
    Create a pointy-top hexagon grid covering the extent of gdf.
    radius = distance from hex center to each vertex (in CRS units, e.g. meters).
    """
    xmin, ymin, xmax, ymax = gdf.total_bounds

    # Pointy-top spacing (Red Blob Games):
    # horizontal distance between centers = sqrt(3) * radius
    # vertical distance between rows = 1.5 * radius
    dx = np.sqrt(3) * radius
    dy = 1.5 * radius

    cols = np.arange(xmin - dx, xmax + dx, dx)
    rows = np.arange(ymin - dy, ymax + dy, dy)

    hexes = []
    for row_idx, cy in enumerate(rows):
        for col_idx, cx in enumerate(cols):
            # Offset every second row by half the horizontal spacing
            cx_shifted = cx + (dx / 2.0 if row_idx % 2 == 1 else 0.0)
            hex_poly = make_hexagon(cx_shifted, cy, radius)
            hexes.append(hex_poly)

    hex_grid = gpd.GeoDataFrame(
        {"hex_id": range(len(hexes))},
        geometry=hexes,
        crs=gdf.crs
    )
    return hex_grid


# --- 4. Generate hex grid + optional trimming ---

hex_radius = 250  # meters
hex_grid = make_hex_grid(gdf, hex_radius)

# Optional trimming to a buffered convex hull of your listings
study_area = gdf.geometry.union_all().convex_hull.buffer(2 * hex_radius)
hex_grid = hex_grid[hex_grid.intersects(study_area)].reset_index(drop=True)
hex_grid["hex_id"] = hex_grid.index  # reindex after trimming

# Clean any leftover sjoin columns if they exist
for frame in (gdf, hex_grid):
    for col in ("index_left", "index_right"):
        if col in frame.columns:
            frame.drop(columns=col, inplace=True)


# --- 5. Spatial join: assign each listing to a hex ---

joined = gpd.sjoin(
    gdf,
    hex_grid[["hex_id", "geometry"]],
    how="left",
    predicate="within"   # use "intersects" if you see edge-cases
)

# Use the hex id from the hex grid as the final one
joined["hex_id"] = joined["hex_id_right"]

# (Optional) drop the clutter
cols_to_drop = ["hex_id_left", "hex_id_right", "index_right", "index_left"]
cols_to_drop = [c for c in cols_to_drop if c in joined.columns]
joined = joined.drop(columns=cols_to_drop)



# --- Visualization: check that hexes look like hexes ---

fig, ax = plt.subplots(figsize=(8, 8))

# plot hex outlines
hex_grid.boundary.plot(ax=ax, linewidth=0.5)

# plot listing points
gdf.plot(ax=ax, markersize=3, color="red", alpha=0.7)

ax.set_title("Listings and true hex grid")
ax.set_axis_off()
plt.show()

KeyError: 'hex_id_right'

In [138]:
joined.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,description_length,host_tenure_days,host_since_missing,host_about_missing,host_about_length,host_response_rate_missing,host_acceptance_rate_missing,geometry,index_right,hex_id
2,32379,https://www.airbnb.com/rooms/32379,20250627030604,2025-06-28,city scrape,"155 m2 artist flat on Vesterbro, with 2 bathrooms",You enter a narrow entrance and feel the good ...,"Værnedamsvej area is super hip area, we call i...",https://a0.muscache.com/pictures/miso/Hosting-...,140105,...,482,5504.0,0,0,180,0,0,POINT (723378.291 6175367.956),364,364
4,38499,https://www.airbnb.com/rooms/38499,20250627030604,2025-06-30,city scrape,0 min. from everything in Cph.,It doesn't get more central than this. Histori...,Area: <br />This is the eye of Cph. right betw...,https://a0.muscache.com/pictures/hosting/Hosti...,122489,...,412,5531.0,0,0,407,0,0,POINT (724601.53 6176729.917),474,474
5,39055,https://www.airbnb.com/rooms/39055,20250627030604,2025-06-29,city scrape,Stylish apartment in central Copenhagen,"Big, bright, airy and attractive apartment dec...",The flat is excellently located in Islands Bry...,https://a0.muscache.com/pictures/hosting/Hosti...,167511,...,75,5466.0,0,0,115,0,0,POINT (725348.592 6174625.515),316,316
7,69013,https://www.airbnb.com/rooms/69013,20250627030604,2025-07-01,previous scrape,Gåafstand til hele København,Welcome to our home with a view of the Lakes. ...,,https://a0.muscache.com/pictures/hosting/Hosti...,344223,...,558,5285.0,0,1,0,0,0,POINT (724405.65 6177427.001),500,500
8,69440,https://www.airbnb.com/rooms/69440,20250627030604,2025-07-01,previous scrape,Clean room in peacefull part of Cph,"Adorable apartment in peaceful, green and old-...",The green and old-fashion Frederiksberg is the...,https://a0.muscache.com/pictures/hosting/Hosti...,194944,...,515,5442.0,0,0,98,0,0,POINT (722167.638 6176676.657),442,442


### Mapping hexes

In [None]:
import folium

# Convert hexes and points to WGS84 (lat/lon)
hex_wgs = hex_grid.to_crs(epsg=4326)
pts_wgs = gdf.to_crs(epsg=4326)

center_lat = pts_wgs.geometry.y.mean()
center_lon = pts_wgs.geometry.x.mean()

m = folium.Map(location=[center_lat, center_lon], zoom_start=11)

folium.GeoJson(
    hex_wgs,
    name="Hex grid",
    style_function=lambda feature: {
        "fillColor": "none",
        "color": "blue",
        "weight": 1,
        "fillOpacity": 0.1,
    },
).add_to(m)

sample_pts = pts_wgs.sample(min(2000, len(pts_wgs)), random_state=0)

for _, row in sample_pts.iterrows():
    folium.CircleMarker(
        location=[row.geometry.y, row.geometry.x],
        radius=2,
        color="red",
        fill=True,
        fill_opacity=0.7,
    ).add_to(m)

m

# FEATURE SELECTION

In [208]:
# Fetch selected features and filter columns
features = feature_selection.loc[feature_selection['keep_2'] == True, "feature"].tolist()
features = features[0:-1]

In [351]:
df_filtered = df[features]

In [352]:
df_filtered.isna().sum()

Unnamed: 0,0
host_response_time,0
host_response_rate,0
host_acceptance_rate,0
host_is_superhost,0
host_listings_count,0
host_has_profile_pic,0
host_identity_verified,0
property_type,0
room_type,0
accommodates,0


In [319]:
df_filtered[df_filtered["host_has_profile_pic"].isna()].head(10)

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_has_profile_pic,host_identity_verified,property_type,room_type,accommodates,...,instant_bookable,calculated_host_listings_count,description_missing,description_length,host_tenure_days,host_since_missing,host_about_missing,host_about_length,host_response_rate_missing,host_acceptance_rate_missing
542,unknown,100.0,81.0,0,,,,Entire home,Entire home/apt,9,...,0,1,0,209,0.0,1,1,0,1,1
695,unknown,100.0,81.0,0,,,,Entire rental unit,Entire home/apt,4,...,0,2,0,275,0.0,1,1,0,1,1
902,unknown,100.0,81.0,1,,,,Entire condo,Entire home/apt,5,...,0,1,0,468,0.0,1,1,0,1,1
937,unknown,100.0,81.0,1,,,,Entire rental unit,Entire home/apt,5,...,0,1,0,183,0.0,1,1,0,1,1
1700,unknown,100.0,81.0,0,,,,Entire rental unit,Entire home/apt,2,...,0,1,0,536,0.0,1,1,0,1,1
1835,unknown,100.0,81.0,0,,,,Entire rental unit,Entire home/apt,4,...,0,1,0,503,0.0,1,1,0,1,1
1850,unknown,100.0,81.0,0,,,,Entire rental unit,Entire home/apt,4,...,0,1,0,320,0.0,1,1,0,1,1
1893,unknown,100.0,81.0,0,,,,Private room in condo,Private room,1,...,0,1,0,535,0.0,1,1,0,1,1
1914,unknown,100.0,81.0,1,,,,Entire rental unit,Entire home/apt,4,...,0,1,0,405,0.0,1,1,0,1,1
1925,unknown,100.0,81.0,0,,,,Entire rental unit,Entire home/apt,2,...,0,1,0,386,0.0,1,1,0,1,1


# Earlier code

In [None]:
features = [
    'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'room_type', 'property_type', 'neighbourhood_cleansed',
    'host_is_superhost', 'host_response_rate', 'host_acceptance_rate'
]

X = df[features].copy()
y = df['price']

In [None]:
X['host_response_rate'] = X['host_response_rate'].str.rstrip('%').astype(float)
X['host_acceptance_rate'] = X['host_acceptance_rate'].str.rstrip('%').astype(float)
X['host_is_superhost'] = X['host_is_superhost'].map({'t': 1, 'f': 0})

In [None]:
X = pd.get_dummies(X, columns=['room_type', 'property_type', 'neighbourhood_cleansed'], drop_first=True)


In [None]:
X = X.dropna(subset=['bathrooms', 'bedrooms', 'beds', 'host_is_superhost'])
X['host_response_rate'] = X['host_response_rate'].fillna(0)
X['host_acceptance_rate'] = X['host_acceptance_rate'].fillna(0)

In [None]:
y = df.loc[X.index, 'price']
print(X.shape, y.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

In [None]:
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Linear Regression R²:", r2)
print("Linear Regression RMSE:", rmse)

In [None]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [None]:
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print("Random Forest R²:", r2_rf)
print("Random Forest RMSE:", rmse_rf)

In [None]:
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importances.sort_values(ascending=False).head(10)