# Importing Libraries and loading Datasets

In [1]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from datetime import datetime
from collections import Counter
import ast
import os
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model

### Loading and inspecting each data set
#### A. listings.csv

In [3]:
# Read the raw listings table from disk.
listings = pd.read_csv('listings.csv')

# Sanity check: table dimensions first, then a preview of the identifying,
# pricing and location columns. No cleaning is applied in this cell, so
# 'price' is shown exactly as it was read from the CSV.
print(listings.shape)
print(
    listings.loc[
        :,
        ['id', 'name', 'neighbourhood', 'room_type', 'price', 'latitude', 'longitude'],
    ].head(5)
)

(13945, 79)
      id                                           name    neighbourhood  \
0   3176                Fabulous Flat in great Location  Berlin, Germany   
1   9991            Geourgeous flat - outstanding views  Berlin, Germany   
2  14325            Studio Apartment in Prenzlauer Berg              NaN   
3  16644             In the Heart of Berlin - Kreuzberg  Berlin, Germany   
4  17904  Beautiful Kreuzberg studio - 3 months minimum  Berlin, Germany   

         room_type    price  latitude  longitude  
0  Entire home/apt  $105.00  52.53471   13.41810  
1  Entire home/apt  $135.00  52.53269   13.41805  
2  Entire home/apt   $75.00  52.54813   13.40366  
3  Entire home/apt   $77.00  52.50312   13.43508  
4  Entire home/apt   $40.00  52.49419   13.42166  


In [4]:
# Turn the 'price' column into floats in three explicit steps:
#   1. cast every value to text so the string accessor can be used,
#   2. drop the currency symbol and comma separators,
#   3. parse what is left as a float.
price_text = listings['price'].astype(str)
price_digits = price_text.str.replace(r'[\$,]', '', regex=True)
listings['price'] = price_digits.astype(float)


#### B. calendar.csv
#### Parse dates and convert price to numeric (the availability filter is currently disabled, so all calendar rows are kept)

In [6]:
# Read the per-date calendar table.
calendar = pd.read_csv('calendar.csv')

# NOTE: rows are not filtered on the 'available' flag here; every calendar
# row contributes to the statistics computed below.

# Strip the currency symbol and comma separators, then parse prices as floats.
price_as_text = calendar['price'].str.replace(r'[\$,]', '', regex=True)
calendar['price'] = price_as_text.astype(float)

# Parse the 'date' column into pandas datetimes.
calendar['date'] = pd.to_datetime(calendar['date'])

# Preview the mean calendar price of the first few listings.
mean_price_by_listing = calendar.groupby('listing_id')['price'].mean()
print(mean_price_by_listing.head())

listing_id
3176     105.0
9991     180.0
14325     75.0
16644     90.0
17904     25.0
Name: price, dtype: float64


In [8]:
# Dimensions of the calendar table as (rows, columns).
calendar.shape

(5103124, 7)

#### Create Base Dataset
#### Let’s merge the datasets into one base DataFrame, using id from listings and listing_id from calendar.

In [10]:
# Average the calendar price of each listing. The result is a two-column
# frame keyed by 'id' so that it lines up with the listings table.
price_df = (
    calendar.groupby('listing_id')['price']
    .mean()
    .rename('average_price')
    .rename_axis('id')
    .reset_index()
)

# Inner join: only ids present in both listings and price_df are kept.
base_df = listings.merge(price_df, how='inner', on='id')
print(base_df[['id', 'room_type', 'neighbourhood', 'average_price', 'price']].head())



      id        room_type    neighbourhood  average_price  price
0   3176  Entire home/apt  Berlin, Germany          105.0  105.0
1   9991  Entire home/apt  Berlin, Germany          180.0  135.0
2  14325  Entire home/apt              NaN           75.0   75.0
3  16644  Entire home/apt  Berlin, Germany           90.0   77.0
4  17904  Entire home/apt  Berlin, Germany           25.0   40.0


In [11]:
# Preview the first rows of the merged listings / average-price table.
base_df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,average_price
0,3176,https://www.airbnb.com/rooms/3176,20250315152850,2025-03-16,city scrape,Fabulous Flat in great Location,This beautiful first floor apartment is situa...,The neighbourhood is famous for its variety of...,https://a0.muscache.com/pictures/airflow/Hosti...,3718,...,4.92,4.61,First name and Last name: Nicolas Krotz <br/> ...,f,1,1,0,0,0.78,105.0
1,9991,https://www.airbnb.com/rooms/9991,20250315152850,2025-03-16,city scrape,Geourgeous flat - outstanding views,4 bedroom with very large windows and outstand...,Prenzlauer Berg is an amazing neighbourhood wh...,https://a0.muscache.com/pictures/42799131/59c8...,33852,...,4.86,4.86,03/Z/RA/003410-18,f,1,1,0,0,0.06,180.0
2,14325,https://www.airbnb.com/rooms/14325,20250315152850,2025-03-16,city scrape,Studio Apartment in Prenzlauer Berg,The apartment is located on the upper second f...,,https://a0.muscache.com/pictures/508703/24988a...,55531,...,4.6,4.45,,f,4,4,0,0,0.15,75.0
3,16644,https://www.airbnb.com/rooms/16644,20250315152850,2025-03-16,city scrape,In the Heart of Berlin - Kreuzberg,Light and sunny 2-Room-turn of the century-fla...,Our Part of Kreuzberg is just the best. Good v...,https://a0.muscache.com/pictures/73759174/e2ef...,64696,...,4.67,4.71,,f,2,2,0,0,0.27,90.0
4,17904,https://www.airbnb.com/rooms/17904,20250315152850,2025-03-16,city scrape,Beautiful Kreuzberg studio - 3 months minimum,"- apt is available starting September 1, 2024<...","The apartment is located in Kreuzberg, which i...",https://a0.muscache.com/pictures/d9a6f8be-54b9...,68997,...,4.87,4.65,,f,1,1,0,0,1.63,25.0


In [13]:
# Dimensions of the merged table as (rows, columns).
base_df.shape

(12912, 80)

In [None]:
# Amenities that get their own binary indicator column.
# The names are matched verbatim against the parsed amenity strings, so
# spelling and capitalisation matter. The order of this list also determines
# the order in which the indicator columns are created.
top_amenities_list = [
    "Kitchen",
    "Wifi",
    "Hair dryer",
    "Smoke alarm",
    "Hot water",
    "Cooking basics",
    "Refrigerator",
    "Iron",
    "Bed linens",
    "Washer",
    "Dishes and silverware",
    "Dedicated workspace",
    "Heating",
    "Hot water kettle",
    "Dishwasher",
    "TV",
    "Oven",
]


In [16]:
def extract_amenities(amenities_str):
    """Parse a raw ``amenities`` cell into a list of amenity names.

    Two encodings are understood:

    * a JSON array such as ``'["Kitchen", "Hair dryer"]'``. It is decoded
      with ``json.loads`` so the surrounding brackets, commas inside an
      amenity name, and escape sequences are handled correctly;
    * the brace-delimited form such as ``'{Kitchen,"Hair dryer"}'``. It is
      split on commas, and quotes and whitespace are trimmed from each item.

    Args:
        amenities_str: Raw cell value, either a string or a missing value
            (``NaN`` / ``None``).

    Returns:
        list: Amenity names in their original order. Missing, blank or empty
        (``'[]'`` / ``'{}'``) input yields an empty list; empty items are
        dropped.
    """
    # Local import keeps this cell runnable without touching the imports cell.
    import json

    if pd.isna(amenities_str):
        return []

    text = amenities_str.strip()
    if not text:
        return []

    if text.startswith('['):
        try:
            parsed = json.loads(text)
        except ValueError:
            # Not valid JSON after all; fall back to the delimiter-based parse.
            parsed = None
        if isinstance(parsed, list):
            names = (str(item).strip() for item in parsed)
            return [name for name in names if name]

    # Brace-delimited (or malformed bracketed) input: strip the outer
    # delimiters, split on commas, then trim quotes and whitespace per item.
    inner = text.strip('{}[]')
    names = (piece.strip().strip('"').strip("'") for piece in inner.split(','))
    return [name for name in names if name]

In [17]:
# Create one binary indicator column per entry of top_amenities_list:
# 1 when the listing's amenities contain that entry, 0 otherwise.
#
# Each listing's raw amenities string is parsed exactly once into a set.
# The loop then only does set-membership checks instead of re-parsing the
# same string for every (listing, amenity) pair.
amenity_sets = base_df['amenities'].apply(lambda raw: set(extract_amenities(raw)))

for amenity in top_amenities_list:
    # e.g. "Hair dryer" -> "amenity_hair_dryer"
    col_name = f'amenity_{amenity.replace(" ", "_").lower()}'
    base_df[col_name] = amenity_sets.apply(lambda present: int(amenity in present))


In [18]:
# Columns carried forward into the feature-building steps.
keep_columns = [
    # Identifier, price and listing attributes
    'id', 'price', 'accommodates',
    'bedrooms', 'bathrooms', 'number_of_reviews',
    'review_scores_rating', 'availability_365', 'minimum_nights',
    'host_is_superhost', 'instant_bookable',
    'latitude', 'longitude', 'picture_url',

    # Amenity binary features
    'amenity_kitchen', 'amenity_wifi',
    'amenity_hair_dryer', 'amenity_smoke_alarm', 'amenity_hot_water',
    'amenity_cooking_basics', 'amenity_refrigerator', 'amenity_iron',
    'amenity_bed_linens', 'amenity_washer', 'amenity_dishes_and_silverware',
    'amenity_dedicated_workspace', 'amenity_heating', 'amenity_hot_water_kettle',
    'amenity_dishwasher', 'amenity_tv', 'amenity_oven',
]

# Take an explicit copy: selecting a subset of columns yields an object that
# pandas tracks as derived from the wider frame, and assigning to its columns
# afterwards can raise SettingWithCopyWarning. The copy makes base_df an
# independent frame that is safe to modify.
base_df = base_df[keep_columns].copy()


In [19]:
# Column labels currently present in base_df.
base_df.columns

Index(['id', 'price', 'accommodates', 'bedrooms', 'bathrooms',
       'number_of_reviews', 'review_scores_rating', 'availability_365',
       'minimum_nights', 'host_is_superhost', 'instant_bookable', 'latitude',
       'longitude', 'picture_url', 'amenity_kitchen', 'amenity_wifi',
       'amenity_hair_dryer', 'amenity_smoke_alarm', 'amenity_hot_water',
       'amenity_cooking_basics', 'amenity_refrigerator', 'amenity_iron',
       'amenity_bed_linens', 'amenity_washer', 'amenity_dishes_and_silverware',
       'amenity_dedicated_workspace', 'amenity_heating',
       'amenity_hot_water_kettle', 'amenity_dishwasher', 'amenity_tv',
       'amenity_oven'],
      dtype='object')

In [22]:
# Convert 'instant_bookable' to binary: 1 for 't' and 0 for 'f'.
# Series.map turns every value that is not a key of the mapping into NaN.
# With only 't'/'f' as keys, running this cell a second time would therefore
# wipe the already-converted column. Mapping 1 and 0 to themselves makes the
# cell idempotent; missing values stay missing either way.
base_df['instant_bookable'] = base_df['instant_bookable'].map(
    {'t': 1, 'f': 0, 1: 1, 0: 0}
)


In [23]:
# Preview the first rows of base_df in its current state.
base_df.head()

Unnamed: 0,id,price,accommodates,bedrooms,bathrooms,number_of_reviews,review_scores_rating,availability_365,minimum_nights,host_is_superhost,...,amenity_iron,amenity_bed_linens,amenity_washer,amenity_dishes_and_silverware,amenity_dedicated_workspace,amenity_heating,amenity_hot_water_kettle,amenity_dishwasher,amenity_tv,amenity_oven
0,3176,105.0,2,1.0,1.0,149,4.63,286,63,f,...,1,1,1,1,0,1,1,0,1,1
1,9991,135.0,7,4.0,2.5,7,5.0,109,6,f,...,1,1,1,1,0,1,0,1,0,1
2,14325,75.0,1,0.0,1.0,26,4.68,165,150,t,...,0,0,1,1,0,0,0,0,1,0
3,16644,77.0,4,1.0,1.0,48,4.72,196,93,f,...,1,0,1,0,0,0,0,0,1,0
4,17904,40.0,2,0.0,1.0,299,4.77,29,92,f,...,1,0,0,0,0,1,0,0,1,0


In [24]:
# Dimensions of base_df as (rows, columns).
base_df.shape

(12912, 31)

## Tabular Features

In [28]:
# Columns that make up the tabular feature table: listing attributes first,
# followed by the binary amenity indicators.
tabular_columns = [
    'price', 'accommodates', 'bedrooms', 'bathrooms',
    'number_of_reviews', 'review_scores_rating',
    'availability_365', 'minimum_nights', 'instant_bookable',
] + [
    'amenity_kitchen', 'amenity_wifi', 'amenity_hair_dryer',
    'amenity_smoke_alarm', 'amenity_hot_water', 'amenity_cooking_basics',
    'amenity_refrigerator', 'amenity_iron', 'amenity_bed_linens',
    'amenity_washer', 'amenity_dishes_and_silverware',
    'amenity_dedicated_workspace', 'amenity_heating',
    'amenity_hot_water_kettle', 'amenity_dishwasher', 'amenity_tv',
    'amenity_oven',
]

# Work on an independent copy so base_df itself is left untouched.
tabular_df = base_df.loc[:, tabular_columns].copy()

# Simple imputation: replace missing values in each numeric column with
# that column's median.
tabular_df = tabular_df.fillna(tabular_df.median(numeric_only=True))


In [29]:
# Dimensions of the tabular feature table as (rows, columns).
tabular_df.shape

(12912, 26)

## Spatial features

In [30]:
# Reference point used as the Berlin city centre (Mitte), as (latitude, longitude).
berlin_center = (52.5200, 13.4050)

# Independent copy holding just the coordinates of every listing.
spatial_df = base_df.loc[:, ['latitude', 'longitude']].copy()

# Geodesic distance in kilometres from each listing to the reference point,
# computed pair by pair over the coordinate columns.
spatial_df['dist_to_center_km'] = [
    geodesic((lat, lon), berlin_center).km
    for lat, lon in zip(spatial_df['latitude'], spatial_df['longitude'])
]


In [31]:
# Preview the coordinates together with the computed distance column.
spatial_df.head()

Unnamed: 0,latitude,longitude,dist_to_center_km
0,52.53471,13.4181,1.862755
1,52.53269,13.41805,1.666888
2,52.54813,13.40366,3.131556
3,52.50312,13.43508,2.774675
4,52.49419,13.42166,3.086801
