# Importing Libraries and loading Datasets

In [30]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from datetime import datetime
from collections import Counter
import ast
import os
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model


## Loadind and inspecting each data set

### A. listings.csv

In [32]:
# Load listings
listings = pd.read_csv('/content/drive/My Drive/Machine_ML/listings.csv')

# Clean and convert the 'price' column to float'
#listings['price'] = (listings['price'].str.replace(r'[\$,]', '', regex=True).astype(float))
print(listings.shape)
print(listings[['id', 'name', 'neighbourhood', 'room_type', 'price', 'latitude', 'longitude']].head())

(14187, 79)
      id                                           name    neighbourhood  \
0   3176                Fabulous Flat in great Location  Berlin, Germany   
1   9991            Geourgeous flat - outstanding views  Berlin, Germany   
2  14325            Studio Apartment in Prenzlauer Berg              NaN   
3  16644             In the Heart of Berlin - Kreuzberg  Berlin, Germany   
4  17904  Beautiful Kreuzberg studio - 3 months minimum  Berlin, Germany   

         room_type    price  latitude  longitude  
0  Entire home/apt  $105.00  52.53471   13.41810  
1  Entire home/apt  $135.00  52.53269   13.41805  
2  Entire home/apt   $75.00  52.54813   13.40366  
3  Entire home/apt      NaN  52.50312   13.43508  
4  Entire home/apt   $28.00  52.49419   13.42166  


In [33]:
listings

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,3176,https://www.airbnb.com/rooms/3176,20250620182343,2025-06-21,city scrape,Fabulous Flat in great Location,This beautiful first floor apartment is situa...,The neighbourhood is famous for its variety of...,https://a0.muscache.com/pictures/airflow/Hosti...,3718,...,4.70,4.92,4.61,,f,1,1,0,0,0.76
1,9991,https://www.airbnb.com/rooms/9991,20250620182343,2025-06-21,city scrape,Geourgeous flat - outstanding views,4 bedroom with very large windows and outstand...,Prenzlauer Berg is an amazing neighbourhood wh...,https://a0.muscache.com/pictures/42799131/59c8...,33852,...,5.00,4.86,4.86,03/Z/RA/003410-18,f,1,1,0,0,0.06
2,14325,https://www.airbnb.com/rooms/14325,20250620182343,2025-06-21,city scrape,Studio Apartment in Prenzlauer Berg,The apartment is located on the upper second f...,,https://a0.muscache.com/pictures/508703/24988a...,55531,...,4.85,4.60,4.45,,f,4,4,0,0,0.14
3,16644,https://www.airbnb.com/rooms/16644,20250620182343,2025-06-21,previous scrape,In the Heart of Berlin - Kreuzberg,Light and sunny 2-Room-turn of the century-fla...,Our Part of Kreuzberg is just the best. Good v...,https://a0.muscache.com/pictures/73759174/e2ef...,64696,...,4.86,4.67,4.71,,f,2,2,0,0,0.26
4,17904,https://www.airbnb.com/rooms/17904,20250620182343,2025-06-21,city scrape,Beautiful Kreuzberg studio - 3 months minimum,"- apt is available starting September 1, 2024<...","The apartment is located in Kreuzberg, which i...",https://a0.muscache.com/pictures/d9a6f8be-54b9...,68997,...,4.92,4.88,4.65,,f,1,1,0,0,1.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14182,1446408006383528333,https://www.airbnb.com/rooms/1446408006383528333,20250620182343,2025-06-20,city scrape,cozy room,It is a room with lots of light and you look o...,,https://a0.muscache.com/pictures/hosting/Hosti...,420514457,...,,,,08/Z/AZ/010492-21,f,2,0,2,0,
14183,1446861918532076953,https://www.airbnb.com/rooms/1446861918532076953,20250620182343,2025-06-20,city scrape,Beautiful 1-Bedroom Apartment in Wedding,This stunning furnished one bedroom apartment ...,"Wedding is a lively and diverse neighborhood, ...",https://a0.muscache.com/pictures/prohost-api/H...,595670462,...,,,,,f,69,69,0,0,
14184,1446862242596527946,https://www.airbnb.com/rooms/1446862242596527946,20250620182343,2025-06-20,city scrape,Stunning Studio in Wedding,This stunning furnished studio apartment featu...,"Wedding is a lively and diverse neighborhood, ...",https://a0.muscache.com/pictures/prohost-api/H...,595670462,...,,,,,f,69,69,0,0,
14185,1446993798027111825,https://www.airbnb.com/rooms/1446993798027111825,20250620182343,2025-06-20,city scrape,Gemütliches Boot mit Seeblick,"Spend unforgettable hours on our charming, rom...",,https://a0.muscache.com/pictures/miso/Hosting-...,569536089,...,,,,First name and Last name: Taya P. <br/> Contac...,t,1,1,0,0,


In [35]:
# Clean and convert the 'price' column to float
listings['price'] = (
    listings['price']
    .astype(str)  # convert to string first
    .str.replace(r'[\$,]', '', regex=True)  # remove $ and ,
    .astype(float)  # convert to float
)


### B. calendar.csv
### Filter only available dates and convert price to numeric

In [36]:
calendar = pd.read_csv('/content/drive/My Drive/Machine_ML/calendar.csv')
calendar['date'] = pd.to_datetime(calendar['date'])
# calendar = calendar[calendar['available'] == 't'].copy()
calendar['price'] = (
    calendar['price'].str.replace(r'[\$,]', '', regex=True).astype(float)
)
print(calendar.groupby('listing_id')['price'].mean().head())

listing_id
3176     105.0
9991     180.0
14325     75.0
16644     90.0
17904     25.0
Name: price, dtype: float64


In [37]:
calendar

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,3176,2024-12-21,f,105.0,,63,730
1,3176,2024-12-22,f,105.0,,63,730
2,3176,2024-12-23,f,105.0,,63,730
3,3176,2024-12-24,f,105.0,,63,730
4,3176,2024-12-25,f,105.0,,63,730
...,...,...,...,...,...,...,...
5103119,1315853067146320598,2025-12-16,t,103.0,,92,365
5103120,1315853067146320598,2025-12-17,t,103.0,,92,365
5103121,1315853067146320598,2025-12-18,t,103.0,,92,365
5103122,1315853067146320598,2025-12-19,t,103.0,,92,365


In [38]:
calendar.shape

(5103124, 7)

### C. reviews.csv
### Check:

### Columns: listing_id, date, comments

### Count number of reviews

### Step 4: Create Base Dataset
### Let’s merge the datasets into one base DataFrame, using id from listings and listing_id from calendar/reviews.

In [39]:
# Compute mean price per listing
price_df = calendar.groupby('listing_id')['price'].mean().reset_index()
price_df.columns = ['id', 'average_price']

# Merge with listings
base_df = pd.merge(listings, price_df, on='id', how='inner')
print(base_df[['id', 'room_type', 'neighbourhood', 'average_price']].head())



      id        room_type    neighbourhood  average_price
0   3176  Entire home/apt  Berlin, Germany          105.0
1   9991  Entire home/apt  Berlin, Germany          180.0
2  14325  Entire home/apt              NaN           75.0
3  16644  Entire home/apt  Berlin, Germany           90.0
4  17904  Entire home/apt  Berlin, Germany           25.0


In [40]:
base_df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,average_price
0,3176,https://www.airbnb.com/rooms/3176,20250620182343,2025-06-21,city scrape,Fabulous Flat in great Location,This beautiful first floor apartment is situa...,The neighbourhood is famous for its variety of...,https://a0.muscache.com/pictures/airflow/Hosti...,3718,...,4.92,4.61,,f,1,1,0,0,0.76,105.0
1,9991,https://www.airbnb.com/rooms/9991,20250620182343,2025-06-21,city scrape,Geourgeous flat - outstanding views,4 bedroom with very large windows and outstand...,Prenzlauer Berg is an amazing neighbourhood wh...,https://a0.muscache.com/pictures/42799131/59c8...,33852,...,4.86,4.86,03/Z/RA/003410-18,f,1,1,0,0,0.06,180.0
2,14325,https://www.airbnb.com/rooms/14325,20250620182343,2025-06-21,city scrape,Studio Apartment in Prenzlauer Berg,The apartment is located on the upper second f...,,https://a0.muscache.com/pictures/508703/24988a...,55531,...,4.6,4.45,,f,4,4,0,0,0.14,75.0
3,16644,https://www.airbnb.com/rooms/16644,20250620182343,2025-06-21,previous scrape,In the Heart of Berlin - Kreuzberg,Light and sunny 2-Room-turn of the century-fla...,Our Part of Kreuzberg is just the best. Good v...,https://a0.muscache.com/pictures/73759174/e2ef...,64696,...,4.67,4.71,,f,2,2,0,0,0.26,90.0
4,17904,https://www.airbnb.com/rooms/17904,20250620182343,2025-06-21,city scrape,Beautiful Kreuzberg studio - 3 months minimum,"- apt is available starting September 1, 2024<...","The apartment is located in Kreuzberg, which i...",https://a0.muscache.com/pictures/d9a6f8be-54b9...,68997,...,4.88,4.65,,f,1,1,0,0,1.6,25.0


In [41]:
base_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [42]:
keep_columns = [
    'id', 'price', 'accommodates',
    'bedrooms', 'bathrooms', 'number_of_reviews',
    'review_scores_rating', 'availability_365', 'minimum_nights',
    'host_is_superhost', 'instant_bookable',
    'latitude', 'longitude',
    'picture_url'
]

base_df = base_df[keep_columns]


In [43]:
base_df.columns

Index(['id', 'price', 'accommodates', 'bedrooms', 'bathrooms',
       'number_of_reviews', 'review_scores_rating', 'availability_365',
       'minimum_nights', 'host_is_superhost', 'instant_bookable', 'latitude',
       'longitude', 'picture_url'],
      dtype='object')

In [44]:
base_df['host_is_superhost'] = base_df['host_is_superhost'].map({'t': 1, 'f': 0})
base_df['instant_bookable'] = base_df['instant_bookable'].map({'t': 1, 'f': 0})


In [45]:
base_df.tail()

Unnamed: 0,id,price,accommodates,bedrooms,bathrooms,number_of_reviews,review_scores_rating,availability_365,minimum_nights,host_is_superhost,instant_bookable,latitude,longitude,picture_url
12272,1313899686620868892,150.0,2,1.0,1.0,0,,170,21,0.0,0,52.553352,13.411766,https://a0.muscache.com/pictures/hosting/Hosti...
12273,1314372197012793402,69.0,1,1.0,1.0,1,5.0,270,1,0.0,1,52.512834,13.426722,https://a0.muscache.com/pictures/hosting/Hosti...
12274,1314378755944554373,64.0,1,1.0,1.0,8,4.88,311,1,0.0,1,52.512795,13.426636,https://a0.muscache.com/pictures/hosting/Hosti...
12275,1315368682473897294,44.0,2,1.0,1.0,0,,279,92,0.0,1,52.514735,13.459341,https://a0.muscache.com/pictures/hosting/Hosti...
12276,1315853067146320598,155.0,7,3.0,1.0,0,,140,92,0.0,0,52.497378,13.324859,https://a0.muscache.com/pictures/prohost-api/H...


In [None]:
base_df.shape

(12912, 14)

In [48]:
print("Original listings:", listings.shape[0])
print("Listings with prices:", price_df.shape[0])
print("Merged dataset:", base_df.shape[0])


Original listings: 14187
Listings with prices: 13984
Merged dataset: 12277


In [49]:
print("Total columns:", base_df.shape[1])
print("\nSample of final columns:")
print(base_df.columns[:20])  # show first 20

# Optional: Print how many columns came from each file manually
calendar_cols = ['target_price']  # or however you named it
review_cols = ['review_count', 'avg_sentiment']  # if you extracted these
listing_cols = [col for col in base_df.columns if col not in calendar_cols + review_cols]

print(f"\nColumns from listings: {len(listing_cols)}")
print(f"Columns from calendar (target): {len(calendar_cols)}")
print(f"Columns from reviews: {len(review_cols)}")


Total columns: 14

Sample of final columns:
Index(['id', 'price', 'accommodates', 'bedrooms', 'bathrooms',
       'number_of_reviews', 'review_scores_rating', 'availability_365',
       'minimum_nights', 'host_is_superhost', 'instant_bookable', 'latitude',
       'longitude', 'picture_url'],
      dtype='object')

Columns from listings: 14
Columns from calendar (target): 1
Columns from reviews: 2


## Tabular Features

In [50]:
# Select only tabular columns
tabular_columns = [
    'price', 'accommodates', 'bedrooms', 'bathrooms',
    'number_of_reviews', 'review_scores_rating',
    'availability_365', 'minimum_nights',
    'host_is_superhost', 'instant_bookable'
]

tabular_df = base_df[tabular_columns].copy()

# Optional: Handle missing values (simple approach: fill with median)
tabular_df.fillna(tabular_df.median(numeric_only=True), inplace=True)


In [51]:
tabular_df.shape

(12277, 10)

## Spatial features

In [52]:
# City center of Berlin (Mitte)
berlin_center = (52.5200, 13.4050)

# Extract coordinates
spatial_df = base_df[['latitude', 'longitude']].copy()

# Compute distance to city center
spatial_df['dist_to_center_km'] = spatial_df.apply(
    lambda row: geodesic((row['latitude'], row['longitude']), berlin_center).km,
    axis=1
)


In [53]:
spatial_df.head()

Unnamed: 0,latitude,longitude,dist_to_center_km
0,52.53471,13.4181,1.862755
1,52.53269,13.41805,1.666888
2,52.54813,13.40366,3.131556
3,52.50312,13.43508,2.774675
4,52.49419,13.42166,3.086801
