In [295]:
# Import necessary libraries

import pandas as pd
import geopandas as gpd
import zipfile
import numpy as np
# Import the regular expressions module to work with text pattern matching and manipulation
import re


In [296]:
# Create a data frame from listings.csv.zip and neighbourhoods.geojson

with zipfile.ZipFile("../sources/listings.csv.zip", 'r') as zip_ref:
    df = pd.read_csv(zip_ref.open('listings.csv'))

gdf = gpd.read_file("../sources/neighbourhoods.geojson")

In [297]:
# Check df

df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,3176,https://www.airbnb.com/rooms/3176,20250620182343,2025-06-21,city scrape,Fabulous Flat in great Location,This beautiful first floor apartment is situa...,The neighbourhood is famous for its variety of...,https://a0.muscache.com/pictures/airflow/Hosti...,3718,...,4.7,4.92,4.61,,f,1,1,0,0,0.76
1,9991,https://www.airbnb.com/rooms/9991,20250620182343,2025-06-21,city scrape,Geourgeous flat - outstanding views,4 bedroom with very large windows and outstand...,Prenzlauer Berg is an amazing neighbourhood wh...,https://a0.muscache.com/pictures/42799131/59c8...,33852,...,5.0,4.86,4.86,03/Z/RA/003410-18,f,1,1,0,0,0.06
2,14325,https://www.airbnb.com/rooms/14325,20250620182343,2025-06-21,city scrape,Studio Apartment in Prenzlauer Berg,The apartment is located on the upper second f...,,https://a0.muscache.com/pictures/508703/24988a...,55531,...,4.85,4.6,4.45,,f,4,4,0,0,0.14
3,16644,https://www.airbnb.com/rooms/16644,20250620182343,2025-06-21,previous scrape,In the Heart of Berlin - Kreuzberg,Light and sunny 2-Room-turn of the century-fla...,Our Part of Kreuzberg is just the best. Good v...,https://a0.muscache.com/pictures/73759174/e2ef...,64696,...,4.86,4.67,4.71,,f,2,2,0,0,0.26
4,17904,https://www.airbnb.com/rooms/17904,20250620182343,2025-06-21,city scrape,Beautiful Kreuzberg studio - 3 months minimum,"- apt is available starting September 1, 2024<...","The apartment is located in Kreuzberg, which i...",https://a0.muscache.com/pictures/d9a6f8be-54b9...,68997,...,4.92,4.88,4.65,,f,1,1,0,0,1.6


In [298]:
# Check columns

df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [299]:

# Dropping unnecessary columns that are either metadata, URLs, images, verbose text, or too granular for analysis

columns_to_drop = [
    'listing_url',
    'scrape_id',
    'last_scraped',
    'source',
    'name',
    'description',
    'neighborhood_overview',
    'picture_url',
    'host_url',
    'host_name',
    'host_since',
    'host_location',
    'host_about',
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'host_thumbnail_url',
    'host_picture_url',
    'host_neighbourhood',
    'host_verifications',
    'host_has_profile_pic',
    'host_identity_verified',
    'host_location',
    'neighbourhood',            # Has only 'Berlin, Germany', not actual neighborhood
    'bathrooms',                # Keep `bathrooms_text` instead, because `bathrooms` has many missing values and `bathrooms_text` not, plus it has info if a bathroom is shared or not
    'calendar_updated',
    'calendar_last_scraped',
    'availability_eoy',  
    'number_of_reviews_ly', 
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    'estimated_occupancy_l365d',
    'estimated_revenue_l365d',
    'first_review',
    'last_review',
    'license',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'has_availability',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    'instant_bookable',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms'
]

In [300]:
# Apply the column drop to the DataFrame

df.drop(columns=columns_to_drop, inplace=True)

In [301]:
# Check df

df.head()

Unnamed: 0,id,host_id,host_listings_count,host_total_listings_count,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,...,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,3176,3718,1.0,1.0,Prenzlauer Berg Südwest,Pankow,52.53471,13.4181,Entire rental unit,Entire home/apt,...,730,149,4.63,4.67,4.52,4.65,4.7,4.92,4.61,0.76
1,9991,33852,1.0,1.0,Prenzlauer Berg Südwest,Pankow,52.53269,13.41805,Entire rental unit,Entire home/apt,...,14,7,5.0,5.0,5.0,5.0,5.0,4.86,4.86,0.06
2,14325,55531,4.0,5.0,Prenzlauer Berg Nordwest,Pankow,52.54813,13.40366,Entire rental unit,Entire home/apt,...,1125,26,4.68,5.0,4.85,4.7,4.85,4.6,4.45,0.14
3,16644,64696,4.0,4.0,nördliche Luisenstadt,Friedrichshain-Kreuzberg,52.50312,13.43508,Entire condo,Entire home/apt,...,365,48,4.72,4.86,4.86,4.93,4.86,4.67,4.71,0.26
4,17904,68997,2.0,5.0,Reuterstraße,Neukölln,52.49419,13.42166,Entire rental unit,Entire home/apt,...,365,298,4.77,4.82,4.71,4.89,4.92,4.88,4.65,1.6


In [302]:
# Check df shape

df.shape


(14187, 27)

In [303]:
# Check df columns

df.columns

Index(['id', 'host_id', 'host_listings_count', 'host_total_listings_count',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'number_of_reviews',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'reviews_per_month'],
      dtype='object')

In [304]:
# Check the number of null values in each column

df.isnull().sum()


id                                 0
host_id                            0
host_listings_count               13
host_total_listings_count         13
neighbourhood_cleansed             0
neighbourhood_group_cleansed       0
latitude                           0
longitude                          0
property_type                      0
room_type                          0
accommodates                       0
bathrooms_text                     7
bedrooms                        2023
beds                            5003
amenities                          0
price                           5004
minimum_nights                     0
maximum_nights                     0
number_of_reviews                  0
review_scores_rating            3349
review_scores_accuracy          3351
review_scores_cleanliness       3349
review_scores_checkin           3352
review_scores_communication     3350
review_scores_location          3352
review_scores_value             3354
reviews_per_month               3349
d

In [305]:
# Get an overview of the dataset

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14187 entries, 0 to 14186
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            14187 non-null  int64  
 1   host_id                       14187 non-null  int64  
 2   host_listings_count           14174 non-null  float64
 3   host_total_listings_count     14174 non-null  float64
 4   neighbourhood_cleansed        14187 non-null  object 
 5   neighbourhood_group_cleansed  14187 non-null  object 
 6   latitude                      14187 non-null  float64
 7   longitude                     14187 non-null  float64
 8   property_type                 14187 non-null  object 
 9   room_type                     14187 non-null  object 
 10  accommodates                  14187 non-null  int64  
 11  bathrooms_text                14180 non-null  object 
 12  bedrooms                      12164 non-null  float64
 13  b

In [306]:
# Remove $ and commas in `price` column, then convert to float

df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float).round(2)


In [307]:
# Check data types of each column

df.dtypes


id                                int64
host_id                           int64
host_listings_count             float64
host_total_listings_count       float64
neighbourhood_cleansed           object
neighbourhood_group_cleansed     object
latitude                        float64
longitude                       float64
property_type                    object
room_type                        object
accommodates                      int64
bathrooms_text                   object
bedrooms                        float64
beds                            float64
amenities                        object
price                           float64
minimum_nights                    int64
maximum_nights                    int64
number_of_reviews                 int64
review_scores_rating            float64
review_scores_accuracy          float64
review_scores_cleanliness       float64
review_scores_checkin           float64
review_scores_communication     float64
review_scores_location          float64


In [308]:
# Change float type to int

cols_to_int = [
    'host_listings_count',
    'host_total_listings_count',
    'bedrooms',
    'beds'
]

df[cols_to_int] = df[cols_to_int].astype('Int64')

In [309]:
# object columns to clean

cat_cols = df.select_dtypes(include='object').columns
print(cat_cols)


Index(['neighbourhood_cleansed', 'neighbourhood_group_cleansed',
       'property_type', 'room_type', 'bathrooms_text', 'amenities'],
      dtype='object')


In [310]:
# Check for leading/trailing spaces and inconsistent capitalization

for col in cat_cols:
    # Count entries with leading/trailing spaces
    n_spaces = (df[col].str.strip() != df[col]).sum()
    
    # Check capitalization inconsistency by comparing unique counts
    unique_lower = df[col].str.lower().nunique()
    unique_original = df[col].nunique()
    
    if n_spaces > 0:
        print(f"Column '{col}' has {n_spaces} entries with leading/trailing spaces")
    if unique_lower != unique_original:
        print(f"Column '{col}' has inconsistent capitalization")


Column 'bathrooms_text' has 7 entries with leading/trailing spaces


In [311]:
for col in cat_cols:
    df[col] = df[col].astype(str).str.strip()   # Remove leading/trailing spaces

In [312]:
# Change object type to string

# Select columns with object type

object_cols = df.select_dtypes(include=['object']).columns.tolist()

# Change to string

df[object_cols] = df[object_cols].astype('string')

In [313]:
df['neighbourhood_group_cleansed'].unique()


<StringArray>
[                  'Pankow', 'Friedrichshain-Kreuzberg',
                 'Neukölln',                    'Mitte',
     'Charlottenburg-Wilm.',   'Tempelhof - Schöneberg',
              'Lichtenberg',       'Treptow - Köpenick',
    'Steglitz - Zehlendorf',                  'Spandau',
            'Reinickendorf',    'Marzahn - Hellersdorf']
Length: 12, dtype: string

In [314]:
# Define your mapping (match keys exactly as they appear in your data)
name_map = {
    'Mitte': 'Mitte',
    'Friedrichshain-Kreuzberg': 'Friedrichshain-Kreuzberg',
    'Pankow': 'Pankow',
    'Charlottenburg-Wilm.': 'Charlottenburg-Wilmersdorf',
    'Spandau': 'Spandau',
    'Steglitz - Zehlendorf': 'Steglitz-Zehlendorf',
    'Tempelhof - Schöneberg': 'Tempelhof-Schöneberg',
    'Neukölln': 'Neukölln',
    'Treptow - Köpenick': 'Treptow-Köpenick',
    'Marzahn - Hellersdorf': 'Marzahn-Hellersdorf',
    'Lichtenberg': 'Lichtenberg',
    'Reinickendorf': 'Reinickendorf'
}

In [315]:
# Apply the mapping
df['neighbourhood_group_cleansed'] = df['neighbourhood_group_cleansed'].replace(name_map)

In [316]:
print(df['neighbourhood_group_cleansed'].unique())

<StringArray>
[                    'Pankow',   'Friedrichshain-Kreuzberg',
                   'Neukölln',                      'Mitte',
 'Charlottenburg-Wilmersdorf',       'Tempelhof-Schöneberg',
                'Lichtenberg',           'Treptow-Köpenick',
        'Steglitz-Zehlendorf',                    'Spandau',
              'Reinickendorf',        'Marzahn-Hellersdorf']
Length: 12, dtype: string


In [317]:
# Rename the column 'neighbourhood_group_cleansed' to 'district' for better readability and consistency

df.rename(columns={'neighbourhood_group_cleansed': 'district'}, inplace=True)

In [318]:
# Rename the column 'neighbourhood_cleansed' to 'neighborhood' for better readability and consistency

df.rename(columns={'neighbourhood_cleansed': 'neighborhood'}, inplace=True)

In [319]:
df.head()

Unnamed: 0,id,host_id,host_listings_count,host_total_listings_count,neighborhood,district,latitude,longitude,property_type,room_type,...,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,3176,3718,1,1,Prenzlauer Berg Südwest,Pankow,52.53471,13.4181,Entire rental unit,Entire home/apt,...,730,149,4.63,4.67,4.52,4.65,4.7,4.92,4.61,0.76
1,9991,33852,1,1,Prenzlauer Berg Südwest,Pankow,52.53269,13.41805,Entire rental unit,Entire home/apt,...,14,7,5.0,5.0,5.0,5.0,5.0,4.86,4.86,0.06
2,14325,55531,4,5,Prenzlauer Berg Nordwest,Pankow,52.54813,13.40366,Entire rental unit,Entire home/apt,...,1125,26,4.68,5.0,4.85,4.7,4.85,4.6,4.45,0.14
3,16644,64696,4,4,nördliche Luisenstadt,Friedrichshain-Kreuzberg,52.50312,13.43508,Entire condo,Entire home/apt,...,365,48,4.72,4.86,4.86,4.93,4.86,4.67,4.71,0.26
4,17904,68997,2,5,Reuterstraße,Neukölln,52.49419,13.42166,Entire rental unit,Entire home/apt,...,365,298,4.77,4.82,4.71,4.89,4.92,4.88,4.65,1.6


In [320]:
# Count duplicates

num_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 0


In [321]:
if df['id'].is_unique:
    print("ID column is unique")
else:
    print("ID column has duplicates")


ID column is unique


In [322]:
# Separate `bathrooms_text` into `bathrooms` and `is_shared`
 
# Extract the number (float), preserving NaNs

df['bathrooms'] = df['bathrooms_text'].str.extract(r'(\d+\.?\d*)')
df['bathrooms'] = df['bathrooms'].astype(float)

# Determine if it's a shared bathroom

df['is_shared'] = df['bathrooms_text'].apply(
    lambda x: 1 if isinstance(x, str) and 'shared' in x.lower() else (0 if isinstance(x, str) else pd.NA)
).astype('Int64')

In [323]:
# Drop `bathrooms_text` column

df = df.drop(columns=['bathrooms_text'])

In [324]:
# Change gdf neighbourhood column to neighborhood for joining two dfs

gdf = gdf.rename(columns={'neighbourhood': 'neighborhood'})

In [325]:
# Join df with gdf

gdf_subset = gdf[['neighborhood', 'geometry']]
df = df.merge(gdf_subset, on='neighborhood', how='left')

In [326]:
df.head()

Unnamed: 0,id,host_id,host_listings_count,host_total_listings_count,neighborhood,district,latitude,longitude,property_type,room_type,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,bathrooms,is_shared,geometry
0,3176,3718,1,1,Prenzlauer Berg Südwest,Pankow,52.53471,13.4181,Entire rental unit,Entire home/apt,...,4.67,4.52,4.65,4.7,4.92,4.61,0.76,1.0,0,"MULTIPOLYGON (((13.41253 52.54089, 13.41409 52..."
1,9991,33852,1,1,Prenzlauer Berg Südwest,Pankow,52.53269,13.41805,Entire rental unit,Entire home/apt,...,5.0,5.0,5.0,5.0,4.86,4.86,0.06,2.5,0,"MULTIPOLYGON (((13.41253 52.54089, 13.41409 52..."
2,14325,55531,4,5,Prenzlauer Berg Nordwest,Pankow,52.54813,13.40366,Entire rental unit,Entire home/apt,...,5.0,4.85,4.7,4.85,4.6,4.45,0.14,1.0,0,"MULTIPOLYGON (((13.40354 52.5402, 13.40339 52...."
3,16644,64696,4,4,nördliche Luisenstadt,Friedrichshain-Kreuzberg,52.50312,13.43508,Entire condo,Entire home/apt,...,4.86,4.86,4.93,4.86,4.67,4.71,0.26,1.0,0,"MULTIPOLYGON (((13.4443 52.50066, 13.44266 52...."
4,17904,68997,2,5,Reuterstraße,Neukölln,52.49419,13.42166,Entire rental unit,Entire home/apt,...,4.82,4.71,4.89,4.92,4.88,4.65,1.6,1.0,0,"MULTIPOLYGON (((13.43515 52.48076, 13.43492 52..."


In [327]:
# Save to csv (if needed):

# df.to_csv('listings_clean.csv')

In [329]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14187 entries, 0 to 14186
Data columns (total 29 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   id                           14187 non-null  int64   
 1   host_id                      14187 non-null  int64   
 2   host_listings_count          14174 non-null  Int64   
 3   host_total_listings_count    14174 non-null  Int64   
 4   neighborhood                 14187 non-null  object  
 5   district                     14187 non-null  string  
 6   latitude                     14187 non-null  float64 
 7   longitude                    14187 non-null  float64 
 8   property_type                14187 non-null  string  
 9   room_type                    14187 non-null  string  
 10  accommodates                 14187 non-null  int64   
 11  bedrooms                     12164 non-null  Int64   
 12  beds                         9184 non-null   Int64   
 13  a