# Initiate data

### Load the data of the different cities:

In [54]:
# Common libraries imports
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns

In [55]:
# Read the Athens listings CSV and tag every row with its city, so the origin
# of each listing is still known once the city frames are concatenated.
Athens = pd.read_csv('CityData/listingsAthens.csv')
Athens.insert(loc=0, column='City', value='Athens')  # becomes the first column
print(Athens.shape)  # (rows, columns), counting the added 'City' column

(13182, 76)


In [56]:
# Read the Lisbon listings CSV and tag every row with its city, so the origin
# of each listing is still known once the city frames are concatenated.
Lisbon = pd.read_csv('CityData/listingsLisbon.csv')
Lisbon.insert(loc=0, column='City', value='Lisbon')  # becomes the first column
print(Lisbon.shape)  # (rows, columns), counting the added 'City' column

(22751, 76)


In [57]:
# Read the London listings CSV and tag every row with its city, so the origin
# of each listing is still known once the city frames are concatenated.
London = pd.read_csv('CityData/listingsLondon.csv')
London.insert(loc=0, column='City', value='London')  # becomes the first column
print(London.shape)  # (rows, columns), counting the added 'City' column

(91778, 76)


In [58]:
# Read the Prague listings CSV and tag every row with its city, so the origin
# of each listing is still known once the city frames are concatenated.
Prague = pd.read_csv('CityData/listingsPrague.csv')
Prague.insert(loc=0, column='City', value='Prague')  # becomes the first column
print(Prague.shape)  # (rows, columns), counting the added 'City' column

(9388, 76)


### Merge the city data into one dataframe:

In [59]:
# Stack the four city frames row-wise into a single frame. ignore_index=True
# discards the per-city row labels and renumbers the rows from 0.
merged = pd.concat(
    [Athens, Lisbon, London, Prague],
    ignore_index=True,
)
merged

Unnamed: 0,City,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,Athens,40042598,https://www.airbnb.com/rooms/40042598,20231225075512,2023-12-26,previous scrape,Rental unit in Athina · 1 bedroom · 1 bed · 1 ...,,,https://a0.muscache.com/pictures/fb6d0257-52ec...,...,,,,,f,1,0,1,0,
1,Athens,39069205,https://www.airbnb.com/rooms/39069205,20231225075512,2023-12-26,previous scrape,Rental unit in Athina · 2 bedrooms · 2 beds · ...,,Is located in one of the hottest spots in Kolo...,https://a0.muscache.com/pictures/8965800b-9101...,...,,,,,f,1,1,0,0,
2,Athens,653274914834812593,https://www.airbnb.com/rooms/653274914834812593,20231225075512,2023-12-26,city scrape,Condo in Athina · ★5.0 · 1 bedroom · 2 beds · ...,,Η γειτονιά βρίσκεται μια ανάσα από τον Παρθενώ...,https://a0.muscache.com/pictures/b8926124-f861...,...,5.0,4.98,4.96,00001652202,t,1,1,0,0,2.68
3,Athens,54361219,https://www.airbnb.com/rooms/54361219,20231225075512,2023-12-26,city scrape,Rental unit in Athina · ★4.75 · 1 bedroom · 1 ...,,,https://a0.muscache.com/pictures/miso/Hosting-...,...,5.0,4.98,4.83,00001402443,t,1,1,0,0,1.83
4,Athens,51258073,https://www.airbnb.com/rooms/51258073,20231225075512,2023-12-26,city scrape,Rental unit in Athina · ★5.0 · Studio · 3 beds...,,,https://a0.muscache.com/pictures/2e0592af-8983...,...,5.0,5.00,5.00,00002160731,f,3,3,0,0,0.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137094,Prague,1049289098484116632,https://www.airbnb.com/rooms/1049289098484116632,20231220035754,2023-12-20,city scrape,Rental unit in Praha 2 · ★New · 1 bedroom · 1 ...,,,https://a0.muscache.com/pictures/miso/Hosting-...,...,,,,,t,7,0,7,0,
137095,Prague,1049298171311642069,https://www.airbnb.com/rooms/1049298171311642069,20231220035754,2023-12-20,city scrape,Rental unit in Praha 3 · ★New · 1 bedroom · 1 ...,,,https://a0.muscache.com/pictures/miso/Hosting-...,...,,,,,t,7,0,7,0,
137096,Prague,1049310644080944967,https://www.airbnb.com/rooms/1049310644080944967,20231220035754,2023-12-20,city scrape,Rental unit in Praha 3 · ★New · 1 bedroom · 1 ...,,,https://a0.muscache.com/pictures/miso/Hosting-...,...,,,,,t,7,0,7,0,
137097,Prague,1049329121454141045,https://www.airbnb.com/rooms/1049329121454141045,20231220035754,2023-12-20,city scrape,Rental unit in Praha 3 · ★New · 1 bedroom · 1 ...,,,https://a0.muscache.com/pictures/miso/Hosting-...,...,,,,,t,7,0,7,0,


# Data cleaning

### Removing columns that are not useful:

In [60]:
# Print a per-column summary (non-null count and dtype) of the merged frame
# to decide which of the 76 columns are worth keeping.
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137099 entries, 0 to 137098
Data columns (total 76 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   City                                          137099 non-null  object 
 1   id                                            137099 non-null  int64  
 2   listing_url                                   137099 non-null  object 
 3   scrape_id                                     137099 non-null  int64  
 4   last_scraped                                  137099 non-null  object 
 5   source                                        137099 non-null  object 
 6   name                                          137099 non-null  object 
 7   description                                   0 non-null       float64
 8   neighborhood_overview                         75451 non-null   object 
 9   picture_url                                   13

In [61]:
# Drop every column whose name contains 'host', except 'host_is_superhost'
# and 'host_identity_verified', which are kept as features.
host_columns = merged.filter(like='host').columns.tolist()
for kept_column in ('host_is_superhost', 'host_identity_verified'):
    # list.remove raises ValueError if an expected column is missing.
    host_columns.remove(kept_column)
merged.drop(columns=host_columns, inplace=True)

In [62]:
# Drop features judged uninformative for the analysis. Per the info() summary
# of the merged frame, 'description' has no non-null values at all.
uninformative_columns = [
    'listing_url', 'scrape_id', 'last_scraped', 'source',
    'name', 'description', 'neighborhood_overview', 'picture_url',
    'bathrooms', 'calendar_updated', 'calendar_last_scraped', 'license',
]
merged.drop(uninformative_columns, axis=1, inplace=True)

In [63]:
# Drop the redundant minimum/maximum-nights variants: every column whose name
# contains 'nights' except 'minimum_nights' and 'maximum_nights'.
# (The variable name's spelling is left as-is in case other cells refer to it.)
nigth_columns = merged.filter(like='nights').columns.tolist()
for kept_column in ('minimum_nights', 'maximum_nights'):
    # list.remove raises ValueError if an expected column is missing.
    nigth_columns.remove(kept_column)
merged.drop(nigth_columns, axis=1, inplace=True)


The dataset contains several variables for the number of reviews:
- number_of_reviews: total number of reviews
- number_of_reviews_ltm: number of reviews in the last twelve months
- number_of_reviews_l30d: number of reviews in the last 30 days

We keep these in the dataset.

We also have the variables `first_review` and `last_review`, which give the dates of the first and the last review. These are dropped.

In [64]:
# The review-date columns are not used; the review-count columns are kept.
review_date_columns = ['first_review', 'last_review']
merged.drop(review_date_columns, axis=1, inplace=True)

In [66]:
# Re-inspect the frame after the drops above; the summary now lists 36 columns.
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137099 entries, 0 to 137098
Data columns (total 36 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   City                          137099 non-null  object 
 1   id                            137099 non-null  int64  
 2   host_is_superhost             136878 non-null  object 
 3   host_identity_verified        137094 non-null  object 
 4   neighbourhood                 75452 non-null   object 
 5   neighbourhood_cleansed        137099 non-null  object 
 6   neighbourhood_group_cleansed  22751 non-null   object 
 7   latitude                      137099 non-null  float64
 8   longitude                     137099 non-null  float64
 9   property_type                 137099 non-null  object 
 10  room_type                     137099 non-null  object 
 11  accommodates                  137099 non-null  int64  
 12  bathrooms_text                136947 non-nul