# Contents List:
### 1. Importing libraries and data
### 2. Data Wrangling
### 3. Data Cleaning
### 4. Plotting a Choropleth
### 5. Answer to Analysis

### 1. Importing libraries and data

In [79]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json
import geopandas as gpd
from geopy.distance import geodesic

# for t-test / statistical testing for categorical variable: "host_is_superhost" and numerical variable: "number_of_reviews"
from scipy.stats import ttest_ind

In [80]:
# This command prompts matplotlib visuals to appear in the notebook
%matplotlib inline

In [81]:
# Folder that holds the project's input files; the listings CSV is read from this folder.
# NOTE(review): this is an absolute path on one machine -- point it at your own data folder before re-running.
path = r'C:\Users\justi\Downloads\2024AirbnbAmsterdam'

In [82]:
# Import the Amsterdam listings dataset.
# - index_col=0 uses the first CSV column as the row index, so no extra "Unnamed: 0" column appears.
# - low_memory=False makes pandas parse the file in one pass instead of in chunks, which avoids
#   mixed-dtype inference within a column (at the cost of higher memory use while parsing).
df_listings = pd.read_csv(os.path.join(path, 'listings_checked2.csv'), index_col = 0, low_memory= False)

In [83]:
# Load the Netherlands country outline (GeoJSON) into a GeoDataFrame.
# Source (CF's reference link): https://github.com/johan/world.geo.json/blob/master/countries/NLD.geo.json
# The file is opened in a `with` block so the handle is closed as soon as geopandas has read it,
# rather than staying open for the life of the kernel.
with open(r'C:\Users\justi\Downloads\2024AirbnbAmsterdam\NLD.geo.json') as json_netherlands:
    geo_data = gpd.read_file(json_netherlands)

In [84]:
# List the column names to see which fields are available
df_listings.columns

Index(['caption', 'host_id', 'host_is_superhost', 'host_listings_count',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'amenities',
       'price', 'guests_included', 'extra_people', 'minimum_nights',
       'maximum_nights', 'number_of_reviews', 'review_scores_rating',
       'instant_bookable', 'cancellation_policy', 'first_review',
       'last_review', 'is_apartment', 'color'],
      dtype='object')

In [85]:
# Preview the listings table (pandas truncates the display to the first/last rows and columns)
df_listings

Unnamed: 0_level_0,caption,host_id,host_is_superhost,host_listings_count,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,...,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,instant_bookable,cancellation_policy,first_review,last_review,is_apartment,color
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2818,Quiet Garden View Room & Super Fast WiFi,3159,1,1,Oostelijk Havengebied - Indische Buurt,52.365755,4.941419,Apartment,Private room,2,...,3,15,248,97,1,strict_14_with_grace_period,3/30/2009,11/28/2018,Apartment,orange
3209,"Quiet apt near center, great view",3806,0,1,Westerpark,52.390225,4.873924,Apartment,Entire home/apt,5,...,4,20,42,96,0,moderate,7/31/2015,8/29/2018,Apartment,orange
20168,100%Centre-Studio 1 Private Floor/Bathroom,59484,0,2,Centrum-Oost,52.365087,4.893541,Townhouse,Entire home/apt,2,...,1,1000,233,87,0,strict_14_with_grace_period,3/2/2010,11/30/2018,Not Apartment,green
25428,Lovely apt in City Centre (Jordaan),56142,0,2,Centrum-West,52.373114,4.883668,Apartment,Entire home/apt,3,...,14,60,1,100,0,strict_14_with_grace_period,1/21/2018,1/21/2018,Apartment,green
27886,"Romantic, stylish B&B houseboat in canal district",97647,1,1,Centrum-West,52.386727,4.892078,Houseboat,Private room,2,...,2,730,171,99,1,strict_14_with_grace_period,1/9/2012,11/25/2018,Not Apartment,green
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30576148,Family House City + free Parking+garden (160 m2),13399651,0,1,Watergraafsmeer,52.345999,4.952145,House,Entire home/apt,4,...,7,12,0,0,0,strict_14_with_grace_period,,,Not Apartment,orange
30577727,Home Sweet Home in Indische Buurt,1595885,0,2,Oostelijk Havengebied - Indische Buurt,52.362412,4.932467,Apartment,Entire home/apt,3,...,3,14,0,0,0,flexible,,,Apartment,orange
30578037,Amsterdam Cozy apartment nearby center,87866499,0,2,Oud-Oost,52.362431,4.926912,Apartment,Entire home/apt,2,...,10,22,0,0,0,moderate,,,Apartment,orange
30579673,Home Sweet Home for a Guest or a Couple,1595885,0,2,Oostelijk Havengebied - Indische Buurt,52.363780,4.932493,Apartment,Private room,2,...,2,15,0,0,0,flexible,,,Apartment,orange


### 2. Data Wrangling

In [86]:
# Use the shorter American spelling: 'neighbourhood_cleansed' becomes 'neighborhood'
df_listings.rename({'neighbourhood_cleansed': 'neighborhood'}, axis='columns', inplace=True)

In [87]:
# Give the 'color' flag a descriptive name: 'color' becomes 'Within_1Mile_DamSquare'
df_listings.rename({'color': 'Within_1Mile_DamSquare'}, axis='columns', inplace=True)

### 3. Data Cleaning - addressing extreme values

In [88]:
# Keep only apartments whose 'Within_1Mile_DamSquare' flag is "green" (within 1 mile of Dam Square)
apartments_green = df_listings.loc[
    df_listings['property_type'].eq('Apartment')
    & df_listings['Within_1Mile_DamSquare'].eq('green')
]

In [89]:
# Preview the "green" apartment subset
apartments_green

Unnamed: 0_level_0,caption,host_id,host_is_superhost,host_listings_count,neighborhood,latitude,longitude,property_type,room_type,accommodates,...,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,instant_bookable,cancellation_policy,first_review,last_review,is_apartment,Within_1Mile_DamSquare
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25428,Lovely apt in City Centre (Jordaan),56142,0,2,Centrum-West,52.373114,4.883668,Apartment,Entire home/apt,3,...,14,60,1,100,0,strict_14_with_grace_period,1/21/2018,1/21/2018,Apartment,green
28871,Comfortable double room,124245,1,3,Centrum-West,52.367187,4.890918,Apartment,Private room,2,...,2,1825,215,97,0,moderate,8/22/2010,12/3/2018,Apartment,green
29051,Comfortable single room,124245,1,3,Centrum-West,52.367725,4.891512,Apartment,Private room,1,...,2,730,383,95,0,moderate,3/16/2011,12/5/2018,Apartment,green
41125,Amsterdam Center Entire Apartment,178515,0,1,Centrum-West,52.378915,4.883205,Apartment,Entire home/apt,2,...,3,21,76,95,0,moderate,11/25/2010,10/7/2018,Apartment,green
44129,Luxury design with canal view,187728,1,11,Centrum-West,52.380711,4.886104,Apartment,Entire home/apt,3,...,2,1125,176,98,0,strict_14_with_grace_period,8/16/2010,10/1/2018,Apartment,green
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30551479,Amsterdam Apartment in City Centre,151566094,0,1,Centrum-West,52.375609,4.894486,Apartment,Entire home/apt,4,...,1,1125,0,0,1,flexible,,,Apartment,green
30556050,Amsterdam Downtown- West,22701985,1,1,De Baarsjes - Oud-West,52.363157,4.875763,Apartment,Entire home/apt,4,...,3,1125,0,0,0,strict_14_with_grace_period,,,Apartment,green
30556993,Cheap Cozy Balcony Apartment Jordaan District,229109698,0,1,Centrum-West,52.370966,4.882705,Apartment,Entire home/apt,4,...,3,1125,0,0,1,strict_14_with_grace_period,,,Apartment,green
30563877,Large comfortable apartments in the center,228749822,0,1,Centrum-Oost,52.365812,4.896044,Apartment,Entire home/apt,4,...,7,1125,0,0,0,flexible,,,Apartment,green


In [90]:
# Quartiles and interquartile range (IQR) of 'price' for the green apartments
Q1, Q3 = apartments_green['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Outlier cut-offs: 1.5 * IQR below the first quartile and above the third quartile
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

In [93]:
# Keep only the rows whose price lies inside the outlier bounds (both ends inclusive)
apartments_filtered = apartments_green[apartments_green['price'].between(lower_bound, upper_bound)]

In [94]:
# Collect the rows whose price falls strictly outside the outlier bounds
outliers = apartments_green.loc[
    apartments_green['price'].lt(lower_bound) | apartments_green['price'].gt(upper_bound)
]

In [95]:
# Report the row counts before and after outlier removal, then list the outlier rows themselves
initial_count, filtered_count = len(apartments_green), len(apartments_filtered)

print(
    f"Initial listings: {initial_count} apartments labeled as 'green.'",
    f"Listings after outlier removal: {filtered_count} apartments.",
    outliers,
    sep="\n",
)

Initial listings: 3679 apartments labeled as 'green.'
Listings after outlier removal: 3422 apartments.
                                                    caption    host_id  \
id                                                                       
48076                    Amsterdam Central and lot of space     219080   
245927    Sonnenberg - Canal side & view - Most central ...    1005087   
507598                   Canal View Apartment! Jordaan Area    2501656   
1079362                    Central large & modern apartment    5936488   
1233749                       Amsterdam Stay Apartment 1571     329249   
...                                                     ...        ...   
30221647                                   President suites  117151599   
30286344            Lovely Two floor apartment for 3 people   70014679   
30356862     Luxury Penthouse with view on the Royal Palace   23824257   
30495375        90 sqm Jordaan apartment with roof terrace!   10698721   
30498473 

In [96]:
# 257 / 3679 = ~7% of the green apartments are price outliers.
# Since I am price-conscious and that is a small share compared to the remaining 3422 options,
# use the outlier-free 'apartments_filtered' subset moving forward.
apartments_filtered 

Unnamed: 0_level_0,caption,host_id,host_is_superhost,host_listings_count,neighborhood,latitude,longitude,property_type,room_type,accommodates,...,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,instant_bookable,cancellation_policy,first_review,last_review,is_apartment,Within_1Mile_DamSquare
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25428,Lovely apt in City Centre (Jordaan),56142,0,2,Centrum-West,52.373114,4.883668,Apartment,Entire home/apt,3,...,14,60,1,100,0,strict_14_with_grace_period,1/21/2018,1/21/2018,Apartment,green
28871,Comfortable double room,124245,1,3,Centrum-West,52.367187,4.890918,Apartment,Private room,2,...,2,1825,215,97,0,moderate,8/22/2010,12/3/2018,Apartment,green
29051,Comfortable single room,124245,1,3,Centrum-West,52.367725,4.891512,Apartment,Private room,1,...,2,730,383,95,0,moderate,3/16/2011,12/5/2018,Apartment,green
41125,Amsterdam Center Entire Apartment,178515,0,1,Centrum-West,52.378915,4.883205,Apartment,Entire home/apt,2,...,3,21,76,95,0,moderate,11/25/2010,10/7/2018,Apartment,green
44129,Luxury design with canal view,187728,1,11,Centrum-West,52.380711,4.886104,Apartment,Entire home/apt,3,...,2,1125,176,98,0,strict_14_with_grace_period,8/16/2010,10/1/2018,Apartment,green
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30551479,Amsterdam Apartment in City Centre,151566094,0,1,Centrum-West,52.375609,4.894486,Apartment,Entire home/apt,4,...,1,1125,0,0,1,flexible,,,Apartment,green
30556050,Amsterdam Downtown- West,22701985,1,1,De Baarsjes - Oud-West,52.363157,4.875763,Apartment,Entire home/apt,4,...,3,1125,0,0,0,strict_14_with_grace_period,,,Apartment,green
30556993,Cheap Cozy Balcony Apartment Jordaan District,229109698,0,1,Centrum-West,52.370966,4.882705,Apartment,Entire home/apt,4,...,3,1125,0,0,1,strict_14_with_grace_period,,,Apartment,green
30563877,Large comfortable apartments in the center,228749822,0,1,Centrum-Oost,52.365812,4.896044,Apartment,Entire home/apt,4,...,7,1125,0,0,0,flexible,,,Apartment,green


In [97]:
# Count missing values per column
apartments_filtered.isna().sum()

caption                     8
host_id                     0
host_is_superhost           0
host_listings_count         0
neighborhood                0
latitude                    0
longitude                   0
property_type               0
room_type                   0
accommodates                0
bathrooms                   0
bedrooms                    0
amenities                   0
price                       0
guests_included             0
extra_people                0
minimum_nights              0
maximum_nights              0
number_of_reviews           0
review_scores_rating        0
instant_bookable            0
cancellation_policy         0
first_review              345
last_review               345
is_apartment                0
Within_1Mile_DamSquare      0
dtype: int64

In [98]:
# Select the rows where at least one of 'first_review' / 'last_review' is missing
missing_reviews = apartments_filtered[
    apartments_filtered[['first_review', 'last_review']].isna().any(axis=1)
]

In [68]:
# Display the rows with missing 'first_review' or 'last_review' data (plain-text print of the DataFrame)
print(missing_reviews)

                                                caption    host_id  \
id                                                                   
626872                    Beautiful Apartment Amsterdam    3111814   
1003865             Newly Renov Elegant apt in Old West      56142   
1127128   Bright and cozy apartment in trendy district!    6179713   
1147324      City Centre full apartment minimal 20 days    6289853   
1971204          AMSTERDAM APARTMENT (CENTRAL LOCATION)    8317919   
...                                                 ...        ...   
30551479             Amsterdam Apartment in City Centre  151566094   
30556050                       Amsterdam Downtown- West   22701985   
30556993  Cheap Cozy Balcony Apartment Jordaan District  229109698   
30563877     Large comfortable apartments in the center  228749822   
30573892                     Clean and perfect location  229361236   

          host_is_superhost  host_listings_count            neighborhood  \
id           

In [99]:
# Mean price across all outlier-free green apartments
apartments_filtered['price'].mean()

154.64026884862653

In [100]:
# Mean price of the listings with a missing first/last review date, for comparison with the overall mean
missing_reviews['price'].mean()

151.15072463768115

#### 8 missing captions is OK; the host is probably using the platform for the first time.
#### `missing_reviews['price'].mean()` shows that the 345 listings without reviews are priced competitively: ~151/night on average vs. ~154/night for all filtered apartments.
#### KEEP THESE LINE ITEMS TO REVIEW IN THE CHOROPLETH BELOW

### 4. Plot a choropleth

#### 4a. Address t-test results regarding "Superhost" and "Number of Review" status

In [149]:
# Split the review counts into two samples using the "host_is_superhost" flag:
#   - superhosts:     host_is_superhost == 1
#   - non-superhosts: host_is_superhost == 0
superhost_reviews = df_listings.loc[df_listings['host_is_superhost'].eq(1), 'number_of_reviews']
not_superhost_reviews = df_listings.loc[df_listings['host_is_superhost'].eq(0), 'number_of_reviews']

In [150]:
# number_of_reviews: the review count recorded on each listing row; compared here between the two groups.
# Perform an independent two-sample t-test to see whether the average "number_of_reviews" differs significantly.
# NOTE(review): ttest_ind defaults to equal_var=True (pooled-variance Student's t-test); if the two groups'
# variances differ, Welch's test (equal_var=False) is the safer choice -- confirm before relying on the result.
t_stat, p_value = ttest_ind(superhost_reviews, not_superhost_reviews)

In [151]:
# Report the test statistic and its p-value
print(f'T-statistic: {t_stat}')
print(f'P-value: {p_value}')

T-statistic: 48.929516089799364
P-value: 0.0


#### Answer: 
#### T-statistic = 48.93 suggests that the difference in the average "number_of_reviews" between superhosts and non-superhosts is large relative to its standard error.
#### P-value = 0.0 (i.e., too small to be represented in floating point) indicates that this difference is statistically significant.

#### Superhosts vs. Non-Superhosts: The results suggest that superhosts and non-superhosts have very different average numbers of reviews. Since the t-statistic is large and positive (superhosts are the first sample) and the p-value is extremely small, we can confidently say that superhosts have a higher average number of reviews than non-superhosts.
#### It is very unlikely that this difference happened by random chance.
#### It suggests that superhosts either attract more reviews or that the status of being a superhost contributes to receiving more reviews.

### 4b. Plotting a choropleth for Superhosts under 1 mile from Dam Square

In [103]:
# Coordinates for Dam Square as (latitude, longitude); used as the reference point for the distance calculation
dam_square_coords = (52.3732, 4.8936)

In [153]:
# Geodesic distance (in miles) from every listing to Dam Square
df_listings['distance_to_dam_square'] = [
    geodesic((lat, lon), dam_square_coords).miles
    for lat, lon in zip(df_listings['latitude'], df_listings['longitude'])
]

In [155]:
# Keep apartments that are less than 1 mile from Dam Square and priced below $154
# (154 is presumably the rounded-down mean price, ~154.64, of the outlier-free green apartments)
filtered_apartments = df_listings.loc[
    df_listings['property_type'].eq('Apartment')
    & df_listings['distance_to_dam_square'].lt(1)
    & df_listings['price'].lt(154)
]

In [156]:
# Create the base map centered on Dam Square. The center reuses `dam_square_coords` so the map
# and the distance calculation share one reference point instead of two copies of the same literals.
m = folium.Map(location=list(dam_square_coords), zoom_start=13)

In [157]:
# Draw one small circle per filtered apartment: green for superhosts, orange otherwise
for _, listing in filtered_apartments.iterrows():
    is_superhost = listing['host_is_superhost'] == 1
    marker_color = 'green' if is_superhost else 'orange'

    # Popup shown when a marker is clicked: price, distance and superhost status
    popup_text = (
        f"Price: ${listing['price']}, "
        f"Distance to Dam Square: {listing['distance_to_dam_square']:.2f} miles, "
        f"Superhost: {'Yes' if is_superhost else 'No'}"
    )

    folium.CircleMarker(
        location=(listing['latitude'], listing['longitude']),
        radius=1,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.1,
        popup=popup_text,
    ).add_to(m)

In [158]:
# Display the map (the bare expression as the cell's last line renders it in the notebook)
m

In [161]:
# Save the map to an HTML file; the relative filename resolves against the current working directory
m.save('amsterdam_filtered_apartments_map.html')

### 5. Answer to Analysis

In [168]:
# Split the filtered apartments into superhost / non-superhost subsets
superhost_apartments = filtered_apartments.loc[filtered_apartments['host_is_superhost'].eq(1)]
not_superhost_apartments = filtered_apartments.loc[filtered_apartments['host_is_superhost'].eq(0)]

# Mean price charged per night by each group within this subset
average_price_superhost, average_price_not_superhost = (
    subset['price'].mean()
    for subset in (superhost_apartments, not_superhost_apartments)
)

In [169]:
# Print the headline averages (one statement per line)
print(
    "The average price ~$154/night for apartments under 1 mile distance from epicenter: Dam Square.",
    f"The average price 'Superhosts' charge near Dam Square: ${average_price_superhost:.2f}",
    f"The average price 'Not Superhosts' charge near Dam Square: ${average_price_not_superhost:.2f}",
    sep="\n",
)

The average price ~$154/night for apartments under 1 mile distance from epicenter: Dam Square.
The average price 'Superhosts' charge near Dam Square: $115.02
The average price 'Not Superhosts' charge near Dam Square: $115.01


In [170]:
# Split the superhost apartments by cancellation-policy keyword (case-insensitive substring match;
# rows with a missing policy never match)
strict_superhosts, flexible_superhosts, moderate_superhosts = (
    superhost_apartments[
        superhost_apartments['cancellation_policy'].str.contains(keyword, case=False, na=False)
    ]
    for keyword in ('strict', 'flexible', 'moderate')
)

# Print the number of listings in each subset
for keyword, subset in (
    ('strict', strict_superhosts),
    ('flexible', flexible_superhosts),
    ('moderate', moderate_superhosts),
):
    print(f"Number of superhost apartments with a cancellation policy containing '{keyword}': {len(subset)}")

Number of superhost apartments with a cancellation policy containing 'strict': 147
Number of superhost apartments with a cancellation policy containing 'flexible': 40
Number of superhost apartments with a cancellation policy containing 'moderate': 132


In [171]:
# Mean price of the superhost apartments under each cancellation-policy keyword
average_price_strict_superhosts, average_price_flexible_superhosts, average_price_moderate_superhosts = (
    subset['price'].mean()
    for subset in (strict_superhosts, flexible_superhosts, moderate_superhosts)
)

# Print one line per policy, formatted to two decimals
for keyword, average_price in (
    ('strict', average_price_strict_superhosts),
    ('flexible', average_price_flexible_superhosts),
    ('moderate', average_price_moderate_superhosts),
):
    print(f"Average price for superhost apartments with a cancellation policy containing '{keyword}': ${average_price:.2f}")

Average price for superhost apartments with a cancellation policy containing 'strict': $119.21
Average price for superhost apartments with a cancellation policy containing 'flexible': $110.58
Average price for superhost apartments with a cancellation policy containing 'moderate': $111.70


#### Given that there are ~40 apartments within 1 mile of Dam Square that are "superhost" and "flexible" with an average price of ~110, my next research questions would be:
#### How many reviews per month did these "superhost flexible apartments" get?
#### Are they too popular? How many months in advance do I need to secure my Airbnb reservation before booking my flight?