In [1]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt

In [2]:
listings_data = pd.read_csv('listings.csv.gz')

In [3]:
listings_data

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,10080,https://www.airbnb.com/rooms/10080,20210412154228,2021-04-13,D1 - Million Dollar View 2 BR,"Stunning two bedroom, two bathroom apartment. ...",,https://a0.muscache.com/pictures/55778229/c2f7...,30899,https://www.airbnb.com/users/show/30899,...,9.0,10.0,9.0,18-476608,f,40,40,0,0,0.14
1,13358,https://www.airbnb.com/rooms/13358,20210412154228,2021-04-13,Monthly (or Longer ) Designer One Bedroom Down...,<b>The space</b><br />This suites central loca...,,https://a0.muscache.com/pictures/c23bb7ef-e300...,52116,https://www.airbnb.com/users/show/52116,...,10.0,10.0,9.0,18-611603,f,1,1,0,0,3.27
2,13490,https://www.airbnb.com/rooms/13490,20210412154228,2021-04-13,Vancouver's best kept secret,This apartment rents for one month blocks of t...,"In the heart of Vancouver, this apartment has ...",https://a0.muscache.com/pictures/73394727/79d5...,52467,https://www.airbnb.com/users/show/52467,...,10.0,10.0,10.0,,f,1,1,0,0,0.73
3,14267,https://www.airbnb.com/rooms/14267,20210412154228,2021-04-13,EcoLoft Vancouver,"The Ecoloft is located in the lovely, family r...",We live in the centre of the city of Vancouver...,https://a0.muscache.com/pictures/3646de9b-934e...,56030,https://www.airbnb.com/users/show/56030,...,9.0,10.0,9.0,21-156500,t,1,1,0,0,0.26
4,16254,https://www.airbnb.com/rooms/16254,20210412154228,2021-04-12,Close to PNE/Hastings Park and East Village,2 Bedroom garden level guest suite.(lower leve...,"Good Eats, Cute Shops and all the personality ...",https://a0.muscache.com/pictures/90623667/583c...,63238,https://www.airbnb.com/users/show/63238,...,10.0,10.0,9.0,19-162421,t,1,1,0,0,0.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4294,49132295,https://www.airbnb.com/rooms/49132295,20210412154228,2021-04-13,Luxurious 3 Bedroom Unit With English Bay Views!,"Stunning large, modern, brand-new 3 bedroom an...",,https://a0.muscache.com/pictures/d5f23f5b-f10b...,219146817,https://www.airbnb.com/users/show/219146817,...,,,,,f,5,5,0,0,
4295,49138797,https://www.airbnb.com/rooms/49138797,20210412154228,2021-04-13,19-9-10 Private room,Private room in a new house with shared bathro...,,https://a0.muscache.com/pictures/02048308-7b1f...,356993612,https://www.airbnb.com/users/show/356993612,...,,,,21-107408,f,14,1,13,0,
4296,49146266,https://www.airbnb.com/rooms/49146266,20210412154228,2021-04-13,Prime Coal harbour | Luxury Finishings,Welcome to this beautiful fully furnished Prim...,"Location, Location, Location! Steps from the S...",https://a0.muscache.com/pictures/ee697899-6185...,396405989,https://www.airbnb.com/users/show/396405989,...,,,,,f,1,1,0,0,
4297,49148812,https://www.airbnb.com/rooms/49148812,20210412154228,2021-04-13,DT- Ocean View 1BD w/ Private Bathroom Kitchen...,Private one bedroom suite apartment in heart o...,"Heart of Downtown Vancouver , walking distance...",https://a0.muscache.com/pictures/02a2a9da-68e0...,88405999,https://www.airbnb.com/users/show/88405999,...,,,,,f,1,1,0,0,


In [4]:
listings_data.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_upd

In [5]:
#ML PART

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

In [6]:
X = listings_data.drop('price',1)

In [7]:
y = listings_data['price']

In [8]:
def clean_amenities_data(amenities_data, amenities_required):

    #find unique amenities and the number of them to choose which are important for a traveller
    # print(amenities_data['amenity'].value_counts())

    #adapted from : https://www.kite.com/python/answers/how-to-filter-a-pandas-dataframe-with-a-list-by-%60in%60-or-%60not-in%60-in-python
    bool_series = amenities_data.amenity.isin(amenities_required)
    filtered_amenities_df = amenities_data[bool_series]
    filtered_amenities_df = filtered_amenities_df.drop(['timestamp','tags'], axis=1).dropna() # dropping unnecessary columns, and filter out NA values
    filtered_amenities_df.reset_index(inplace=True, drop=True)

    return filtered_amenities_df

In [9]:
# return a dictionary with number of amenities in a 1km radius of this lat and lon
def num_amenities(lat, lon, amenities_data_clean):
    distance = haversine_distance(amenities_data_clean, lon, lat)
    amenities_data_clean['distance'] = distance
    data_withinR = amenities_data_clean.loc[amenities_data_clean['distance'] < 1000].reset_index(drop=True)
    amenities_series = data_withinR.pivot_table(columns = ['amenity'], aggfunc='size')  # Counts # of amenities, type=pd.series
    amenities_dict = amenities_series.to_dict()# converts series to dict
    return amenities_dict
    

def ameneties_score(my_dict):
    
    num_different_amenities = len(my_dict)
    score = num_different_amenities * 10 
        
    for key in my_dict:
        if (my_dict[key] > 30):
            score+=30
        else:
            score+= my_dict[key]
    return score


#reference: https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
def haversine_distance(df, lon2, lat2):
    # convert decimal degrees to radians 
    lon1=np.radians(df['lon'])
    lat1=np.radians(df['lat'])
    lon2=np.radians(lon2)
    lat2=np.radians(lat2)
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = (dlat/2).apply(sin)**2 + (lat1).apply(sin) * cos(lat2) * (dlon/2).apply(sin)**2
    c = 2 * ((a).apply(sqrt).apply(asin)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r * 1000

In [10]:
amenities_data = pd.read_json('amenities-vancouver.json.gz', lines=True)
amenities_required = ['restaurant', 'fast_food', 'cafe','bank','atm','pharmacy','bicycle_rental','fuel','pub','bar','car_sharing','car_rental','clinic','doctors','hospital','ice_cream','fountain','theatre','police','bus_station']
amenities_data_clean = clean_amenities_data (amenities_data, amenities_required)

In [11]:
cols = ['latitude', 'longitude', 'host_response_time', 'host_response_rate', 'host_acceptance_rate','host_is_superhost','host_listings_count', 'host_total_listings_count', 'host_identity_verified','neighbourhood_cleansed', 'property_type', 'room_type', 'accommodates', 'bedrooms', 'beds', 'amenities', 'price',   'minimum_nights', 'maximum_nights', 'maximum_nights_avg_ntm',  'availability_30', 'availability_60', 'availability_90','availability_365','number_of_reviews', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value', 'reviews_per_month' ]

In [12]:
listings_data = listings_data[cols]

In [13]:
listings_data = listings_data.dropna(how='any',axis=0)

# find number of amenities provided by the host
listings_data['num_amenities'] = listings_data['amenities'].apply(lambda x: len(x)).astype('float64')
listings_data.drop('amenities', axis= 1, inplace = True)

In [14]:
from sklearn.preprocessing import LabelEncoder

#Label Encoding categorical Data
lb_make = LabelEncoder()
listings_data['host_response_time'] = lb_make.fit_transform(listings_data['host_response_time'])
listings_data['host_is_superhost'] = lb_make.fit_transform(listings_data['host_is_superhost'])
listings_data['host_identity_verified'] = lb_make.fit_transform(listings_data['host_identity_verified'])
listings_data['neighbourhood_cleansed'] = lb_make.fit_transform(listings_data['neighbourhood_cleansed'])
listings_data['property_type'] = lb_make.fit_transform(listings_data['property_type'])
listings_data['room_type'] = lb_make.fit_transform(listings_data['room_type'])

#convert strings to float
listings_data['host_response_rate'] = listings_data['host_response_rate'].apply(lambda x: float(x.replace('%','')))
listings_data['host_acceptance_rate'] = listings_data['host_acceptance_rate'].apply(lambda x: float(x.replace('%','')))
listings_data['price'] = listings_data['price'].apply(lambda x: float(x.replace('$','').replace(',','')))

In [15]:
listings_data_clean = listings_data

In [16]:
#add a column for number of amenities nearby to each listing
listings_data_clean['num_amenities_nearby'] = listings_data_clean.apply(lambda x: num_amenities(x['latitude'], x['longitude'], amenities_data_clean), axis = 1)

#add a column 'amenities_score' based on the number of different amenities nearby
listings_data_clean['amenities_score'] = listings_data_clean['num_amenities_nearby'].apply(lambda x : ameneties_score(x))
listings_data_clean = listings_data_clean.drop(['num_amenities_nearby'],1)

In [17]:
X = listings_data_clean.drop('price',1)
y = listings_data_clean['price']

X_train, X_valid, y_train, y_valid = train_test_split(X, y)

X_train_without_amenity_score = X_train.drop('amenities_score', 1)
X_valid_without_amenity_score = X_valid.drop('amenities_score', 1)

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import kneighbors_graph
from sklearn.ensemble import GradientBoostingRegressor

knn = KNeighborsRegressor(100)
knn.fit(X_train_without_amenity_score, y_train)
knn_sc = knn.score(X_valid_without_amenity_score, y_valid)
print("Knn Score:", knn_sc)

rf = RandomForestRegressor(100, max_depth=40)
rf.fit(X_train_without_amenity_score, y_train)
rf_sc = rf.score(X_valid_without_amenity_score, y_valid)
print("Random Forest Score:",rf_sc)

gb =  GradientBoostingRegressor()
gb.fit(X_train_without_amenity_score, y_train)    
gb_sc = gb.score(X_valid_without_amenity_score, y_valid)
print("Gradient Boosting Score:",gb_sc)

Knn Score: 0.027542405204356135
Random Forest Score: 0.6726830042758933
Gradient Boosting Score: 0.6460726112619264


In [19]:
knn_A = KNeighborsRegressor(100)
knn_A.fit(X_train, y_train)
knn_A_sc = knn_A.score(X_valid, y_valid)
print("\nKnn Score with ameneties_score:",knn_A_sc)

rf_A = RandomForestRegressor(100, max_depth=40)
# rf_A = RandomForestRegressor()
rf_A.fit(X_train, y_train)
rf_A_sc = rf_A.score(X_valid, y_valid)
print("Random Forest Score with amenities_score:",rf_A_sc)

gb_A = GradientBoostingRegressor()
gb_A.fit(X_train, y_train)  
gb_A_sc = gb_A.score(X_valid, y_valid)
print("Gradient Boosting Score with amenities_score:",gb_A_sc)


Knn Score with ameneties_score: 0.04096030328405598
Random Forest Score with amenities_score: 0.6791965091741946
Gradient Boosting Score with amenities_score: 0.6528613771823115
