In [10]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import kneighbors_graph
from sklearn.ensemble import GradientBoostingRegressor
import re

# returns input datasets 
def read_data():
    listings_data = pd.read_csv('listings.csv.gz')
    amenities_data = pd.read_json('amenities-vancouver.json.gz', lines=True)
    user_input1 = pd.read_csv('input2.txt', sep=':\s', names=['Preference','Preference_Data'], engine='python')
    
    return listings_data, amenities_data, user_input1

# returns cleaned amenities df
def clean_amenities_data(amenities_data, amenities_required):

    # find unique amenities and the number of them to choose which are important for a traveller

    #adapted from : https://www.kite.com/python/answers/how-to-filter-a-pandas-dataframe-with-a-list-by-%60in%60-or-%60not-in%60-in-python
    bool_series = amenities_data.amenity.isin(amenities_required)
    filtered_amenities_df = amenities_data[bool_series]
    filtered_amenities_df = filtered_amenities_df.drop(['timestamp','tags'], axis=1).dropna() # dropping unnecessary columns, and filter out NA values
    filtered_amenities_df.reset_index(inplace=True, drop=True)

    return filtered_amenities_df

# returns the difference between two points
# reference: https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
def haversine_distance(df, lon2, lat2):
    # convert decimal degrees to radians 
    lon1=np.radians(df['lon'])
    lat1=np.radians(df['lat'])
    lon2=np.radians(lon2)
    lat2=np.radians(lat2)
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = (dlat/2).apply(sin)**2 + (lat1).apply(sin) * cos(lat2) * (dlon/2).apply(sin)**2
    c = 2 * ((a).apply(sqrt).apply(asin)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r * 1000

# returns cleaned data listings
def clean_listings_data(listings_data, accommodates_input, room_input, price_range_input, exact, amenities_data_clean):
    #keep only the columns we need
    columns_needed = ['id', 'listing_url', 'name', 'description', 'picture_url', 'latitude', 'longitude', 'property_type', 'accommodates', 'bedrooms', 'beds', 'amenities', 'price',  'review_scores_value']
    listings_data = listings_data[columns_needed].copy()
    listings_data['price'] = listings_data['price'].apply(lambda x: x.replace(',','').replace('$','')).astype(float)
    
    pd.set_option('mode.chained_assignment', None)
    # extract price_range from string
    p_range = [float(s) for s in re.findall('[0-9]+', price_range_input)]
    min_price = p_range[0]
    max_price = p_range[1]
    
    bedrooms=listings_data['bedrooms']
    accommodates=listings_data['accommodates']
    
    # if user is fine with referencing their input as minimum requirements -> if not exact
    # if user wants exact filter -> else 
    
    if not exact:
        # find listing data with bedrooms >= room_input and accommodates >= accomodates_input
        listings_data = listings_data[(bedrooms >= room_input) & (accommodates >= accommodates_input)]
        listings_data = listings_data[(listings_data['price'] <= max_price) & (listings_data['price'] >= min_price)]
    else:
        # find listing data with bedrooms == room_input and accommodates == accomodates_input
        listings_data = listings_data[(bedrooms == room_input) & (accommodates == accommodates_input)].reset_index(drop=True)
        listings_data = listings_data[(listings_data['price'] <= max_price) & (listings_data['price'] >= min_price)]
    
    # if listings_data is empty
    if listings_data.empty:
        print("Cannot find any listings with current filter, please try with other filters.\n")
        return   
    #add a column for number of amenities nearby to each listing
    listings_data['num_amenities_nearby'] = listings_data.apply(lambda x: num_amenities(x['latitude'], x['longitude'], amenities_data_clean), axis = 1)

    #add a column 'amenities_score' based on the number of different amenities nearby
    listings_data['amenities_score'] = listings_data['num_amenities_nearby'].apply(lambda x : ameneties_score(x))
    
    listings_data=listings_data.reset_index(drop=True)
    
    return listings_data


# return a dictionary with number of amenities in a 1km radius of this lat and lon
def num_amenities(lat, lon, amenities_data_clean):
    distance = haversine_distance(amenities_data_clean, lon, lat)
    amenities_data_clean['distance'] = distance
    data_withinR = amenities_data_clean.loc[amenities_data_clean['distance'] < 1000].reset_index(drop=True)
    amenities_series = data_withinR.pivot_table(columns = ['amenity'], aggfunc='size')  # Counts # of amenities, type=pd.series
    amenities_dict = amenities_series.to_dict()# converts series to dict
    return amenities_dict    

# returns calculated amenity score
def ameneties_score(my_dict):
    
    num_different_amenities = len(my_dict)
    score = num_different_amenities * 10 
        
    for key in my_dict:
        if (my_dict[key] > 30):
            score+=30
        else:
            score+= my_dict[key]
    return score

# returns cleaned amenities df and listings df sorted by amenity score
def find_best_listing(listings_data, amenities_data, user_input1):

    # handle input file
    accommodates_input, room_input, price_range_input, exact = handle_input(user_input1)
    
    # clean OSM amenities data
    amenities_required = ['restaurant', 'fast_food', 'cafe','bank','atm','pharmacy','bicycle_rental','fuel','pub','bar','car_sharing','car_rental','clinic','doctors','hospital','ice_cream','fountain','theatre','police','bus_station']
    amenities_data_clean = clean_amenities_data(amenities_data, amenities_required)
    
    # clean AirBnb listings data
    listings_data_clean = clean_listings_data(listings_data, accommodates_input, room_input, price_range_input, exact, amenities_data_clean)
    
    # sort by amenities_score in descending order, drop na scores(?)
    listings_by_ascore = listings_data_clean.sort_values(['amenities_score'], ascending=False).dropna().reset_index(drop=True)
    
    return amenities_data_clean, listings_by_ascore

# returns cleaned listings data for ML
def clean_data_ML(listings_data):
    
    columns_needed = ['latitude', 'longitude', 'host_response_time', 'host_response_rate', 'host_acceptance_rate','host_is_superhost','host_listings_count', 'host_total_listings_count', 'host_identity_verified','neighbourhood_cleansed', 'property_type', 'room_type', 'accommodates', 'bedrooms', 'beds', 'amenities', 'price',   'minimum_nights', 'maximum_nights', 'maximum_nights_avg_ntm',  'availability_30', 'availability_60', 'availability_90','availability_365','number_of_reviews', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value', 'reviews_per_month' ]
    listings_data = listings_data[columns_needed]

    # remove all rows with any Null Value
    listings_data = listings_data.dropna(how='any',axis=0) 

    # find number of amenities provided by the host
    listings_data['num_amenities'] = listings_data['amenities'].apply(lambda x: len(x)).astype('float64')
    listings_data.drop('amenities', axis= 1, inplace = True)

    #Label Encoding categorical Data
    lb_make = LabelEncoder()
    listings_data['host_response_time'] = lb_make.fit_transform(listings_data['host_response_time'])
    listings_data['host_is_superhost'] = lb_make.fit_transform(listings_data['host_is_superhost'])
    listings_data['host_identity_verified'] = lb_make.fit_transform(listings_data['host_identity_verified'])
    listings_data['neighbourhood_cleansed'] = lb_make.fit_transform(listings_data['neighbourhood_cleansed'])
    listings_data['property_type'] = lb_make.fit_transform(listings_data['property_type'])
    listings_data['room_type'] = lb_make.fit_transform(listings_data['room_type'])

    #convert strings to float
    listings_data['host_response_rate'] = listings_data['host_response_rate'].apply(lambda x: float(x.replace('%','')))
    listings_data['host_acceptance_rate'] = listings_data['host_acceptance_rate'].apply(lambda x: float(x.replace('%','')))
    listings_data['price'] = listings_data['price'].apply(lambda x: float(x.replace('$','').replace(',','')))
    return listings_data

# returns the scores of different models
def run_ml(listings_data_clean, amenities_data_clean):
    
    X = listings_data_clean.drop('price',1)
    y = listings_data_clean['price']

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    knn = KNeighborsRegressor(n_neighbors=50)
    knn.fit(X_train, y_train)
    knn_sc = knn.score(X_valid, y_valid)
    print("Knn Score:", knn_sc)
    
    rf = RandomForestRegressor(100, max_depth=40)
    rf.fit(X_train, y_train)
    rf_sc = rf.score(X_valid, y_valid)
    print("Random Forest Score:",rf_sc)
    
    gb =  GradientBoostingRegressor()
    gb.fit(X_train, y_train)    
    gb_sc = gb.score(X_valid, y_valid)
    print("Gradient Boosting Score:",gb_sc)

    # Now we want to see if adding amenities score improves our model
    #add a column for number of amenities nearby to each listing
    listings_data_clean['num_amenities_nearby'] = listings_data_clean.apply(lambda x: num_amenities(x['latitude'], x['longitude'], amenities_data_clean), axis = 1)
    
    #add a column 'amenities_score' based on the number of different amenities nearby
    listings_data_clean['amenities_score'] = listings_data_clean['num_amenities_nearby'].apply(lambda x : ameneties_score(x))
    listings_data_clean = listings_data_clean.drop('num_amenities_nearby',1)

    X = listings_data_clean.drop('price',1)
    y = listings_data_clean['price']

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)

    knn_A = KNeighborsRegressor(50)
    knn_A.fit(X_train, y_train)
    knn_A_sc = knn_A.score(X_valid, y_valid)
    print("Knn Score with ameneties_score:",knn_A_sc)

    rf_A = RandomForestRegressor(100, max_depth=40)
    rf_A.fit(X_train, y_train)
    rf_A_sc = rf_A.score(X_valid, y_valid)
    print("Random Forest Score with amenities_score:",rf_A_sc)

    gb_A = GradientBoostingRegressor()
    gb_A.fit(X_train, y_train)  
    gb_A_sc = gb_A.score(X_valid, y_valid)
    print("Gradient Boosting Score with amenities_score:",gb_A_sc)

    return knn_sc, knn_A_sc, rf_sc, rf_A_sc, gb_sc, gb_A_sc

# handles returns converted input values from file 
def handle_input(inputfile):
    for (x) in range(len(inputfile)):
        if inputfile['Preference'].iloc[x].lower() == "accommodates":
            accommodates_input = float(inputfile['Preference_Data'].iloc[x])
        if inputfile['Preference'].iloc[x].lower() == "bedrooms":
            room_input = float(inputfile['Preference_Data'].iloc[x])
        if inputfile['Preference'].iloc[x].lower() == "price range":
            price_range_input = inputfile['Preference_Data'].iloc[x]
        if inputfile['Preference'].iloc[x].lower() == "exact":
            exact = inputfile['Preference_Data'].iloc[x]
    if exact.lower() == "true":
        exact = True
    else:
        exact = False
           
    return accommodates_input, room_input, price_range_input, exact

def main():
    # read data
    listings_data, amenities_data, user_input1 = read_data()
        
    # sort filtered data by best review score, best amenity score, and best price
    [amenities_data_clean, sorted_output] = find_best_listing(listings_data, amenities_data, user_input1)

    # Perform ML Trials and store output score to df
    [knn_sc, knn_A_sc, rf_sc, rf_A_sc, gb_sc, gb_A_sc] = run_ml(clean_data_ML(listings_data), amenities_data_clean)
    ML_RES = [[knn_sc,knn_A_sc], 
              [rf_sc, rf_A_sc], 
              [gb_sc,gb_A_sc]]
    ML_df = pd.DataFrame(ML_RES, 
                         columns=["AirBnb's Listing Info", "AirBnb's Listing Info with Amenity Scores"], 
                         index=['K-Nearest Neighbors','Random Forest','Gradient Boosting'])
    ML_df.index.name = "Regressors Used"
    
    # outputting the filtered listings --> top 3 and total
    TOTAL_OUT = sorted_output.to_csv("Total_Filtered_ABNB_Listings.csv",na_rep='(missing)')
    
    # outputting the ML prediction results
    ML_OUT = ML_df.to_csv("ML_Price_Prediction.csv",na_rep='(missing)')


In [11]:
main()

Knn Score: 0.03929510300509198
Random Forest Score: 0.465551286663479
Gradient Boosting Score: 0.48327601185524005
Knn Score with ameneties_score: 0.049345659397194264
Random Forest Score with amenities_score: 0.43386240110955565
Gradient Boosting Score with amenities_score: 0.40866321408733375


In [13]:
listings_data, amenities_data, user_input1 = read_data()
        
# sort filtered data by best review score, best amenity score, and best price
[amenities_data_clean, sorted_output] = find_best_listing(listings_data, amenities_data, user_input1)


In [14]:
sorted_output

Unnamed: 0,id,listing_url,name,description,picture_url,latitude,longitude,property_type,accommodates,bedrooms,beds,amenities,price,review_scores_value,num_amenities_nearby,amenities_score
0,21581560,https://www.airbnb.com/rooms/21581560,Trendy Downtown Condo with 2 Beds and Parking ...,Beautifully decorated 2 bedroom apartment with...,https://a0.muscache.com/pictures/690ecd3b-7ab7...,49.28212,-123.12293,Entire condominium,3,2.0,2.0,"[""Iron"", ""Dryer"", ""Elevator"", ""Hot water"", ""Pa...",140.0,10.0,"{'atm': 1, 'bank': 31, 'bar': 30, 'bicycle_ren...",429
1,9046022,https://www.airbnb.com/rooms/9046022,Downtown Living - 2 Bedroom with Parking (E2),Newly furnished 2 Bedroom apartment suite with...,https://a0.muscache.com/pictures/6ae31dae-6e9f...,49.28295,-123.12323,Entire condominium,3,2.0,2.0,"[""Iron"", ""Dryer"", ""Elevator"", ""Hot water"", ""Fi...",140.0,10.0,"{'atm': 1, 'bank': 30, 'bar': 30, 'bicycle_ren...",427
2,24993810,https://www.airbnb.com/rooms/24993810,Trendy Downtown 2 Bedroom Condo,Newly furnished 2 Bedroom with secure undergro...,https://a0.muscache.com/pictures/bdda7e20-5bb2...,49.28106,-123.12318,Entire condominium,3,2.0,2.0,"[""Iron"", ""Dryer"", ""Bed linens"", ""Cooking basic...",140.0,9.0,"{'atm': 1, 'bank': 32, 'bar': 31, 'bicycle_ren...",426
3,30892524,https://www.airbnb.com/rooms/30892524,Downtown Gem - Jr 2 Bedroom Suite with Parking...,Newly furnished 2 Bedroom with secure undergro...,https://a0.muscache.com/pictures/f604ad69-3944...,49.28029,-123.12292,Entire apartment,3,2.0,2.0,"[""Heating"", ""Essentials"", ""Kitchen"", ""Iron"", ""...",135.0,10.0,"{'atm': 1, 'bank': 32, 'bar': 29, 'bicycle_ren...",424
4,5423637,https://www.airbnb.com/rooms/5423637,Gorgeous City Views from a Relaxed Apartment,Start the day looking out on sprawling city vi...,https://a0.muscache.com/pictures/f2c9bd09-55da...,49.28270,-123.12410,Entire condominium,3,2.0,2.0,"[""Iron"", ""Dryer"", ""Elevator"", ""Full kitchen"", ...",117.0,9.0,"{'atm': 1, 'bank': 30, 'bar': 29, 'bicycle_ren...",422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,248014,https://www.airbnb.com/rooms/248014,"Stunning views, comfort, location!!",Gorgeous views and fabulous location. Availabl...,https://a0.muscache.com/pictures/21342517/ad2c...,49.29103,-123.05782,Entire house,3,2.0,0.0,"[""Iron"", ""Dryer"", ""Cooking basics"", ""Hot water...",100.0,10.0,"{'bar': 1, 'bicycle_rental': 1, 'cafe': 3, 'fa...",82
67,27141391,https://www.airbnb.com/rooms/27141391,Home sweet home! Lucky Owl welcomes you. 小栖驿站欢迎您。,紧邻高尔夫球场，位处家庭居住区，安全舒适温馨。交通便利，有大量免费停车位，步行可到多条公交线...,https://a0.muscache.com/pictures/a2b8e110-c14e...,49.21700,-123.10268,Private room in house,3,2.0,2.0,"[""Heating"", ""Essentials"", ""Kitchen"", ""Fire ext...",99.0,10.0,"{'bank': 1, 'cafe': 4, 'fast_food': 2, 'pharma...",77
68,35572515,https://www.airbnb.com/rooms/35572515,Private Residential Retreat in Vancouver,Quiet private unit with view of the mountains ...,https://a0.muscache.com/pictures/cf86513a-a65d...,49.25794,-123.03547,Entire guest suite,3,2.0,2.0,"[""Iron"", ""Dryer"", ""Cooking basics"", ""Hot water...",129.0,10.0,"{'bank': 2, 'cafe': 4, 'fast_food': 3, 'fuel':...",76
69,17944736,https://www.airbnb.com/rooms/17944736,"Clean, Comfortable, Central Location.",Welcome to a very comfortable and clean suite ...,https://a0.muscache.com/pictures/4192ea71-1524...,49.23516,-123.07951,Entire guest suite,3,2.0,2.0,"[""Iron"", ""Dryer"", ""Bed linens"", ""Cooking basic...",65.0,10.0,"{'cafe': 3, 'fast_food': 3, 'fuel': 2, 'pub': ...",61
