In [37]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt

def read_data():
    listings_data = pd.read_csv('listings.csv.gz')
    amenities_data = pd.read_json('amenities-vancouver.json.gz', lines=True)
    return listings_data, amenities_data


def clean_amenities_data(amenities_data, amenities_required):

    #find unique amenities and the number of them to choose which are important for a traveller
    # print(amenities_data['amenity'].value_counts())


    #adapted from : https://www.kite.com/python/answers/how-to-filter-a-pandas-dataframe-with-a-list-by-%60in%60-or-%60not-in%60-in-python
    bool_series = amenities_data.amenity.isin(amenities_required)
    filtered_amenities_df = amenities_data[bool_series]
    filtered_amenities_df=filtered_amenities_df.drop(['timestamp','tags'], axis=1).dropna() # dropping unnecessary columns, and filter out NA values
    filtered_amenities_df.reset_index(inplace=True, drop=True)

    return filtered_amenities_df


#reference: https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
def haversine_distance(lon1, lat1, lon2, lat2):
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r * 1000

def haversine_distance2(df, lon2, lat2):
    # convert decimal degrees to radians 
    lon1=np.radians(df['lon'])
    lat1=np.radians(df['lat'])
    lon2=np.radians(lon2)
    lat2=np.radians(lat2)
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = (dlat/2).apply(sin)**2 + (lat1).apply(sin) * cos(lat2) * (dlon/2).apply(sin)**2
    c = 2 * ((a).apply(sqrt).apply(asin)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r * 1000

def clean_listings_data(listings_data):
    #keep only the columns we need
    columns_needed = ['id', 'listing_url', 'name', 'description', 'picture_url', 'latitude', 'longitude', 'property_type', 'accommodates', 'bedrooms', 'beds', 'amenities', 'price']
    listings_data = listings_data[columns_needed]
    return listings_data


# #TODO: return a dictionary with number of amenities in a 1km radius of this lat and lon
def num_amenities(lat, lon, amenities_data_clean):
    distance = haversine_distance2(amenities_data_clean, lon, lat)
    amenities_data_clean['distance']=distance
    data_withinR = amenities_data_clean.loc[amenities_data_clean['distance']<1000].reset_index(drop=True)
    amenities_series=data_withinR.pivot_table(columns=['amenity'], aggfunc='size')  # Counts # of amenities, type=pd.series
    amenities_dict=amenities_series.to_dict()# converts series to dict
    return amenities_dict
#     amenities_dict = dict.fromkeys(amenities_required,0)

    


In [49]:

#Read Data
listings_data, amenities_data = read_data()

# Change amenities here (updated the "restaurant" typo)
amenities_required = ['restaurant', 'fast_food', 'cafe','bank','atm','pharmacy','bicycle_rental','fuel','pub','bar','car_sharing','car_rental','clinic','doctors','hospital','ice_cream','fountain','theatre','police','bus_station']

#Data Cleaning
amenities_data_clean = clean_amenities_data(amenities_data, amenities_required)
listings_data_clean = clean_listings_data(listings_data)

#Return a dict of amenities:
lat_input = 49.225164  # sample lat input
lon_input = -123.003742  # sample lon input
hello=num_amenities(lat_input, lon_input, amenities_data_clean)
hello

{'atm': 8,
 'bank': 16,
 'bar': 17,
 'bus_station': 1,
 'cafe': 27,
 'car_rental': 1,
 'clinic': 8,
 'fast_food': 61,
 'fountain': 3,
 'fuel': 1,
 'ice_cream': 1,
 'pharmacy': 9,
 'police': 1,
 'pub': 1,
 'restaurant': 56}

In [45]:
xx = haversine_distance2(amenities_data_clean, lon_input, lat_input)
amenities_data_clean[amenities_data_clean.isna().any(axis=1)]

Unnamed: 0,lat,lon,amenity,name,distance
8,49.269129,-123.056105,fuel,,6377.824537
15,49.193668,-123.180288,atm,,14239.757186
34,49.114494,-123.077270,fuel,,13580.918620
48,49.272918,-123.154026,car_sharing,,12899.815374
49,49.272915,-123.153977,car_sharing,,12896.194244
...,...,...,...,...,...
6886,49.138999,-122.888983,clinic,,13123.475921
6887,49.139018,-122.889496,restaurant,,13094.583522
6921,49.252447,-122.736873,fountain,,21092.423927
6922,49.251870,-122.737161,fountain,,21060.873049


In [48]:
# osm_data=amenities_data_clean.drop(['timestamp','tags'], axis=1).dropna()
amenities_data_clean

Unnamed: 0,lat,lon,amenity,name,distance
0,49.260812,-123.125736,cafe,Starbucks,10332.880832
1,49.260953,-123.125704,fast_food,Salad Loop,10336.604714
2,49.264041,-123.153407,fuel,Shell,12479.705692
3,49.126650,-123.182470,restaurant,Best Bite Indian Cuisine,17749.459983
4,49.283192,-123.109050,pub,The Cambie,10464.596293
...,...,...,...,...,...
6928,49.250408,-123.076261,restaurant,House of Dosas,6328.526158
6929,49.278424,-122.806704,cafe,Creekside Coffee,16512.808454
6930,49.278770,-122.797628,restaurant,Togo Sushi,17190.793065
6931,49.282666,-122.826978,pub,Brown's Social House,15235.282889


In [None]:
lat = osm_data['lat']
lon = osm_data['lon']
osm_data['amenity'].unique()
clinic_coord = osm_data[osm_data['amenity']=='clinic']
clinic_coord

In [None]:
np.radians(osm_data['lat']) # converts latitude to radians
np.radians(osm_data['lon']) # converts longitude to radians

In [None]:
#TODO: Add a column to listings dataset:  
# each element with a dictionary of number of amenities in a 1km radius of a listing.

# example input


lat1=49.126141
lon1=-123.183341
# 
distance = haversine_distance(lon_posR, lat_posR, lon_input, lat_input)

distance2 = haversine_distance(lon_negR, lat_negR, lon_input, lat_input)

display(distance, distance2)

In [None]:
#TODO: return a dictionary with number of amenities in a 1km radius of this lat and lon

# def num_amenities(lat, lon, amenities_data_clean, amenities_required):
#     amenities_dict = dict.fromkeys(amenities_required,0)

# lat + 1km
lat_input = 49.225164
lon_input = -123.003742
earth_radius=6371
m=(1/((2*pi/360)*earth_radius))/1000

lat_posR = lat_input+(1000*m)
lon_posR = lon_input+((1000*m)/cos(lat_input*(pi/180)))
lat_negR = lat_input-(1/6371)*(180/pi)
lon_negR = lon_input-(1/6371)*(180/pi)/cos(lat_input*pi/180)

display(lat_posR,lon_posR)

In [None]:

origin_pair=(lat_input,lon_input)
type(origin_pair)

In [None]:
from geopy import distance
print(distance.distance(origin_par,(lat1,lon1)))

In [None]:
# osm_data['newlon']=lon_input-osm_data['lon']
# osm_data['newlat']=lat_input-osm_data['lat']
# osm_data['newlat']=osm_data['newlat']/2
# osm_data['newlat']=osm_data['newlat'].apply(np.sin)
# osm_data['newlat']=osm_data['newlat']**2

In [None]:
def haversine_distance2(df, lon2, lat2):
    # convert decimal degrees to radians 
    lon1=np.radians(df['lon'])
    lat1=np.radians(df['lat'])
    lon2=np.radians(lon2)
    lat2=np.radians(lat2)
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = (dlat/2).apply(np.sin)**2 + (lat1).apply(np.sin) * cos(lat2) * (dlon/2).apply(np.sin)**2
    c = 2 * ((a).apply(sqrt).apply(asin)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r * 1000


In [None]:
distance = haversine_distance2(osm_data, lon_input, lat_input)
osm_data['distance']=distance
data_withinR = osm_data.loc[osm_data['distance']<1000].reset_index(drop=True)
data_withinR

In [None]:
num_of_amen=data_withinR.pivot_table(columns=['amenity'], aggfunc='size')  # Counts # of amenities, type=pd.series

num_of_amen.to_dict()# converts series to dict

In [None]:
amenities_dict = dict.fromkeys(amenities_required,0)

In [None]:
# #TODO: return a dictionary with number of amenities in a 1km radius of this lat and lon
def num_amenities(lat, lon, amenities_data_clean):
    distance = haversine_distance2(amenities_data_clean, lon, lat)
    osm_data['distance']=distance
    data_withinR = osm_data.loc[osm_data['distance']<1000].reset_index(drop=True)
    amenities_series=data_withinR.pivot_table(columns=['amenity'], aggfunc='size')  # Counts # of amenities, type=pd.series
    amenities_dict=amenities_series.to_dict()# converts series to dict
    return amenities_dict
#     amenities_dict = dict.fromkeys(amenities_required,0)


In [None]:
hello=num_amenities(lat_input, lon_input, amenities_data_clean, amenities_required)
display(hello)

In [None]:
osm_data
hello