In [229]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt

def read_data():
    listings_data = pd.read_csv('listings.csv.gz')
    amenities_data = pd.read_json('amenities-vancouver.json.gz', lines=True)
    return listings_data, amenities_data


def clean_amenities_data(amenities_data, amenities_required):

    #find unique amenities and the number of them to choose which are important for a traveller
    # print(amenities_data['amenity'].value_counts())

    
    #adapted from : https://www.kite.com/python/answers/how-to-filter-a-pandas-dataframe-with-a-list-by-%60in%60-or-%60not-in%60-in-python
    bool_series = amenities_data.amenity.isin(amenities_required)
    filtered_amenities_df = amenities_data[bool_series]
    filtered_amenities_df.reset_index(inplace=True, drop=True)

    return filtered_amenities_df


#reference: https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
def haversine_distance(lon1, lat1, lon2, lat2):
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r * 1000

def haversine_distance2(df, lon2, lat2):
    # convert decimal degrees to radians 
    lon1=np.radians(df['lon'])
    lat1=np.radians(df['lat'])
    lon2=np.radians(lon2)
    lat2=np.radians(lat2)
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = (dlat/2).apply(np.sin)**2 + (lat1).apply(np.sin) * cos(lat2) * (dlon/2).apply(np.sin)**2
    c = 2 * ((a).apply(sqrt).apply(asin)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r * 1000

def clean_listings_data(listings_data):
    #keep only the columns we need
    columns_needed = ['id', 'listing_url', 'name', 'description', 'picture_url', 'latitude', 'longitude', 'property_type', 'accommodates', 'bedrooms', 'beds', 'amenities', 'price']
    listings_data = listings_data[columns_needed]
    return listings_data


# #TODO: return a dictionary with number of amenities in a 1km radius of this lat and lon
def num_amenities(lat, lon, amenities_data_clean):
    distance = haversine_distance2(amenities_data_clean, lon, lat)
    osm_data['distance']=distance
    data_withinR = osm_data.loc[osm_data['distance']<1000].reset_index(drop=True)
    amenities_series=data_withinR.pivot_table(columns=['amenity'], aggfunc='size')  # Counts # of amenities, type=pd.series
    amenities_dict=amenities_series.to_dict()# converts series to dict
    return amenities_dict
#     amenities_dict = dict.fromkeys(amenities_required,0)

    


In [241]:

#Read Data
listings_data, amenities_data = read_data()

# Change amenities here 
amenities_required = ['restaurant', 'fast_food', 'cafe','bank','atm','pharmacy','bicycle_rental','fuel','pub','bar','car_sharing','car_rental','clinic','doctors','hospital','ice_cream','fountain','theatre','police','bus_station']

#Data Cleaning
amenities_data_clean = clean_amenities_data(amenities_data, amenities_required)
listings_data_clean = clean_listings_data(listings_data)

#Return a dict of amenities:
lat_input = 49.225164  # sample lat input
lon_input = -123.003742  # sample lon input
hello=num_amenities(lat_input, lon_input, amenities_data_clean, amenities_required)



In [231]:
num_amenities(49, -123, amenities_data_clean, amenities_required)

In [232]:
osm_data=amenities_data_clean.drop(['timestamp','tags'], axis=1).dropna()

In [35]:
lat = osm_data['lat']
lon = osm_data['lon']
osm_data['amenity'].unique()
clinic_coord = osm_data[osm_data['amenity']=='clinic']
clinic_coord

Unnamed: 0,lat,lon,amenity,name
56,49.225164,-123.003742,clinic,J-Von Medical Centre
199,49.126141,-123.183341,clinic,Empower Physiotherapy Clinic
338,49.165260,-122.661622,clinic,Medical
339,49.165260,-122.661622,clinic,Redwood Medical Clinic
417,49.185445,-122.847307,clinic,Central City Medical Clinic
...,...,...,...,...
6612,49.049523,-122.292133,clinic,Abbotsford Community Renal Services
6790,49.108015,-122.650452,clinic,Glover Medical Centre
6850,49.262480,-123.070379,clinic,Robert and Lily Lee Family Community Health Ce...
6885,49.139961,-122.888325,clinic,Aventus


In [25]:
np.radians(osm_data['lat']) # converts latitude to radians
np.radians(osm_data['lon']) # converts longitude to radians

0      -2.148949
1      -2.148949
2      -2.149432
3      -2.149940
4      -2.148658
          ...   
6928   -2.148086
6929   -2.143381
6930   -2.143223
6931   -2.143735
6932   -2.143724
Name: lon, Length: 6566, dtype: float64

In [233]:
#TODO: Add a column to listings dataset:  
# each element with a dictionary of number of amenities in a 1km radius of a listing.

# example input


lat1=49.126141
lon1=-123.183341
# 
distance = haversine_distance(lon_posR, lat_posR, lon_input, lat_input)

distance2 = haversine_distance(lon_negR, lat_negR, lon_input, lat_input)

display(distance, distance2)

1414.1492092944136

1414.2779047650502

In [155]:
#TODO: return a dictionary with number of amenities in a 1km radius of this lat and lon

# def num_amenities(lat, lon, amenities_data_clean, amenities_required):
#     amenities_dict = dict.fromkeys(amenities_required,0)

# lat + 1km
lat_input = 49.225164
lon_input = -123.003742
earth_radius=6371
m=(1/((2*pi/360)*earth_radius))/1000

lat_posR = lat_input+(1000*m)
lon_posR = lon_input+((1000*m)/cos(lat_input*(pi/180)))
lat_negR = lat_input-(1/6371)*(180/pi)
lon_negR = lon_input-(1/6371)*(180/pi)/cos(lat_input*pi/180)

display(lat_posR,lon_posR)

49.23415721605919

-122.98997170402656

In [160]:

origin_pair=(lat_input,lon_input)
type(origin_pair)

tuple

In [161]:
from geopy import distance
print(distance.distance(origin_par,(lat1,lon1)))

ModuleNotFoundError: No module named 'geopy'

In [180]:
# osm_data['newlon']=lon_input-osm_data['lon']
# osm_data['newlat']=lat_input-osm_data['lat']
# osm_data['newlat']=osm_data['newlat']/2
# osm_data['newlat']=osm_data['newlat'].apply(np.sin)
# osm_data['newlat']=osm_data['newlat']**2

In [234]:
def haversine_distance2(df, lon2, lat2):
    # convert decimal degrees to radians 
    lon1=np.radians(df['lon'])
    lat1=np.radians(df['lat'])
    lon2=np.radians(lon2)
    lat2=np.radians(lat2)
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = (dlat/2).apply(np.sin)**2 + (lat1).apply(np.sin) * cos(lat2) * (dlon/2).apply(np.sin)**2
    c = 2 * ((a).apply(sqrt).apply(asin)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r * 1000


In [235]:
distance = haversine_distance2(osm_data, lon_input, lat_input)
osm_data['distance']=distance
data_withinR = osm_data.loc[osm_data['distance']<1000].reset_index(drop=True)
data_withinR

Unnamed: 0,lat,lon,amenity,name,distance
0,49.225164,-123.003742,clinic,J-Von Medical Centre,0.013594
1,49.227030,-123.004458,pharmacy,Rexall,214.881966
2,49.229246,-123.004694,fountain,Mermaid Fountain,459.944393
3,49.229028,-122.998951,cafe,Mr. Mustache Tea & Dessert,570.068693
4,49.221814,-122.996932,fast_food,Domino's,649.905726
...,...,...,...,...,...
195,49.225303,-123.004036,cafe,Fondway Cafe,27.690813
196,49.232176,-123.009826,pharmacy,The Medicine Shoppe,913.413939
197,49.231303,-123.005032,pharmacy,Shoppers Drug Mart,690.008969
198,49.225606,-122.996612,cafe,CoCo Fresh Tea & Juice,559.751885


In [277]:
num_of_amen=data_withinR.pivot_table(columns=['amenity'], aggfunc='size')  # Counts # of amenities, type=pd.series

num_of_amen.to_dict()# converts series to dict

{'bank': 16,
 'bar': 16,
 'bus_station': 1,
 'cafe': 27,
 'car_rental': 1,
 'clinic': 8,
 'fast_food': 61,
 'fountain': 1,
 'fuel': 1,
 'ice_cream': 1,
 'pharmacy': 9,
 'police': 1,
 'pub': 1,
 'restaurant': 56}

In [278]:
amenities_dict = dict.fromkeys(amenities_required,0)

In [286]:
# #TODO: return a dictionary with number of amenities in a 1km radius of this lat and lon
def num_amenities(lat, lon, amenities_data_clean):
    distance = haversine_distance2(amenities_data_clean, lon, lat)
    osm_data['distance']=distance
    data_withinR = osm_data.loc[osm_data['distance']<1000].reset_index(drop=True)
    amenities_series=data_withinR.pivot_table(columns=['amenity'], aggfunc='size')  # Counts # of amenities, type=pd.series
    amenities_dict=amenities_series.to_dict()# converts series to dict
    return amenities_dict
#     amenities_dict = dict.fromkeys(amenities_required,0)


In [287]:
hello=num_amenities(lat_input, lon_input, amenities_data_clean, amenities_required)
display(hello)

{'bank': 16,
 'bar': 16,
 'bus_station': 1,
 'cafe': 27,
 'car_rental': 1,
 'clinic': 8,
 'fast_food': 61,
 'fountain': 1,
 'fuel': 1,
 'ice_cream': 1,
 'pharmacy': 9,
 'police': 1,
 'pub': 1,
 'restaurant': 56}

In [288]:
osm_data
hello

{'bank': 16,
 'bar': 16,
 'bus_station': 1,
 'cafe': 27,
 'car_rental': 1,
 'clinic': 8,
 'fast_food': 61,
 'fountain': 1,
 'fuel': 1,
 'ice_cream': 1,
 'pharmacy': 9,
 'police': 1,
 'pub': 1,
 'restaurant': 56}