In [65]:
import numpy as np
import pandas as pd
import re
import geopy.distance
pd.pandas.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('gmaps_scrape_data_cleaned.csv')
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,name,stars,count_reviews,coordinate
0,Central Park Jakarta,4.7,96897,"-6.1773686,106.7880974,17"
1,Ciplaz Cengkareng,4.6,2441,"-6.1530098,106.7256653,17"
2,CityWalk Gajah Mada,4.5,2978,"-6.1453373,106.8125703,17"
3,Green Sedayu Mall,4.6,3745,"-6.1388522,106.7261465,17"
4,HubLife Taman Anggrek,4.6,2225,"-6.208428,106.7824432,15"


In [5]:
df['latitude'] = df['coordinate'].apply(lambda x: x.split(',')[0])
df['longitude'] = df['coordinate'].apply(lambda x: x.split(',')[1])
df.head()

Unnamed: 0,name,stars,count_reviews,coordinate,latitude,longitude
0,Central Park Jakarta,4.7,96897,"-6.1773686,106.7880974,17",-6.1773686,106.7880974
1,Ciplaz Cengkareng,4.6,2441,"-6.1530098,106.7256653,17",-6.1530098,106.7256653
2,CityWalk Gajah Mada,4.5,2978,"-6.1453373,106.8125703,17",-6.1453373,106.8125703
3,Green Sedayu Mall,4.6,3745,"-6.1388522,106.7261465,17",-6.1388522,106.7261465
4,HubLife Taman Anggrek,4.6,2225,"-6.208428,106.7824432,15",-6.208428,106.7824432


In [11]:
for name in df['name']:
    print(name)

Central Park Jakarta
Ciplaz Cengkareng
CityWalk Gajah Mada
Green Sedayu Mall
HubLife Taman Anggrek
Lindeteves Trade Centre
Lippo Mall Puri
Mall Ciputra Jakarta
Mall Puri Indah
Mall Taman Anggrek
Mall Taman Palem
Neo Soho Podomoro City
Plaza Slipi Jaya
Seasons City
Citywalk Sudirman
FX Sudirman
Gajah Mada Plaza
Grand Indonesia Shopping Town
Green Pramuka Square
ITC Cempaka Mas
ITC Mangga Dua
ITC Roxy Mas
Mangga Dua Mall
Mangga Dua Square
Pasar Pagi Mangga Dua
Plaza Atrium
Plaza Indonesia
Plaza Senayan
Ratu Plaza
Sarinah
Senayan City
Senayan Park
Thamrin City
Aeon Mall Tanjung Barat
Astha District 8
Blok M Plaza
Blok M Square
Cilandak Town Square
Epiwalk
Gandaria City
Grand ITC Permata Hijau
ITC Cipulir
ITC Fatmawati
Kalibata City Square
Kota Kasablanka
Kuningan City
Lippo Mall Kemang
Lotte Shopping Avenue
Mall Ambasador
Mall Metro Cipulir
One Belpark Mall
Pacific Place
Pasaraya Blok M
Pejaten Village
Plaza Festival Mall Kuningan
Plaza Kalibata
Plaza Semanggi
Poins Square
Pondok Indah Ma

Let's categorize the facilities into malls, stations, bus stations, and hospitals

In [13]:
df['category'] = None
df['category'][:84] = 'mall'
df['category'][84:135] = 'train_station'
df['category'][135:349] = 'bus_station'
df['category'][349:] = 'hospital'
df['category'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'][:84] = 'mall'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'][84:135] = 'train_station'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'][135:349] = 'bus_station'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'][349:] = 'hospital'


category
bus_station      214
hospital         170
mall              84
train_station     51
Name: count, dtype: int64

In [14]:
# count facilities mall, hospital
# max stars mall, hospital
# sum count reviews mall, hospital
# count facilities bus, train stations
# sum count reviews bus, train stations

In [52]:
# https://stackoverflow.com/questions/1253499/simple-calculations-for-working-with-lat-lon-and-km-distance
# Latitude: 1 deg = 110.574 km
# Longitude: 1 deg = 111.320*cos(latitude) km
def get_facilities_near(lat, long, radius_km, df_map):
    """
    Dependencies: numpy as np, pandas as pd, geopy.distance
    Input: Latitude (float), Longitude (float), Radius in km (float)
    Output: Columns of facility features, that is:
        1. Count Mall facilities near lat and long
        2. Max stars of the Mall facilities near lat and long
        3. Sum of count reviews of the Mall facilities near lat and long
        4. Count Hospital facilities near lat and long
        5. Max stars of the Hospital facilities near lat and long
        6. Sum of count reviews of the Hospital facilities near lat and long
        7. Count Bus Station facilities near lat and long
        9. Sum of count reviews of the Bus Station facilities near lat and long
        8. Max stars of the Train Station facilities near lat and long
        10. Sum of count reviews of the Train Station facilities near lat and long
    """
    d_lat = radius_km/110 # let's use upper bound of 110000
    d_long = radius_km/111 # let's assume upper bound of cos(0) and 111000
    df = df_map.copy(deep=True)
    df = df[ (df['latitude'].between(lat-d_lat, lat+d_lat)) & (df['longitude'].between(long-d_long, long+d_long)) ] # bbox
    try:
        df['distance'] = df.apply(lambda x: geopy.distance.distance((lat, long) , (x['latitude'], x['longitude'])).km, axis=1)
    except:
        df['distance'] = np.nan
    df = df[ df['distance'] <= radius_km ]
    return [
        df[ df['category'] == 'mall']['name'].nunique()
        , df[ df['category'] == 'mall']['stars'].max()
        , df[ df['category'] == 'mall']['count_reviews'].sum()
        , df[ df['category'] == 'hospital']['name'].nunique()
        , df[ df['category'] == 'hospital']['stars'].max()
        , df[ df['category'] == 'hospital']['count_reviews'].sum()
        , df[ df['category'] == 'bus_station']['name'].nunique()
        , df[ df['category'] == 'bus_station']['count_reviews'].sum()
        , df[ df['category'] == 'train_station']['name'].nunique()
        , df[ df['category'] == 'train_station']['count_reviews'].sum()
    ]

In [54]:
df_lamudi = pd.read_csv('lamudi_njop_flood_dataset.csv')

In [55]:
geo_point_s = sorted(df_lamudi['data_geo_point_s'].astype('str').unique())
print('Addresses count:', len(geo_point_s))
count_mall = []
max_stars_mall = []
sum_reviews_mall = []
count_hospital = []
max_stars_hospital = []
sum_reviews_hospital = []
count_bus_st = []
sum_reviews_bus_st = []
count_train_st = []
sum_reviews_train_st = []
df_map = df
df_map[['latitude', 'longitude']] = df_map[['latitude', 'longitude']].astype('float')
counter = 0
for address in geo_point_s:
    counter += 1
    print('Now on address:', counter)
    try:
        address = eval(address)
        sol = get_facilities_near(address[1], address[0], 2, df_map)
        count_mall.append(sol[0])
        max_stars_mall.append(sol[1])
        sum_reviews_mall.append(sol[2])
        count_hospital.append(sol[3])
        max_stars_hospital.append(sol[4])
        sum_reviews_hospital.append(sol[5])
        count_bus_st.append(sol[6])
        sum_reviews_bus_st.append(sol[7])
        count_train_st.append(sol[8])
        sum_reviews_train_st.append(sol[9])
    except:
        count_mall.append(np.nan)
        max_stars_mall.append(np.nan)
        sum_reviews_mall.append(np.nan)
        count_hospital.append(np.nan)
        max_stars_hospital.append(np.nan)
        sum_reviews_hospital.append(np.nan)
        count_bus_st.append(np.nan)
        sum_reviews_bus_st.append(np.nan)
        count_train_st.append(np.nan)
        sum_reviews_train_st.append(np.nan)

Addresses count: 3449
Now on address: 1
Now on address: 2
Now on address: 3
Now on address: 4
Now on address: 5
Now on address: 6
Now on address: 7
Now on address: 8
Now on address: 9
Now on address: 10
Now on address: 11
Now on address: 12
Now on address: 13
Now on address: 14
Now on address: 15
Now on address: 16
Now on address: 17
Now on address: 18
Now on address: 19
Now on address: 20
Now on address: 21
Now on address: 22
Now on address: 23
Now on address: 24
Now on address: 25
Now on address: 26
Now on address: 27
Now on address: 28
Now on address: 29
Now on address: 30
Now on address: 31
Now on address: 32
Now on address: 33
Now on address: 34
Now on address: 35
Now on address: 36
Now on address: 37
Now on address: 38
Now on address: 39
Now on address: 40
Now on address: 41
Now on address: 42
Now on address: 43
Now on address: 44
Now on address: 45
Now on address: 46
Now on address: 47
Now on address: 48
Now on address: 49
Now on address: 50
Now on address: 51
Now on address: 52

In [62]:
df_facilities = pd.DataFrame(data={'count_mall': count_mall
                                    , 'max_stars_mall': max_stars_mall
                                    , 'sum_reviews_mall': sum_reviews_mall
                                    , 'count_hospital': count_hospital
                                    , 'max_stars_hospital': max_stars_hospital
                                    , 'sum_reviews_hospital': sum_reviews_hospital
                                    , 'count_bus_st': count_bus_st
                                    , 'sum_reviews_bus_st': sum_reviews_bus_st
                                    , 'count_train_st': count_train_st
                                    , 'sum_reviews_train_st': sum_reviews_train_st
                                    , 'data_geo_point_s': geo_point_s})

In [63]:
df_facilities

Unnamed: 0,count_mall,max_stars_mall,sum_reviews_mall,count_hospital,max_stars_hospital,sum_reviews_hospital,count_bus_st,sum_reviews_bus_st,count_train_st,sum_reviews_train_st,data_geo_point_s
0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"[0,0]"
1,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"[1,-6.1136504]"
2,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"[1,-6.1368875]"
3,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"[1,-6.2317563]"
4,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"[1,-6.3454]"
...,...,...,...,...,...,...,...,...,...,...,...
3444,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"[40,12]"
3445,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"[50,13]"
3446,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"[89.8765,-56.351]"
3447,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"[9,-6.1886]"


In [64]:
df_lamudi['data_geo_point_s'] = df_lamudi['data_geo_point_s'].astype('str')
df_lamudi = df_lamudi.merge(df_facilities, on='data_geo_point_s', how='left')
df_lamudi

Unnamed: 0,data_price_s,data_category_s,data_subcategories_s,data_bedrooms_s,data_bathrooms_s,data_building_size_s,data_land_size_s,data_furnished_s,data_sku_s,data_geo_point_s,...,count_mall,max_stars_mall,sum_reviews_mall,count_hospital,max_stars_hospital,sum_reviews_hospital,count_bus_st,sum_reviews_bus_st,count_train_st,sum_reviews_train_st
0,2.000000e+08,house,"[""house"",""single-family-house""]",2,2.0,52.0,40.0,,HO63342840B9EB3ID,,...,,,,,,,,,,
1,1.250000e+09,house,"[""house"",""single-family-house""]",3,2.0,200.0,200.0,,HO63CE42B5E3B0AID,,...,,,,,,,,,,
2,6.910000e+08,house,"[""house"",""single-family-house""]",2,2.0,55.0,60.0,,HO62B162B0CB941ID,,...,,,,,,,,,,
3,1.200000e+09,house,"[""house"",""single-family-house""]",3,1.0,80.0,150.0,,HO5E2567A098B1DID,"[106.95499,-6.19651]",...,1.0,7293.0,429.0,0.0,,0.0,0.0,0.0,0.0,0.0
4,2.100000e+09,house,"[""house"",""single-family-house""]",6,2.0,150.0,305.0,,HO60111E9AC8A8EID,"[106.95499,-6.19651]",...,1.0,7293.0,429.0,0.0,,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79135,4.300000e+09,house,"[""house"",""single-family-house""]",3,2.0,245.0,126.0,,HO6364C8B7A3C09ID,"[106.865237,-6.137171]",...,2.0,4.5,12280.0,4.0,4.6,4656.0,3.0,110.0,0.0,0.0
79136,2.900000e+10,house,"[""house"",""single-family-house""]",6,4.0,1200.0,1040.0,,HO63D662483CD7CID,"[106.865237,-6.137171]",...,2.0,4.5,12280.0,4.0,4.6,4656.0,3.0,110.0,0.0,0.0
79137,3.300000e+09,house,"[""house"",""single-family-house""]",3,3.0,200.0,96.0,,HO643FB884DFB95ID,"[106.865237,-6.137171]",...,2.0,4.5,12280.0,4.0,4.6,4656.0,3.0,110.0,0.0,0.0
79138,2.500000e+09,house,"[""house"",""single-family-house""]",4,3.0,110.0,93.0,,HO642FE87DB55C9ID,"[106.865237,-6.137171]",...,2.0,4.5,12280.0,4.0,4.6,4656.0,3.0,110.0,0.0,0.0


In [66]:
df_lamudi.to_csv('final_dataset_unfiltered.csv', index=False)