In [1]:
import os
import time

import bs4
import numpy as np
import pandas as pd
import requests

from geopy.geocoders import Nominatim

# Raise the column display limit so previews of the wide POI table are not truncated.
pd.options.display.max_columns = 99

In [2]:
# NOTE(review): hard-coded absolute path on one user's machine. The relative
# path "data/poi_extended.csv" read below resolves against this directory.
os.chdir("/Users/yenchenchou/Documents/RMDS_YC/RiskScore/RMDS_COVID19_riskgenerator")

In [3]:
# Load the POI table (path is relative to the working directory set above)
# and preview the first three rows.
df = pd.read_csv("data/poi_extended.csv")
df.head(3)

Unnamed: 0,safegraph_place_id,parent_safegraph_place_id,location_name,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city_original,region,postal_code,iso_country_code,phone_number,open_hours,category_tags,area_square_feet,community,city
0,sg:077ddeecd49a4e06945b050fc6e32b05,,University Of South California Health Sciences...,,,"Colleges, Universities, and Professional Schools","Colleges, Universities, and Professional Schools",611310.0,34.059879,-118.206742,1540 Alcazar St Ste 133,Los Angeles,CA,90033,US,,"{ ""Mon"": [[""8:30"", ""17:00""]], ""Tue"": [[""8:30"",...",,11482546,Boyle Heights,Los Angeles
1,sg:15255592d706415e882fe44114cee333,,Neighborly Painting,,,Automotive Repair and Maintenance,"Automotive Body, Paint, and Interior Repair an...",811121.0,34.10949,-118.28725,2066 Hillhurst Ave,Los Angeles,CA,90027,US,13232100000.0,,,4319,Los Feliz,Los Angeles
2,sg:1f7f0d0e89db49bebe20047263ba674f,,Park Central Building,,,,,,34.047555,-118.253715,412 W 6th St,Los Angeles,CA,90014,US,,,,972,Downtown,Los Angeles


#### Things to remove
* Remove communities in Orange County
* Move some values into the correct columns

In [4]:
# Number of distinct values in "city" before any rows are removed.
# The sort does not affect the count; unique() also counts NaN if present.
len(df["city"].sort_values().unique())

174

In [6]:
def get_community_list(
    url="https://media.ocgov.com/about/infooc/links/oc/occities.asp",
    timeout=30,
):
    """Scrape community names from a web page and return them as a list.

    Every ``<a class="catch-external">`` element on the page is read, the
    prefix "City of " is removed from its text, and the literal entry
    "Orange County" is appended at the end.

    Parameters
    ----------
    url : str, optional
        Page to scrape. Defaults to the ocgov.com cities page (presumably the
        list of Orange County cities -- verify the page still has that layout).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of str
        The scraped names followed by "Orange County".

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check an
        error page would silently produce a list holding only
        "Orange County", and nothing would be excluded downstream.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    web_content = bs4.BeautifulSoup(response.text, "html.parser")
    target_content = web_content.find_all("a", {"class": "catch-external"})
    orange_county_comm = [
        row.get_text().replace("City of ", "") for row in target_content
    ]
    orange_county_comm.append("Orange County")

    return orange_county_comm


def exclude_community(df, communities=None):
    """Drop, in place, every row whose "city" is one of the given communities.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "city" column. It is modified in place.
    communities : iterable of str, optional
        City names to remove. When omitted, the notebook-level variable
        ``orange_county_comm`` is used, which keeps existing one-argument
        calls working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with matching rows removed. The index is not
        reset, so the remaining labels are no longer contiguous.
    """
    if communities is None:
        # Backward-compatible fallback to the global built by get_community_list().
        communities = orange_county_comm

    # One vectorised membership test instead of building a row Series per
    # record with df.iloc[idx] inside a Python loop.
    is_excluded = df["city"].isin(list(communities)).values
    df.drop(df.index[is_excluded], inplace=True)

    return df


In [7]:
# Order matters: exclude_community() reads the module-level name
# orange_county_comm, so it has to be assigned before the call.
orange_county_comm = get_community_list()
df = exclude_community(df)

In [8]:
# Index labels of the rows whose city is "Los Angeles County".
# These are labels, not positions: exclude_community() dropped rows without
# resetting the index, so they must be looked up with .loc, not .iloc.
df.loc[df["city"] == "Los Angeles County", :].index

Int64Index([ 1221,  3175,  3203,  3541,  3703,  3841,  5614, 11654, 12738,
            14325, 15352, 15551, 15661, 16087, 16289, 16615, 19035, 19732,
            20860, 21525, 22109, 22540, 24642, 25499, 25986, 26189, 26515,
            27357, 30469, 33779, 33895, 36437, 41543, 42799, 44442],
           dtype='int64')

In [9]:
def get_address():
    """Geocode every row of the global ``df`` whose city is "Los Angeles County".

    For each such row the query "<street_address>, <city_original>" is tried
    first. If the geocoder returns nothing, a second query
    "<location_name>, <street_address>, <postal_code>" is tried.

    Returns
    -------
    location_list : list
        One entry per matching row, in index order. Each entry is the object
        returned by ``geolocator.geocode`` (element 0 of which is the address
        text), or the string "missing" when both queries returned nothing.
    la_community_list : list
        Sorted distinct values of ``df["city"]``.

    Notes
    -----
    Reads the notebook-level ``df`` and performs network requests.
    """
    geolocator = Nominatim(user_agent = "POI_city_finder", timeout = 500)
    la_community_list = list(df["city"].sort_values().unique())
    county_rows = df.loc[df["city"] == "Los Angeles County", :]

    location_list = []
    for idx in county_rows.index:
        # idx is an index *label*. Rows were dropped earlier without a
        # reset_index, so positional .iloc[idx] would read a different row
        # (or run past the end); .loc looks the label up directly.
        address = ", ".join(list(df.loc[idx, ["street_address", "city_original"]]))
        location = geolocator.geocode(address)

        if location is None:
            fallback_fields = df.loc[
                idx, ["location_name", "street_address", "postal_code"]
            ].astype("str")
            location = geolocator.geocode(", ".join(list(fallback_fields)))

        # Keep successful results as returned by the geocoder for both
        # queries, so every non-missing entry has its address at element 0.
        # Reducing fallback hits to a bare string made a later ``[0]`` pick
        # out a single character instead of the address.
        if location is None:
            location = "missing"

        location_list.append(location)

    return location_list, la_community_list


def match_community(location, la_community_list):
    """Return the first community name that occurs in a geocoded address.

    Parameters
    ----------
    location : str, sequence, or None
        Either the address text itself, or an object whose element 0 is the
        address text (e.g. a geocoder result). ``None`` is treated as having
        no address.
    la_community_list : iterable
        Candidate community names, tested in the given order. Entries that
        are not strings (such as NaN) are skipped.

    Returns
    -------
    str
        The first candidate found as a substring of the address, or
        "missing" when none is found.
    """
    if location is None:
        return "missing"

    # A plain string is already the address; indexing it with [0] would give
    # only its first character and nothing would ever match.
    address = location if isinstance(location, str) else location[0]

    for community in la_community_list:
        if isinstance(community, str) and community in address:
            return community

    return "missing"
    
    
def get_community():
    """Return one community name per location produced by get_address().

    Each location is passed to match_community() together with the list of
    known community names; the results are returned in the same order.
    """
    locations, known_communities = get_address()
    return [
        match_community(place, known_communities)
        for place in locations
    ]


def replace_value(df):
    """Repair the city/community columns of ``df`` in place.

    Steps, in order:

    1. Remember the current "community" values of the rows whose city is
       "Los Angeles County".
    2. Rename the community "Downtown" to "Los Angeles" across the table.
    3. For the "Los Angeles County" rows, overwrite "community" with the
       names returned by get_community().
    4. For the same rows, overwrite "city" with the values remembered in
       step 1 (taken before the rename in step 2).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with "city" and "community" columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    NOTE(review): get_community() works from the notebook-level ``df``, not
    from this argument, so its result only lines up with the mask here when
    the two are the same frame.
    """
    # The mask depends only on "city", which is not touched until the last
    # assignment, so it can be computed once and reused.
    is_county_row = df["city"] == "Los Angeles County"
    city_series = df.loc[is_county_row, "community"]

    # Assign the result back rather than calling replace(..., inplace=True) on
    # the selected column: the chained in-place form warns on recent pandas
    # and leaves ``df`` unchanged when Copy-on-Write is enabled.
    df["community"] = df["community"].replace({"Downtown": "Los Angeles"})

    df.loc[is_county_row, "community"] = get_community()
    df.loc[is_county_row, "city"] = city_series

    return df

In [10]:
# replace_value() assigns into df and returns that same object, so df_new is
# a second name for df rather than a separate copy.
df_new = replace_value(df)

In [11]:
# Spot-check the result: row 2's community now reads "Los Angeles" where the
# first preview showed "Downtown".
df_new.head()

Unnamed: 0,safegraph_place_id,parent_safegraph_place_id,location_name,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city_original,region,postal_code,iso_country_code,phone_number,open_hours,category_tags,area_square_feet,community,city
0,sg:077ddeecd49a4e06945b050fc6e32b05,,University Of South California Health Sciences...,,,"Colleges, Universities, and Professional Schools","Colleges, Universities, and Professional Schools",611310.0,34.059879,-118.206742,1540 Alcazar St Ste 133,Los Angeles,CA,90033,US,,"{ ""Mon"": [[""8:30"", ""17:00""]], ""Tue"": [[""8:30"",...",,11482546,Boyle Heights,Los Angeles
1,sg:15255592d706415e882fe44114cee333,,Neighborly Painting,,,Automotive Repair and Maintenance,"Automotive Body, Paint, and Interior Repair an...",811121.0,34.10949,-118.28725,2066 Hillhurst Ave,Los Angeles,CA,90027,US,13232100000.0,,,4319,Los Feliz,Los Angeles
2,sg:1f7f0d0e89db49bebe20047263ba674f,,Park Central Building,,,,,,34.047555,-118.253715,412 W 6th St,Los Angeles,CA,90014,US,,,,972,Los Angeles,Los Angeles
3,sg:221d3c3854874d43967491e653d0595c,,Rockdale Elementary,,,Elementary and Secondary Schools,Elementary and Secondary Schools,611110.0,34.133269,-118.194519,1303 Yosemite Dr,Los Angeles,CA,90041,US,,,,2507,Eagle Rock,Los Angeles
4,sg:25177520258a44d083f2842e2e49c7f4,,Jessica's Market,,,Grocery Stores,Supermarkets and Other Grocery (except Conveni...,445110.0,34.042517,-118.182423,3867 Hammel St,Los Angeles,CA,90063,US,13232690000.0,"{ ""Mon"": [[""7:00"", ""10:00""]], ""Tue"": [[""7:00"",...",,997,City Terrace,East Los Angeles


In [12]:
# Column summary after the cleanup (row count, non-null counts, dtypes).
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44597 entries, 0 to 44888
Data columns (total 21 columns):
safegraph_place_id           44597 non-null object
parent_safegraph_place_id    4366 non-null object
location_name                44597 non-null object
safegraph_brand_ids          4576 non-null object
brands                       4576 non-null object
top_category                 42991 non-null object
sub_category                 42991 non-null object
naics_code                   42992 non-null float64
latitude                     44597 non-null float64
longitude                    44597 non-null float64
street_address               44597 non-null object
city_original                44597 non-null object
region                       44597 non-null object
postal_code                  44597 non-null int64
iso_country_code             44597 non-null object
phone_number                 33353 non-null float64
open_hours                   22477 non-null object
category_tags           

In [13]:
# df reports the same summary as df_new: both names refer to the one frame
# that replace_value() modified.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44597 entries, 0 to 44888
Data columns (total 21 columns):
safegraph_place_id           44597 non-null object
parent_safegraph_place_id    4366 non-null object
location_name                44597 non-null object
safegraph_brand_ids          4576 non-null object
brands                       4576 non-null object
top_category                 42991 non-null object
sub_category                 42991 non-null object
naics_code                   42992 non-null float64
latitude                     44597 non-null float64
longitude                    44597 non-null float64
street_address               44597 non-null object
city_original                44597 non-null object
region                       44597 non-null object
postal_code                  44597 non-null int64
iso_country_code             44597 non-null object
phone_number                 33353 non-null float64
open_hours                   22477 non-null object
category_tags           

In [14]:
# Take a copy for export so later changes to df do not affect it.
POI = df.copy()

In [15]:
# Check that the copy has the same rows and columns as df.
POI.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44597 entries, 0 to 44888
Data columns (total 21 columns):
safegraph_place_id           44597 non-null object
parent_safegraph_place_id    4366 non-null object
location_name                44597 non-null object
safegraph_brand_ids          4576 non-null object
brands                       4576 non-null object
top_category                 42991 non-null object
sub_category                 42991 non-null object
naics_code                   42992 non-null float64
latitude                     44597 non-null float64
longitude                    44597 non-null float64
street_address               44597 non-null object
city_original                44597 non-null object
region                       44597 non-null object
postal_code                  44597 non-null int64
iso_country_code             44597 non-null object
phone_number                 33353 non-null float64
open_hours                   22477 non-null object
category_tags           

In [16]:
# Show the current working directory before switching into data/.
os.getcwd()

'/Users/yenchenchou/Documents/RMDS_YC/RiskScore/RMDS_COVID19_riskgenerator'

In [17]:
# NOTE(review): second hard-coded absolute path. The working directory is
# changed only so that the relative filename in the export resolves inside data/.
os.chdir("/Users/yenchenchou/Documents/RMDS_YC/RiskScore/RMDS_COVID19_riskgenerator/data")

In [18]:
# Write the cleaned table into the current directory (data/ after the chdir).
# The index is written too (to_csv default), so the non-contiguous row labels
# become the first CSV column.
POI.to_csv("RMDS_poi_extended.csv")