# Imports

In [1]:
import re
import requests
import time
import random
import numpy as np
import pandas as pd

from urllib.parse import quote

# Constant

In [2]:
# Path of the input CSV that is loaded into `df` further down. It is an
# absolute path under /kaggle/input, so it only resolves where that directory
# exists — presumably a Kaggle kernel with the dataset attached (confirm).
DATASET_PATH = '/kaggle/input/zocdoc/df_combined.csv'

# Utils

In [3]:
def extract_address_components(address):
    """Shorten a comma-separated address to its first three components.

    Parameters
    ----------
    address : str or None
        Comma-separated address text. The caller in this notebook passes the
        ``display_name`` field of a geocoder hit, which may be missing.

    Returns
    -------
    str
        The first three comma-separated components, each stripped of
        surrounding whitespace and re-joined with ", ". Returns "N/A" when
        ``address`` is not a string or has fewer than three components.
    """
    # A missing value (e.g. None from dict.get) has no .split(); report it as
    # unavailable instead of raising AttributeError.
    if not isinstance(address, str):
        return "N/A"

    components = [part.strip() for part in address.split(',')]
    if len(components) < 3:
        return "N/A"

    # Anything after the third component is dropped.
    return ', '.join(components[:3])


def fetch_facility_name(street, state, postal_code, timeout=10):
    """Look up a facility name and short address with Nominatim's search API.

    Sends a structured search (street / state / postal code) and reads the
    first hit of the JSON response.

    Parameters
    ----------
    street : str
        Street text sent as the ``street`` query parameter.
    state : str
        Sent as the ``state`` query parameter.
    postal_code : str or int
        Sent as the ``postalcode`` query parameter (converted with ``str``).
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Defaults to 10.

    Returns
    -------
    tuple of (str, str)
        ``(name, address)``, both lower-cased. ``name`` is an empty string
        when the hit carries no name. ``address`` is the result of
        ``extract_address_components`` on the hit's ``display_name``; because
        it is lower-cased here, that helper's "N/A" sentinel appears as "n/a".
        Returns ``("N/A", "N/A")`` when the request fails, the status is not
        200, the body is not valid JSON, or there are no hits.
    """
    url = "https://nominatim.openstreetmap.org/search.php"

    # Let requests build and percent-encode the query string so that spaces or
    # special characters in any of the three inputs cannot corrupt the URL.
    params = {
        'street': street,
        'state': state,
        'postalcode': str(postal_code),
        'format': 'jsonv2',
        'limit': 1,
    }

    # NOTE(review): this is a browser User-Agent string; check it against the
    # Nominatim usage policy, which asks clients to identify themselves.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return ("N/A", "N/A")
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failures and malformed JSON are treated like any other
        # unsuccessful lookup so that one bad request does not abort a long
        # row-by-row enrichment run.
        return ("N/A", "N/A")

    if not isinstance(data, list) or not data:
        return ("N/A", "N/A")

    first_hit = data[0]
    # Either field may be absent or null; fall back to '' so .lower() and the
    # address helper always receive a string.
    name = first_hit.get('name') or ''
    address = extract_address_components(first_hit.get('display_name') or '')
    return (name.lower(), address.lower())

# Code

In [4]:
# Load the input CSV into the working frame and preview its first five rows
# (five is the default row count of DataFrame.head).
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,name,job_title,rating,street_address,address_locality,region,postal_code,is_dr,is_nurse,is_assistant
0,"dr. david bolon, md",cardiologist,4.93,"133 e 58th st, ste 1402",new york,NY,10022,1,0,0
1,"dr. anil gupta, md",cardiologist,5.0,"1314 hooper ave - ste 2b - 1314 hooper ave, st...",toms river,NJ,8753,1,0,0
2,"dr. fadi a elatat, md",cardiologist,4.69,"facv consultants pc-montclair - 127 pine st, s...",montclair,NJ,7042,1,0,0
3,"dr. stephen sherer, md",cardiologist,4.9,714 bergen blvd,ridgefield,NJ,7657,1,0,0
4,"dr. fadi a elatat, md",cardiologist,4.69,"facv consultants pc-union - 1945 morris ave, s...",union,NJ,7083,1,0,0


In [5]:
# Create the two result columns up front, filled with empty strings, so the
# enrichment loop can write into them row by row with df.at.
df = df.assign(**{
    'facility_name': '',
    'facility_address': '',
})

In [6]:
# Print column dtypes and non-null counts for the frame, including the two
# facility columns added in the previous cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249 entries, 0 to 2248
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              2249 non-null   object 
 1   job_title         2249 non-null   object 
 2   rating            2164 non-null   float64
 3   street_address    2025 non-null   object 
 4   address_locality  2025 non-null   object 
 5   region            2249 non-null   object 
 6   postal_code       2249 non-null   int64  
 7   is_dr             2249 non-null   int64  
 8   is_nurse          2249 non-null   int64  
 9   is_assistant      2249 non-null   int64  
 10  facility_name     2249 non-null   object 
 11  facility_address  2249 non-null   object 
dtypes: float64(1), int64(4), object(7)
memory usage: 211.0+ KB


In [7]:
# Show the first two rows to confirm the new facility columns are present.
df.head(2)

Unnamed: 0,name,job_title,rating,street_address,address_locality,region,postal_code,is_dr,is_nurse,is_assistant,facility_name,facility_address
0,"dr. david bolon, md",cardiologist,4.93,"133 e 58th st, ste 1402",new york,NY,10022,1,0,0,,
1,"dr. anil gupta, md",cardiologist,5.0,"1314 hooper ave - ste 2b - 1314 hooper ave, st...",toms river,NJ,8753,1,0,0,,


In [8]:
# %%time
# for row in df[:100].itertuples(index=True):
#     if pd.notna(row.street_address):
#         street_address = re.split(',|-', row.street_address)[0]
#         facility_name, facility_add = fetch_facility_name(street=street_address, state=row.region, postal_code=row.postal_code)
#         df.at[row.Index, 'facility_name'] = facility_name
#         df.at[row.Index, 'facility_address'] = facility_add
#         print(row.Index, facility_name, facility_add)
#         time.sleep(random.uniform(1, 3))

In [9]:
# %%time
# for row in df[99:1000].itertuples(index=True):
#     if pd.notna(row.street_address):
#         street_address = re.split(',|-', row.street_address)[0]
#         facility_name, facility_add = fetch_facility_name(street=street_address, state=row.region, postal_code=row.postal_code)
#         df.at[row.Index, 'facility_name'] = facility_name
#         df.at[row.Index, 'facility_address'] = facility_add
#         print(row.Index, facility_name, facility_add)
#         time.sleep(random.uniform(1, 3))

In [10]:
# %%time
# for row in df[1000:1200].itertuples(index=True):
#     if pd.notna(row.street_address):
#         street_address = re.split(',|-', row.street_address)[0]
#         facility_name, facility_add = fetch_facility_name(street=street_address, state=row.region, postal_code=row.postal_code)
#         df.at[row.Index, 'facility_name'] = facility_name
#         df.at[row.Index, 'facility_address'] = facility_add
#         print(row.Index, facility_name, facility_add)
#         time.sleep(random.uniform(1, 3))

In [11]:
# %%time
# for row in df[1200:1500].itertuples(index=True):
#     if pd.notna(row.street_address):
#         street_address = re.split(',|-', row.street_address)[0]
#         facility_name, facility_add = fetch_facility_name(street=street_address, state=row.region, postal_code=row.postal_code)
#         df.at[row.Index, 'facility_name'] = facility_name
#         df.at[row.Index, 'facility_address'] = facility_add
#         print(row.Index, facility_name, facility_add)
#         time.sleep(random.uniform(1, 3))

In [12]:
%%time
# Fill facility_name / facility_address for rows from position 1500 onward.
# Earlier slices are handled by the commented-out cells above, so with those
# cells disabled a fresh run leaves rows before 1500 with empty strings.
for row in df[1500:].itertuples(index=True):
    # Rows without a street address are skipped and keep their initial values.
    if pd.notna(row.street_address):
        # Query with only the text before the first comma or hyphen.
        street_address = re.split(',|-', row.street_address)[0]
        # postal_code is stored as an integer, which drops the leading zero of
        # ZIP codes such as 08753 (seen as 8753 in the data); restore the
        # five-digit form before sending it to the geocoder.
        postal_code = str(row.postal_code).zfill(5)
        facility_name, facility_add = fetch_facility_name(street=street_address, state=row.region, postal_code=postal_code)
        df.at[row.Index, 'facility_name'] = facility_name
        df.at[row.Index, 'facility_address'] = facility_add
        print(row.Index, facility_name, facility_add)
        # Random 1-3 second pause between requests to avoid hammering the service.
        time.sleep(random.uniform(1, 3))

1500  630, broad street, ridgewood junction
1501  630, broad street, ridgewood junction
1502 N/A N/A
1503 N/A N/A
1504 N/A N/A
1508 princess road princess road, lawrence township, mercer county
1509 cranbury road cranbury road, arrowhead park, arrowhead village
1510 spectrum therapeutics of nj spectrum therapeutics of nj, 401, paterson-hamburg turnpike
1511  370, grand avenue, davisville
1512  300, perrine road, cedarview estates
1514 hawks bridge road hawks bridge road, cedar crest manor, carneys point township
1515  123, franklin corner road, franklin corner
1516  127, pine street, closter
1517 N/A N/A
1518 union union, morris avenue, union
1519  620, cranbury road, gillilandtown
1520  1503, east saint georges avenue, roselle
1522 hazlet hazlet, holmdel road, mechanicsville
1523  8901, john f. kennedy boulevard, hudson heights
1524  50, mount prospect avenue, athenia
1525 2 journal square plaza 2 journal square plaza, 2, john f. kennedy boulevard
1526  784, franklin avenue, nutley
15

In [13]:
# Write the enriched frame to a CSV in the current working directory;
# index=False leaves the row index out of the file.
df.to_csv('df_combined_facility.csv', index=False)