# Imports

In [42]:
import re
import requests
import time
import random
import numpy as np
import pandas as pd

from urllib.parse import quote

# Constant

In [2]:
# Path of the input CSV that is loaded into `df` with pd.read_csv further down.
DATASET_PATH = '/kaggle/input/zocdoc/df_combined.csv'

# Utils

In [39]:
def extract_address_components(address):
    """Return the first three comma-separated parts of an address.

    The parts are stripped of surrounding whitespace and re-joined with
    ``", "``.

    Args:
        address: Comma-separated address text. Anything that is not a
            string (for example ``None`` from a missing JSON field) is
            treated as "no address".

    Returns:
        str: ``"<part 1>, <part 2>, <part 3>"``, or ``"N/A"`` when the input
        is not a string or has fewer than three comma-separated parts.
    """
    # Guard against None / non-string input instead of raising AttributeError.
    if not isinstance(address, str):
        return "N/A"

    components = [part.strip() for part in address.split(',')]
    if len(components) < 3:
        return "N/A"

    return ', '.join(components[:3])


def fetch_facility_name(street, state, postal_code):
    """Look up an address on Nominatim and return its name and short address.

    Sends a single GET request to the Nominatim search endpoint and reads
    the first result (the query asks for ``limit=1``).

    Args:
        street: Street number and name. Surrounding whitespace is removed
            and the value is URL-encoded.
        state: Value for the ``state`` query parameter (URL-encoded).
        postal_code: Value for the ``postalcode`` query parameter. It is
            converted with ``str()``, so an integer is sent exactly as its
            decimal digits.
            NOTE(review): a ZIP code that was parsed as an integer has lost
            any leading zero before it gets here — confirm whether it
            should be zero-padded upstream.

    Returns:
        tuple[str, str]: ``(name, address)``. ``name`` is the lower-cased
        ``name`` field of the first result (``""`` when the field is
        missing or null). ``address`` is the lower-cased output of
        ``extract_address_components`` for the result's ``display_name``,
        or ``"N/A"`` when no short address can be built. Returns
        ``("N/A", "N/A")`` when the request fails, the status is not 200,
        the body is not valid JSON, or there are no results.
    """
    not_found = ("N/A", "N/A")

    # Encode every caller-supplied component so none of them can break the
    # query string; stripping avoids sending a trailing "%20".
    street_encoded = quote(str(street).strip())
    state_encoded = quote(str(state).strip())
    postal_encoded = quote(str(postal_code).strip())
    url = f"https://nominatim.openstreetmap.org/search.php?street={street_encoded}&state={state_encoded}&postalcode={postal_encoded}&format=jsonv2&limit=1"

    # NOTE(review): this is a generic browser User-Agent; check it against the
    # Nominatim usage policy, which asks for an application-identifying one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Treat network problems like "no match" so a batch loop keeps going.
        return not_found

    if response.status_code != 200:
        return not_found

    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON.
        return not_found

    if not isinstance(data, list) or not data:
        return not_found

    top_result = data[0]
    # Either field may be absent or null in the response; fall back to "".
    name = top_result.get('name') or ''
    display_name = top_result.get('display_name') or ''

    address = extract_address_components(display_name)
    # Keep the "N/A" sentinel in its canonical spelling instead of "n/a".
    if address != "N/A":
        address = address.lower()

    return (name.lower(), address)

# Code

In [23]:
# Load the dataset from DATASET_PATH and preview the first five rows.
# NOTE(review): postal_code is parsed as int64 (see the df.info() output), so
# codes that start with a zero show up shortened (e.g. 8753 on an NJ row).
# If the CSV itself still has the zeros, reading the column as str would keep
# them — TODO confirm against the raw file.
df = pd. read_csv(DATASET_PATH)
df.head(5)

Unnamed: 0,name,job_title,rating,street_address,address_locality,region,postal_code,is_dr,is_nurse,is_assistant
0,"dr. david bolon, md",cardiologist,4.93,"133 e 58th st, ste 1402",new york,NY,10022,1,0,0
1,"dr. anil gupta, md",cardiologist,5.0,"1314 hooper ave - ste 2b - 1314 hooper ave, st...",toms river,NJ,8753,1,0,0
2,"dr. fadi a elatat, md",cardiologist,4.69,"facv consultants pc-montclair - 127 pine st, s...",montclair,NJ,7042,1,0,0
3,"dr. stephen sherer, md",cardiologist,4.9,714 bergen blvd,ridgefield,NJ,7657,1,0,0
4,"dr. fadi a elatat, md",cardiologist,4.69,"facv consultants pc-union - 1945 morris ave, s...",union,NJ,7083,1,0,0


In [28]:
# Add two empty-string placeholder columns; the lookup loops below write the
# per-row results into them with df.at.
df = df.assign(facility_name='', facility_address='')

In [29]:
# Show column dtypes and non-null counts, including the two new columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249 entries, 0 to 2248
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              2249 non-null   object 
 1   job_title         2249 non-null   object 
 2   rating            2164 non-null   float64
 3   street_address    2025 non-null   object 
 4   address_locality  2025 non-null   object 
 5   region            2249 non-null   object 
 6   postal_code       2249 non-null   int64  
 7   is_dr             2249 non-null   int64  
 8   is_nurse          2249 non-null   int64  
 9   is_assistant      2249 non-null   int64  
 10  facility_name     2249 non-null   object 
 11  facility_address  2249 non-null   object 
dtypes: float64(1), int64(4), object(7)
memory usage: 211.0+ KB


In [30]:
# Preview two rows to confirm the placeholder columns are present and empty.
df.head(2)

Unnamed: 0,name,job_title,rating,street_address,address_locality,region,postal_code,is_dr,is_nurse,is_assistant,facility_name,facility_address
0,"dr. david bolon, md",cardiologist,4.93,"133 e 58th st, ste 1402",new york,NY,10022,1,0,0,,
1,"dr. anil gupta, md",cardiologist,5.0,"1314 hooper ave - ste 2b - 1314 hooper ave, st...",toms river,NJ,8753,1,0,0,,


In [44]:
%%time
for row in df[:100].itertuples(index=True):
    if pd.notna(row.street_address):
        street_address = re.split(',|-', row.street_address)[0]
        facility_name, facility_add = fetch_facility_name(street=street_address, state=row.region, postal_code=row.postal_code)
        df.at[row.Index, 'facility_name'] = facility_name
        df.at[row.Index, 'facility_address'] = facility_add
        print(row.Index, facility_name, facility_add)
        time.sleep(random.uniform(1, 3))

0 parsons smile center parsons smile center, 133, east 58th street
1  1314, hooper avenue, brick township
2 N/A N/A
3  714, bergen boulevard, ridgefield
4 N/A N/A
5  954, teaneck road, colonial village
6 N/A N/A
7 N/A N/A
8  449, mount pleasant avenue, woodland park
9  954, teaneck road, colonial village
10  2418, east york street, fishtown
11 N/A N/A
12 sheridan building sheridan building, 125, south 9th street
13 N/A N/A
14 N/A N/A
15  421, huguenot street, residence park
16 N/A N/A
18 55th street 55th street, ward 132, zone 10 kodambakkam
19 N/A N/A
20  855, valley road, montclair heights
21  529, 39th street, union city
22  740, marne highway, hainesport
23  1915, central park avenue, colonial heights
24 N/A N/A
25  55, morris avenue, haworth
26 N/A N/A
27  183, south broadway, hicksville
28  211, east 51st street, beekman
29  1963, williamsbridge road, morris park
30  158, east 84th street, manhattan community board 8
31  255, spring valley avenue, hackensack
32  3201, grand conco

In [46]:
%%time
for row in df[99:1000].itertuples(index=True):
    if pd.notna(row.street_address):
        street_address = re.split(',|-', row.street_address)[0]
        facility_name, facility_add = fetch_facility_name(street=street_address, state=row.region, postal_code=row.postal_code)
        df.at[row.Index, 'facility_name'] = facility_name
        df.at[row.Index, 'facility_address'] = facility_add
        print(row.Index, facility_name, facility_add)
        time.sleep(random.uniform(1, 3))

99  10, first street, finn's mobile home park & sales
100 N/A N/A
101 N/A N/A
102 mount sinai west mount sinai west, 1000, 10th avenue
103 N/A N/A
104  10, first street, finn's mobile home park & sales
105  954, teaneck road, colonial village
106  901, franklin avenue, village of garden city
107  23, shore cliff place, harbor hills section 2
108  499, ocean parkway, parkville
109 dos caminos dos caminos, 373, park avenue south
110  158, east 84th street, manhattan community board 8
111  68, colonial road, village of floral park
112  82, colonial road, village of floral park
113  1600, stewart avenue, village of westbury
114  1, dakota drive, town of east fishkill
115  954, teaneck road, colonial village
116 N/A N/A
117 mcdonald's mcdonald's, 160, broadway
118  158, east 84th street, manhattan community board 8
119  2, ohio drive, lake success quadrangle
120 N/A N/A
121  901, franklin avenue, village of garden city
122  213, clent road, russell gardens
123 dos caminos dos caminos, 373, 

In [47]:
# Write the frame, including the facility columns, to a CSV in the working
# directory without the index column.
# NOTE(review): the lookup cells visible here only cover rows up to index 999
# of 2249, so later rows would still hold the empty placeholders — confirm
# whether that is intended.
df.to_csv('df_combined_facility.csv', index=False)