In [127]:
import re
import time
import random
import pandas as pd

from urlextract import URLExtract
import tldextract

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

In [66]:
link = 'https://iafdb.travel.state.gov/DefaultForm.aspx'

#### Initiate

In [67]:
driver = webdriver.Firefox(executable_path='/Users/vrai/Downloads/geckodriver')
driver.get(link)

In [65]:
driver.close()

#### Fill out initial form

In [68]:
search_by = driver.find_element_by_id("rdoZipCode")
search_by.click()
driver.implicitly_wait(10)

In [69]:
zip_code = driver.find_element_by_id("txtZipCode")
zip_code.send_keys('33132')
driver.implicitly_wait(10)

In [70]:
closest = driver.find_element_by_id("rdoNumClosest")
closest.click()
driver.implicitly_wait(10)

In [71]:
Select(driver.find_element_by_id("ddnNumClosest")).select_by_value('250')
driver.implicitly_wait(10)

In [72]:
send = driver.find_element_by_id("btnSearch")
send.click()
driver.implicitly_wait(10)

#### Get JS link for all facilities

In [73]:
all_facilities = []
distances = []

In [74]:
for i in range(0,13):
    table_id = driver.find_element(By.ID, 'dgFacilityList')
    facilities = [f.get_attribute('href') for f in table_id.find_elements_by_tag_name("a") if "PostBack" not in f.get_attribute('href')]
    all_facilities.extend(facilities)
    
    for row in table_id.find_elements(By.TAG_NAME, 'tr'):
        val = row.find_elements(By.TAG_NAME, 'td')[-1].text.strip()
        if val.isdigit():
            distances.append(val)
    
    driver.execute_script("javascript:__doPostBack('dgFacilityList$ctl24$ctl01','')") # Next Page
    time.sleep(10)

In [75]:
print(len(all_facilities))
len(set(all_facilities)) == len(all_facilities)

250


True

In [76]:
print(len(distances))
len(set(distances)) == len(distances)

250


False

#### Get info for all facilities

In [15]:
facilities_info = []

In [32]:
for f in all_facilities:
    facility = []
    
    driver.execute_script(f)
    
    facility.append(driver.find_element_by_id("tcFacilityName").text)
    facility.append(driver.find_element_by_id("tcFacilityType").text)
    facility.append(driver.find_element_by_id("tcFacilityAddress").text)
    facility.append(driver.find_element_by_id("tcPublicPhone").text)

    facility.append(driver.find_element_by_id("tblConveniences").find_elements(By.TAG_NAME, "td")[3].text)
    facility.append(driver.find_element_by_id("tblHours").find_elements(By.TAG_NAME, "td")[-1].text)
    
    facilities_info.append(facility)
    
    back = driver.find_element_by_id("hlResultsBottom")
    back.click()
    driver.implicitly_wait(10)

In [83]:
driver.close()

### Create a data frame

In [80]:
df = pd.DataFrame(facilities_info)
df.columns = ['Name', 'Type', 'Address', 'Phone', 'PhotoOnSite', 'Comments']
df['Distance'] = distances
df.sample(5)

Unnamed: 0,Name,Type,Address,Phone,PhotoOnSite,Comments,Distance
238,BRANTLEY COUNTY SUPERIOR COURT,Court,"234 BRANTLEY STREET NAHUNTA, GA 31553",9124625635,No,Appointments are required please call (912)462...,387
125,OSCEOLA COUNTY CLERK OF COURT - MAIN OFFICE,Court,"2 COURTHOUSE SQ KISSIMMEE, FL 34741",4077423530,No,By Appointment Only - Please call 407-742-3530...,190
28,CITY OF MIRAMAR,Municipal,"2300 CIVIC CENTER PL MIRAMAR, FL 33025",9546023011,Yes,***BY APPOINTMENT ONLY*** To schedule an appoi...,15
73,COLLIER COUNTY CLERK OF COURT - GOLDEN GATE ANNEX,Court,"4715 GOLDEN GATE PKWY NAPLES, FL 34116",2392522750,No,Appointment Not Required | Walk-In,98
72,COLLIER COUNTY CLERK OF CIRCUIT COURT - MAIN O...,Court,"3315 E TAMIAMI TRL E NAPLES, FL 34112",2392527242,No,Appointments please call (239)252-7242 for fur...,98


#### ETL & EDA

In [82]:
df.Type.value_counts()

Postal            127
Court              87
Municipal          17
Library            11
School              5
County / State      3
Name: Type, dtype: int64

In [84]:
df.PhotoOnSite.value_counts()

Yes    189
No      61
Name: PhotoOnSite, dtype: int64

In [130]:
def url_normalizer(url):
    if url is None:
        return url
    
    url = url.lower()
    
    if "usps.com" in url:
        return "https://usps.com/scheduler"
    
    url = url.replace("http://", "https://")
    url = url.replace("www.","")
    url = url.strip(".")
    url = url.strip("/")
    
    if not url.startswith("https://"):
        url = f"https://{url}"
    
    return url
        

df['url'] = df.Comments.apply(lambda val: extractor.find_urls(val))
df['url'] = df.url.apply(lambda val: None if len(val)==0 else val[0])
df['url'] = df.url.apply(lambda val: url_normalizer(val))

df['domain'] = df.url.apply(lambda link: None if link is None else tldextract.extract(link).domain)

In [142]:
df.domain.value_counts().head(12)

usps                 121
myorangeclerk          4
mypalmbeachclerk       3
lakecountyclerk        3
mypinellasclerk        2
browardclerk           2
stlucieclerk           2
hernandoclerk          2
clayclerk              2
citrusclerk            2
duvalclerk             2
martincountyclerk      1
Name: domain, dtype: int64

In [155]:
def normalize_phone(phone):
    if phone is None:
        return None
    
    phone = re.sub("[^0-9]", "", phone)
    phone = f"{phone[:3]}-{phone[3:-4]}-{phone[-4:]}"
    return phone

df['phone'] = df.Comments.apply(lambda val: re.findall(r"\(?\b[2-9][0-9]{2}\)?[-. ]?[2-9][0-9]{2}[-. ]?[0-9]{4}\b",val))
df['phone'] = df.phone.apply(lambda val: None if len(val)==0 else val[0])
df['phone'] = df.phone.apply(lambda val: normalize_phone(val))

In [157]:
df.phone.value_counts()

855-864-3379    4
863-534-4000    3
941-861-7436    2
352-540-6768    2
941-741-4022    1
               ..
954-831-2311    1
912-427-5930    1
386-345-3574    1
561-624-6650    1
305-295-5000    1
Name: phone, Length: 69, dtype: int64

In [168]:
def comment_normalizer(comment):
    comment = comment.lower()
    
    urls = extractor.find_urls(comment)
    for url in urls:
        comment = comment.replace(url, " ")
    
    phones = re.findall(r"\(?\b[2-9][0-9]{2}\)?[-. ]?[2-9][0-9]{2}[-. ]?[0-9]{4}\b",comment)
    for phone in phones:
        comment = comment.replace(phone, " ")
        
    comment = re.sub("[^a-zA-Z\s]+", " ", comment)
    comment = comment.strip()
    
    comment = " ".join(comment.split())
    
    return comment

In [195]:
def needs_appointment(comment):
    comment = comment_normalizer(comment)
    
    no = ['not required', 'walk in', 'no appointment']
    yes = ['appointment only', 'appointment required', 
           'appointments only', 'appointments required']
    
    for n in no:
        if n in comment:
            return False
        
    for y in yes:
        if y in comment:
            return True
        
    return "Unknown"

In [196]:
df['NeedsAppointment'] = df.Comments.apply(lambda val: needs_appointment(val))

In [197]:
df.NeedsAppointment.value_counts()

True       160
False       48
Unknown     42
Name: NeedsAppointment, dtype: int64

## Best candidates

#### Let's examine USPS first

In [201]:
df_usps = df[df.domain=='usps']
df_other = df[df.domain!='usps']

In [212]:
df_usps['zip'] = df_usps.Address.apply(lambda val: val.split()[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usps['zip'] = df_usps.Address.apply(lambda val: val.split()[-1])


In [214]:
df_usps.to_csv("passport_usps.csv", index=False)

In [215]:
df_usps.tail(50)

Unnamed: 0,Name,Type,Address,Phone,PhotoOnSite,Comments,Distance,url,domain,phone,NeedsAppointment,zip
139,USPS FALKENBURG BRANCH,Postal,"10121 E ADAMO DR TAMPA, FL 33619",8136260654,Yes,"***BY APPOINTMENT ONLY*** To schedule an appointment, visit: http://www.usps.com/scheduler.",199,https://usps.com/scheduler,usps,,True,33619
140,USPS SAND LAKE BRANCH,Postal,"10450 TURKEY LAKE RD ORLANDO, FL 32819",4073510613,Yes,"***BY APPOINTMENT ONLY*** To schedule an appointment, visit: http://www.usps.com/scheduler.",200,https://usps.com/scheduler,usps,,True,32819
141,USPS DIXIE VILLAGE STATION,Postal,"2860 DELANEY AVE ORLANDO, FL 32806",4074252868,Yes,"***BY APPOINTMENT ONLY*** To schedule an appointment, visit: http://www.usps.com/scheduler.",201,https://usps.com/scheduler,usps,,True,32806
142,KATHLEEN POST OFFICE,Postal,"6136 KATHLEEN ROAD KATHLEEN, FL 33849",8638162201,Yes,"***BY APPOINTMENT ONLY*** To schedule an appointment, visit: http://www.usps.com/scheduler",203,https://usps.com/scheduler,usps,,True,33849
144,USPS DOWNTOWN ORLANDO STATION,Postal,"51 E JEFFERSON ST ORLANDO, FL 32801",4074233709,Yes,"***BY APPOINTMENT ONLY*** To schedule an appointment, visit: http://www.usps.com/scheduler.",203,https://usps.com/scheduler,usps,,True,32801
145,USPS ORLANDO MOWU,Postal,"10401 POST OFFICE BOULEVARD ORLANDO, FL 32862",4078506334,Yes,"***BY APPOINTMENT ONLY*** To schedule an appointment, visit: http://www.usps.com/scheduler.",203,https://usps.com/scheduler,usps,,True,32862
148,USPS HERNDON POST OFFICE,Postal,"821 HERNDON AVE ORLANDO, FL 32814",4078973469,Yes,"***BY APPOINTMENT ONLY*** To schedule an appointment, visit: http://www.usps.com/scheduler.",204,https://usps.com/scheduler,usps,,True,32814
149,USPS GULFWINDS STATION,Postal,"4222 22ND AVE S ST. PETERSBURG, FL 33711",7273278469,Yes,"***BY APPOINTMENT ONLY*** To schedule an appointment call 727-327-8469, visit: http://www.usps.com/scheduler.",205,https://usps.com/scheduler,usps,727-327-8469,True,33711
153,USPS MIMS POST OFFICE,Postal,"3405 KELLY RD MIMS, FL 32754",3212675122,Yes,Appointments - Website: usps.com/scheduler http://tools.usps.com,207,https://usps.com/scheduler,usps,,Unknown,32754
154,USPS GATEWAY STATION,Postal,"701 77TH AVE N ST. PETERSBURG, FL 33702",7275782802,Yes,"***BY APPOINTMENT ONLY*** To schedule an appointment, visit: http://www.usps.com/scheduler.",207,https://usps.com/scheduler,usps,,True,33702


In [218]:
df_other.head(50)

Unnamed: 0,Name,Type,Address,Phone,PhotoOnSite,Comments,Distance,url,domain,phone,NeedsAppointment
1,CITY OF MIAMI BEACH - FINANCE OFFICE,Municipal,"1755 MERIDIAN AVE MIAMI BEACH, FL 33139",3056737420,Yes,**To schedule appointments please call 305-673-7420 https://cmbappt.miamibeachfl.gov/naoa/index.jsp,3,https://cmbappt.miamibeachfl.gov/naoa/index.jsp,miamibeachfl,305-673-7420,Unknown
5,CITY OF MIAMI,Municipal,"3500 PAN AMERICAN DR MIAMI, FL 33133",3058592705,Yes,By Appointments Only Schedule Appointment visit www.miamigov.com/passport,5,https://miamigov.com/passport,miamigov,,True
6,CITY OF CORAL GABLES,Municipal,"4520 PONCE DE LEON BLVD CORAL GABLES, FL 33146",3054605351,Yes,By Appointments Only 305-460-5351 Please schedule an appointment via www.coralgables.com/passports,6,https://coralgables.com/passports,coralgables,305-460-5351,True
9,VILLAGE OF KEY BISCAYNE,Municipal,"10 VILLAGE GREEN WAY KEY BISCAYNE, FL 33149",3053658900,No,***BY APPOINTMENT ONLY*For appointments call 305-365-8900,6,,,305-365-8900,True
11,NORTH BAY VILLAGE - OFFICE OF THE VILLAGE CLERK,Municipal,"1666 KENNEDY CSWY NORTH BAY VILLAGE, FL 33141",3057567171,No,***By Appointment Only***Contact 305-756-7171 for further assistance or visit: https://northbayvillage-fl.gov,6,https://northbayvillage-fl.gov,northbayvillage-fl,305-756-7171,True
13,USPS SURFSIDE POST OFFICE,Postal,"250 95TH ST MIAMI, FL 33154",3058616054,Yes,"***BY APPOINTMENT ONLY***To schedule an appointment, visit: http://www.usp.com/international/passport.htm",7,https://usp.com/international/passport.htm,usp,,True
14,CITY OF NORTH MIAMI,Municipal,"776 NE 125TH ST NORTH MIAMI, FL 33161",3058959817,No,To schedule appointments please call(305) 895-9817 Website: https://www.northmiamifl.gov,7,https://northmiamifl.gov,northmiamifl,305-895-9817,Unknown
15,CITY OF WEST MIAMI,Municipal,"901 SW 62ND AVE WEST MIAMI, FL 33144",3052661122,Yes,By Appointments Only - For further assistance 305-266-1122,7,,,305-266-1122,True
19,NORTH MIAMI BEACH PUBLIC LIBRARY,Library,"1601 NE 164 ST NORTH MIAMI BEACH, FL 33162",3059482970,No,Appointment Only** Please call 305-948-2970 for further assistance Website: www.nmblib.com,9,https://nmblib.com,nmblib,305-948-2970,True
21,CITY OF SWEETWATER,Municipal,"512 SW 109TH AVE SWEETWATER, FL 33174",3052073343,Yes,Appointments Only: visit website: buk.app.com,10,https://buk.app.com,app,,True


In [219]:
df_other.to_csv("passport_other.csv", index=False)