### Purpose: Automatically extract links and emails from Google Search
1. Search google for links (L.1)
2. Search all of those links (L.1) for emails
3. Search all of those links (L.1) for links (L.2)
4. Search all of those link (L.2) for emails
5. Save all links and emails into a csv file

Note: add support for US data: https://pypi.org/project/us/

In [1]:
from googlesearch import search
import requests
from bs4 import BeautifulSoup
import re
email_re = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}")
phone_re = re.compile(r"(?:\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}")
excluded_links = {'.pdf','.png','file', 'mailto:','cdc.gov','hhs.gov','freeclinics.com','medicaid.gov','freeclinicdirectory.org'}
excluded_emails = {'support@freeclinics.com','.js','support@freedentalcare.us', '.png'}

from pandas import DataFrame
import json
from itertools import zip_longest

In [2]:
def scrape_url(url):
    try:
        #Open URL
        r = requests.get(url)
        print(url,'opened')
        
        #Scrape Emails
        emails = {email for email in email_re.findall(r.text) if len(email)<35 and not any(excluded in email for excluded in excluded_emails)}
        if len(emails) ==0:
            emails = {''}
        
        #Scrape Phone Numbers
        phones = {phone for phone in phone_re.findall(r.text)}
        if len(phones) == 0:
            phones = {''}
        
        #Scrape Next layer links
        links = set()
        soup = BeautifulSoup(r.text, 'html.parser')
        for link in soup.find_all('a'):
            l = link.get('href')
            if len(l)>10:
                if 'tel:' in l:
                    #remove the tel:
                    phones.add(l[4:])
                if not any(excluded_l in l for excluded_l in excluded_links):
                    #If this is a sub-link
                    if 'www.' not in l:
                        l = url+l[1:]
                        links.add(l)
                    else:
                        links.add(l)
        
        return emails, phones, links
    except:
        print(url,"is unable to be opened.")
        return {''}, {''}, None

In [3]:
def save_csv(searchterm, dictionary):
    df = DataFrame.from_dict(dictionary)
    save = 'output/'+searchterm+'.csv'
    df.to_csv(save)
    print(save,'SAVED')

In [4]:
def email_link_search(searchterm, max_results=25):
    output = {"Link":[],"Email":[],"Phone":[]}
    try:
        google_results = search(searchterm, num_results=max_results)
        for google_link in google_results:
            if not any(ext in google_link for ext in excluded_links):
                emails, phones, second_links = scrape_url(google_link)

                if list(emails)[0]!= '' or list(phones)[0]!='':
                    for email, phone in zip_longest(emails, phones):
                        output["Link"].append(google_link)
                        output["Email"].append(email)
                        output["Phone"].append(phone)

                if second_links:
                    for link in second_links:
                        emails, phones, _ = scrape_url(link)
                        if list(emails)[0]!= '' or list(phones)[0]!='':
                            for email, phone in zip_longest(emails, phones):
                                output["Link"].append(link)
                                output["Email"].append(email)
                                output["Phone"].append(phone) 
    except:
        raise ValueError('Google Search Failed. Input new search term.')
    
    return output

In [5]:
searchterm = input()
output = email_link_search(searchterm)
with open('output/searched.json', 'w') as f_out:
    searched = {}
    for k,v in output.items():
        #Easiest way to filter a list, even if it gets out of order.
        searched[k] = list(set(v))
    json.dump(searched, f_out)

save_csv(searchterm, output)

iowa free clinics
https://www.fciowa.org/ opened
https://www.fciowa.org/clinic-services opened
https://www.fciowa.org/ttp://rpscreativegroup.com opened
https://www.fciowa.org/donors opened
https://www.fciowa.org/annual-summaries opened
https://www.linkedin.com/company/15514298/admin/ opened
https://www.fciowa.org/board-of-directors opened
https://www.fciowa.org opened
https://www.fciowa.org/become-a-member opened
https://www.fciowa.org//#comp-jwckvjif opened
https://www.fciowa.org/ttps://twitter.com/fciowa opened
https://www.fciowa.org//#comp-ky4p0vtt opened
https://www.fciowa.org/clinic-locations opened
https://www.fciowa.org/olddonate opened
https://www.fciowa.org/olddonate is unable to be opened.
https://www.fciowa.org//#comp-jww4y2um opened
https://www.fciowa.org/faq opened
https://www.fciowa.org/faq is unable to be opened.
https://www.fciowa.org/contact opened
https://www.fciowa.org/volunteer opened
https://www.fciowa.org/clinic-details opened
https://www.facebook.com/FCIowa opene

https://benefitsexplorer.com/free-clinics/iowa/des-moinesel:+1-515-964-4600 opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesfree-clinics/iowa/des-moines/28415/mercy-beaverdale-medical-clinic opened
http://www.unitypoint.org/desmoines/clinic.aspx?id=920&Pleasant+Hill+Family+Physicians opened
http://www.unitypoint.org/desmoines/clinic.aspx?id=920&Pleasant+Hill+Family+Physicians is unable to be opened.
https://benefitsexplorer.com/free-clinics/iowa/des-moinesfree-clinics/iowa/des-moines/21430/planned-parenthood-of-the-heartland-rosenfield-center opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesfood-stamp opened
http://www.mercydesmoines.org/find-a-clinic/mercy-east-pediatric-clinic opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesprivacy-policy opened
https://benefitsexplorer.com/free-clinics/iowa/des-moineslow-income-housing/rent-assistance opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesel:+1-515-266-7622 opened
https://benef

https://benefitsexplorer.com/free-clinics/iowa/des-moinesel:+1-515-262-0404 opened
https://www.dmacc.edu/students/Pages/dental.aspx opened
https://www.dmacc.edu/students/Pages/dental.aspx is unable to be opened.
https://benefitsexplorer.com/free-clinics/iowa/des-moinesfree-mental-health-clinics opened
http://www.unitypoint.org/desmoines/clinic.aspx?id=114 opened
http://www.unitypoint.org/desmoines/clinic.aspx?id=114 is unable to be opened.
https://benefitsexplorer.com/free-clinics/iowa/des-moinesel:+1-515-266-1000 opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesttps://maps.google.com/?q=330 Laurel S, Des Moines, IA 50314 opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesel:+1-515-967-0133 opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesterm-of-use opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesttp://christthekingparish.org/free-clinic opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesfinancial-assistance/uncla

https://www.plannedparenthood.org/health-center/iowa/des-moines/50315/rosenfield-center-2386-90380 opened
https://www.plannedparenthood.org/health-center/iowa/des-moines/50315/rosenfield-center-2386-90380 is unable to be opened.
https://benefitsexplorer.com/free-clinics/iowa/des-moinesfree-clinics/iowa/des-moines/25998/unitypoint-health-adolescent-behavioral-health-services opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesfree-clinics/iowa/des-moines/28439/mercy-west-urgent-care-clive opened
http://www.mercydesmoines.org/find-a-clinic/mercy-prairie-trail-family-medicine opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesel:+1-515-266-1199 opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesel:+1-515-643-0833 opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesfree-clinics/iowa/des-moines/25999/unitypoint-family-clinic-altoona opened
https://benefitsexplorer.com/free-clinics/iowa/des-moinesel:+1-800-372-6031 opened
https://benefitsexpl

http://www.udmo.com/community_facts.htm opened
http://www.udmo.com/TheFreeClinic.htmheFreeClinic.htm opened
http://www.udmo.com/TheFreeClinic.htmttps://twitter.com/UDMO_CAA?ref_src=twsrc%5Etfw opened
http://www.udmo.com/Osceola.htm opened
https://ccfcwi.org/ opened
https://ccfcwi.org/ttp://webbsatwork.com/ opened
https://ccfcwi.org/ttps://ccfcwi.org/about-us/contact/ opened
https://ccfcwi.org/ttps://instagram.com/ccfcdodgeville opened
https://ccfcwi.org/ttps://ccfcwi.org/category/news/ opened
https://ccfcwi.org/ttps://twitter.com/CCFC_info opened
https://www.instagram.com/ccfcdodgeville/ opened
https://ccfcwi.org/ttps://ccfcwi.org/2021/11/november-volunteer-of-the-month-matt-benish-pa-c/ opened
https://ccfcwi.org/ttps://ccfcwi.org/patient-services/additional-clinics/ opened
https://ccfcwi.org/about-us/our-history/ opened
https://ccfcwi.org/ttps://ccfcwi.org/about-us/our-patients/ opened
https://ccfcwi.org/ttps://ccfcwi.org/our-board-of-directors/ opened
https://ccfcwi.org/ttps://ccfcwi

In [6]:
output

{'Link': ['https://www.fciowa.org/',
  'https://www.fciowa.org/clinic-services',
  'https://www.fciowa.org/donors',
  'https://www.fciowa.org/annual-summaries',
  'https://www.fciowa.org/board-of-directors',
  'https://www.fciowa.org',
  'https://www.fciowa.org/become-a-member',
  'https://www.fciowa.org//#comp-jwckvjif',
  'https://www.fciowa.org//#comp-ky4p0vtt',
  'https://www.fciowa.org/clinic-locations',
  'https://www.fciowa.org//#comp-jww4y2um',
  'https://www.fciowa.org/contact',
  'https://www.fciowa.org/contact',
  'https://www.fciowa.org/volunteer',
  'https://www.fciowa.org/volunteer',
  'https://www.fciowa.org/clinic-details',
  'https://www.fciowa.org/clinic-details',
  'https://www.fciowa.org/clinic-details',
  'https://www.fciowa.org/clinic-details',
  'https://www.fciowa.org/clinic-details',
  'https://www.fciowa.org/clinic-details',
  'https://www.fciowa.org/clinic-details',
  'https://www.fciowa.org/clinic-details',
  'https://www.fciowa.org/clinic-details',
  'https