### Purpose: Automatically extract links and emails from Google Search
1. Search google for links (L.1)
2. Search all of those links (L.1) for emails
3. Search all of those links (L.1) for links (L.2)
4. Search all of those link (L.2) for emails
5. Save all links and emails into a csv file such that all of the phone numbers and emails are aligned with the links they were scraped from

Note: add support for US data: https://pypi.org/project/us/

In [1]:
from googlesearch import search
import requests
from bs4 import BeautifulSoup
import re
from pandas import DataFrame
import json
from itertools import zip_longest
import os
import urllib

In [2]:
email_re1 = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}")
email_re2 = re.compile(r"""mailto:(.+?)[\s?'"]""")
phone_re1 = re.compile(r"(?:\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}")
phone_re2 = re.compile(r"""tel:(.+?)[\s?'"]""")
excluded_links = {
    ".pdf",
    ".png",
    "file",
    "mailto:",
    "cdc.gov",
    "hhs.gov",
    "nih.gov",
    "freeclinics.com",
    "medicaid.gov",
    "freeclinicdirectory.org"
}
excluded_emails = {
    "support@freeclinics.com",
    ".js",
    "support@freedentalcare.us",
    ".png"
}

In [3]:
def scrape_url(url):
    try:
        # Open URL.
        r = requests.get(url)
        print(url, "opened.")
        
        # Scrape Emails.
        emails = {
            email for email in email_re1.findall(r.text)
            if len(email) < 35 and
            not any(excluded.lower() in email.lower() for excluded in excluded_emails)
        }
        emails.update({
            email for email in email_re2.findall(r.text)
            if len(email) < 35 and
            not any(excluded.lower() in email.lower() for excluded in excluded_emails)
        })
        #This is necessary to ensure that the emails, links, and phone numbers are lined up
        if len(emails) == 0:
            emails = {""}
        
        # Scrape phone numbers.
        phones = {"".join(list(filter(str.isdigit, phone))) for phone in phone_re1.findall(r.text)
                  if len("".join(list(filter(str.isdigit, phone))))>9}
        phones.update({
            "".join(list(filter(str.isdigit, phone))) for phone in phone_re2.findall(r.text) 
            if len("".join(list((filter(str.isdigit, phone)))))>9
        })
        #This is necessary to ensure that the emails, links, and phone numbers are lined up
        if len(phones) == 0:
            phones = {""}
        
        # Scrape next layer links.
        links = set()
        soup = BeautifulSoup(r.text, "html.parser")
        for link in soup.find_all("a"):
            l = link.get("href")
            if len(l) > 10:
                if "tel:" in l.lower():
                    phone = "".join(list(filter(str.isdigit, l.replace("tel:", ""))))
                    if len(phone)>9:
                        phones.add()
                elif "mailto:" in l.lower():
                    emails.add(l.lower().replace("mailto:", ""))
                if not any(excluded_l.lower() in l.lower() for excluded_l in excluded_links):
                    # If this is a sub-link.
                    if 'www.' not in l and l.lower() != url.lower():
                        links.add(urllib.parse.urljoin(url, l).lower())
                    else:
                        links.add(l.lower())
        
        return emails, phones, links
    except:
        print(url, "is unable to be opened.")
        return {''}, {''}, None

In [4]:
l = 'tel:9043201234'
phone = filter(str.isdigit, l)
phone = "".join(list(phone))
phone

'9043201234'

In [5]:
def save_csv(searchterm, dictionary):
    df = DataFrame.from_dict(dictionary)
    save = "output/" + searchterm + ".csv"
    df.to_csv(save)
    print(save, "SAVED")

In [8]:
def email_link_search(searchterm, max_results=25):
    output = {"Link": [],"Email": [],"Phone": []}
    try:
        google_results = search(searchterm, num_results=max_results)
        for google_link in google_results:
            if not any(ext in google_link for ext in excluded_links):
                emails, phones, second_links = scrape_url(google_link)

                if list(emails)[0] != "" or list(phones)[0] != "":
                    for email, phone in zip_longest(emails, phones):
                        if (email.lower() in output["Email"] and phone in output["Phone"]):
                            pass
                        else:
                            output["Link"].append(google_link)
                            #Add these without filtering to ensure that the emails and phone numbers are lined up with links
                            output["Email"].append(email)
                            output["Phone"].append(phone)

                            '''
                            if (email is not None and len(email) > 0 and
                                "?" not in email and
                                email.lower() not in output["Email"]):
                                output["Email"].append(email.lower())
                            if phone is not None and len(phone) > 0:
                                output["Phone"].append(phone)
                            '''
                if second_links:
                    for link in second_links:
                        emails, phones, _ = scrape_url(link)
                        if list(emails)[0] != "" or list(phones)[0] != "":
                            for email, phone in zip_longest(emails, phones):
                                if (email in output["Email"] and phone in output["Phone"]):
                                    pass
                                else:
                                    output["Link"].append(link)
                                    #Add these without filtering to ensure that the emails and phone numbers are lined up with links
                                    output["Email"].append(email)
                                    output["Phone"].append(phone)
                                '''
                                if (email is not None and len(email) > 0 and
                                        "?" not in email and
                                        email.lower() not in output["Email"]):
                                    output["Email"].append(email.lower())
                                if phone is not None and len(phone) > 0:
                                    output["Phone"].append(phone)
                                '''
    except:
        raise ValueError('Google Search Failed. Input new search term.')
    
    return output

In [None]:
# Sample input: free homeless clinic contact info
searchterm = input()
output = email_link_search(searchterm, max_results=2)
if not os.path.exists("output"):
    os.mkdir("output")
with open('output/searched.json', 'w') as f_out:
    searched = {}
    for k, v in output.items():
        # Easiest way to filter a list, even if it gets out of order.
        searched[k] = list(set(v))
    json.dump(searched, f_out)

# Note: I think I broke the save_csv function because the number of
# phones, emails, and links no longer match.
save_csv(searchterm, output)

iowa free clinics
https://www.fciowa.org/ opened.
https://www.fciowa.org/#comp-ky4p0vtt opened.
https://www.fciowa.org/contact opened.
https://www.fciowa.org/contact is unable to be opened.
https://www.fciowa.org/donors opened.
https://www.facebook.com/fciowa opened.
https://www.fciowa.org/volunteer opened.
https://www.fciowa.org/olddonate opened.
https://www.fciowa.org/olddonate is unable to be opened.
https://www.fciowa.org/board-of-directors opened.
https://www.fciowa.org/#comp-jww4y2um opened.
https://www.fciowa.org/clinic-details opened.


In [None]:
output