### Purpose: Automatically extract links and emails from Google Search
1. Search Google for links (L.1)
2. Search all of those links (L.1) for emails
3. Search all of those links (L.1) for further links (L.2)
4. Search all of those links (L.2) for emails
5. Save all links and emails into a CSV file

Note: add support for US data: https://pypi.org/project/us/

In [None]:
from googlesearch import search
import requests
from bs4 import BeautifulSoup
import re
# Matches e-mail addresses embedded anywhere in a page's text.
email_re = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}")
# Matches phone numbers such as "555-123-4567" or "+1 (555) 123-4567" embedded
# anywhere in a page's text. The pattern is used with ``findall`` over whole
# documents, so it must not be anchored with ^/$, and the optional country-code
# prefix is a non-capturing group so ``findall`` returns the full number rather
# than just the prefix. The digit lookarounds stop it from matching inside a
# longer run of digits.
phone_re = re.compile(r"(?<!\d)(?:\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}(?!\d)")
from pandas import DataFrame
import json
from itertools import zip_longest

In [None]:
def scrape_url(url):
    """Fetch ``url`` and collect the e-mails, phone numbers and links on the page.

    Args:
        url: Address of the page to download.

    Returns:
        A tuple ``(emails, phones, links)``.

        * ``emails`` / ``phones`` are sets of strings. When nothing was found
          they are ``{''}`` so callers always receive a non-empty set.
        * ``links`` is a set of absolute URLs built from the page's
          ``<a href>`` attributes.

        If the page cannot be fetched or processed, the result is
        ``({''}, {''}, None)``.
    """
    # Imported locally so the function does not depend on a module-level import.
    from urllib.parse import urljoin

    # An href containing any of these substrings is never followed.
    excluded = ('file', 'mailto:', 'who.int', 'hhs.gov', 'hrsa.gov')

    try:
        #Open URL (the timeout keeps one dead server from hanging the crawl)
        r = requests.get(url, timeout=10)
        print(url,'opened')

        #Scrape Emails (matches of 35+ characters are discarded)
        emails = {email for email in email_re.findall(r.text) if len(email)<35}

        #Scrape Phone Numbers
        phones = set(phone_re.findall(r.text))

        #Scrape Next layer links
        links = set()
        soup = BeautifulSoup(r.text, 'html.parser')
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            # <a> tags without an href yield None; short hrefs are ignored.
            if not href or len(href) <= 10:
                continue
            if href.startswith('tel:'):
                # A tel: link is a phone number, not a page to crawl.
                phones.add(href[4:])
                continue
            if href.startswith(('#', 'javascript:')):
                # In-page anchors and script pseudo-links lead to no new page.
                continue
            if any(token in href for token in excluded):
                continue
            # urljoin resolves relative hrefs against the page URL and leaves
            # absolute URLs untouched.
            links.add(urljoin(url, href))

        # Apply the "nothing found" placeholders last, so a placeholder never
        # sits next to real values picked up from tel: links.
        if not emails:
            emails = {''}
        if not phones:
            phones = {''}

        return emails, phones, links
    except Exception:
        # Best-effort scraping: report the failure and carry on with the next
        # URL. KeyboardInterrupt is deliberately not caught.
        print(url,"is unable to be opened.")
        return {''}, {''}, None

In [None]:
def save_csv(searchterm, dictionary):
    """Write ``dictionary`` to ``output/<searchterm>.csv``.

    Args:
        searchterm: Search term, used as the base name of the CSV file.
        dictionary: Mapping of column name to list of values; converted with
            ``DataFrame.from_dict``.
    """
    # Imported locally so the function does not depend on a module-level import.
    import os

    # Create the target directory on first use instead of failing in to_csv.
    os.makedirs('output', exist_ok=True)

    df = DataFrame.from_dict(dictionary)
    save = 'output/' + searchterm + '.csv'
    df.to_csv(save)
    print(save,'SAVED')

In [None]:
def _record_contacts(output, link, emails, phones, announce=False):
    """Append one row per e-mail/phone pair found on ``link`` to ``output``."""
    # Drop the '' placeholders used to signal "nothing found".
    found_emails = [email for email in emails if email]
    found_phones = [phone for phone in phones if phone]
    # zip_longest pads the shorter side with None, so a page with only e-mails
    # or only phone numbers is still recorded; with neither, nothing is added.
    for email, phone in zip_longest(found_emails, found_phones):
        output["Link"].append(link)
        output["Email"].append(email)
        output["Phone"].append(phone)
        if announce:
            print(link,'appended')
            print(phone,'appended')
            print(email,'appended')


def email_link_search(searchterm, max_results=2):
    """Search Google for ``searchterm`` and scrape contacts two layers deep.

    Each search result is scraped for e-mails and phone numbers, and so is
    every link found on those result pages.

    Args:
        searchterm: Query passed to ``search``.
        max_results: Number of search results requested.

    Returns:
        A dict with the keys ``"Link"``, ``"Email"`` and ``"Phone"``, each a
        list of equal length. When a page yields more e-mails than phone
        numbers (or vice versa), the shorter column is padded with ``None``.

    Raises:
        ValueError: If the Google search itself fails.
    """
    output = {"Link":[],"Email":[],"Phone":[]}
    try:
        # Consume the results here in case search() yields them lazily, so a
        # search failure is reported by this handler and nothing else is.
        google_results = list(search(searchterm, num_results=max_results))
    except Exception as exc:
        raise ValueError('Google Search Failed. Input new search term.') from exc

    for google_link in google_results:
        emails, phones, second_links = scrape_url(google_link)
        print('first pass',emails, phones, second_links)
        _record_contacts(output, google_link, emails, phones)

        # scrape_url returns None for links when the page could not be opened.
        for link in second_links or ():
            emails, phones, _ = scrape_url(link)
            print('second layer',emails, phones)
            _record_contacts(output, link, emails, phones, announce=True)

    return output

In [None]:
# Read the search term interactively and run the two-layer scrape.
searchterm = input()
output = email_link_search(searchterm)

# Dump a de-duplicated copy of every column to JSON. Each column is
# de-duplicated on its own via a set, so ordering (and alignment between
# columns) is not preserved in this file.
with open('searched.json', 'w') as f_out:
    searched = {column: list(set(values)) for column, values in output.items()}
    json.dump(searched, f_out)

# The CSV keeps the full, row-aligned results.
save_csv(searchterm, output)

In [None]:
# Bare expression: in a notebook cell this displays the collected results dict.
output