<img src="workflow.png">

In [1]:
import pandas as pd
import requests
import re
import numpy as np
from bs4 import BeautifulSoup

In [2]:
# Load the list of colleges/universities; a later cell reads its
# 'College Name' column.
# NOTE(review): the path begins with a backslash, so on Windows it resolves
# against the root of the current drive rather than the notebook's folder --
# confirm this is intended.
data = pd.read_csv("\\data\\AllPrivateUniversities.csv")

In [3]:
# Plain Python list of every value in the 'College Name' column, in file order.
collegesList = [name for name in data['College Name'].values]

In [4]:
# Browser-like User-Agent header sent along with the Google search request.
USER_AGENT = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 Safari/537.36'
    ),
}

# fetch google results
def fetch_results(search_term, number_results, language_code, timeout=120):
    """Run a Google search and return the raw HTML of the result page.

    Parameters
    ----------
    search_term : str
        Free-text query.
    number_results : int
        Value sent as Google's ``num`` query parameter.
    language_code : str
        Value sent as Google's ``hl`` query parameter (e.g. ``'en'``).
    timeout : float, optional
        Seconds passed to ``requests.get`` so that a stalled connection
        cannot block the notebook forever.

    Returns
    -------
    tuple
        ``(search_term, html)`` where ``html`` is the response body text.

    Raises
    ------
    AssertionError
        If ``search_term`` is not a str or ``number_results`` is not an int.
    requests.RequestException
        On connection problems, timeouts, or a non-2xx status
        (via ``raise_for_status``).
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    assert isinstance(search_term, str), 'Search term must be a string'
    assert isinstance(number_results, int), 'Number of results must be an integer'
    # quote_plus still turns spaces into '+', and additionally percent-encodes
    # characters such as '&', '#' and '+' that would otherwise be read as URL
    # syntax and truncate or alter the query.
    escaped_search_term = quote_plus(search_term)

    google_url = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
        escaped_search_term, number_results, language_code)
    response = requests.get(google_url, headers=USER_AGENT, timeout=timeout)
    response.raise_for_status()

    return search_term, response.text

# fetch result urls from google results
def parse_results(html, keyword):
    """Extract the link of every search-result block in a Google result page.

    Parameters
    ----------
    html : str
        Raw HTML of the result page.
    keyword : str
        Unused by this function; kept so existing callers keep working.

    Returns
    -------
    list of str
        The ``href`` of the first anchor inside each ``<div class="g">``
        block, in document order. Empty when no such block has a link.
    """
    soup = BeautifulSoup(html, 'html.parser')

    found_results = []
    for result in soup.find_all('div', attrs={'class': 'g'}):
        link = result.find('a', href=True)
        # find() returns None when the block holds no anchor with an href;
        # skip it instead of failing on `.attrs`.
        if link is None:
            continue
        found_results.append(link.attrs['href'])
    return found_results

In [5]:
# Collect the first Google hit for each college's computer-science faculty page.
urls = []
for college in collegesList:
    # Colleges whose name contains 'POLYTECHNIC' are skipped.
    if 'POLYTECHNIC' in college:
        continue
    # The leading space keeps the college name separate from the query words;
    # without it the last word of the name fuses with 'computer'.
    keyword, html = fetch_results(college + ' computer science faculty', 1, 'en')
    # pull the result urls out of the google page
    url_single = parse_results(html, keyword)
    # The page may contain no parseable result block; skip the college in that
    # case instead of raising IndexError on an empty list.
    if url_single:
        urls.append(url_single[0])

In [6]:
# Regular-expression builder for e-mail addresses.
# Both the plain form (abc@xyz.com) and the obfuscated form
# (abc[AT]xyz[DOT]com) are recognised.

before_at = r'([\w\.-])+'              # local part: word chars, dots, hyphens
at_str = r'(@|(\[(AT|at)\]))'          # '@' or '[AT]' / '[at]'
after_at = r'[A-Za-z0-9-]+'            # one domain label; hyphens are allowed
dot = r'(\.|(\[(DOT|dot)\]))'          # '.' or '[DOT]' / '[dot]'
after_dot = r'[a-zA-Z]+'               # final label (TLD): letters only
# domain = label, any number of further ".label" groups, then ".tld".
# Allowing any number of intermediate labels keeps addresses such as
# name@cs.univ.ac.in intact instead of cutting them after the third label.
reg_exp_pattern = (
    before_at + at_str + after_at
    + r'(' + dot + after_at + r')*'
    + dot + after_dot
)
# Maps each scraped URL to the list of e-mail addresses found on it.
url_emails = {}

In [7]:
# Compile both patterns once; they do not change between URLs.
email_regex = re.compile(reg_exp_pattern)
# Addresses containing any of these free-mail provider names are discarded.
gmail_exp = re.compile(r'.*(gmail|yahoo|hotmail|rediff|aol\.).*')

for url in urls:
    try:
        # get the response from the given url (120-second timeout)
        response = requests.get(url, timeout=120)
    except requests.RequestException:
        # Unreachable or malformed URL: skip it. Catching only requests'
        # own errors lets KeyboardInterrupt still stop the loop.
        continue
    # only a 200 (success) response is scanned
    if response.status_code != 200:
        continue

    # De-duplicate, and sort so the result is the same on every run.
    emailids = sorted({match.group() for match in email_regex.finditer(response.text)})
    if not emailids:
        continue

    # remove email ids from free domains
    url_emails[url] = [email for email in emailids if not gmail_exp.search(email)]

In [8]:
# Empty result table with the two columns that later cells fill in.
df = pd.DataFrame(dict(URL=[], EmailId=[]))

In [9]:
# Flatten {url: [emails]} into one row per (URL, EmailId) pair.
# The rows are gathered first and the frame is built once: this avoids the
# quadratic cost of pd.concat inside a loop, and re-running the cell no longer
# appends duplicate rows to an already-filled `df`.
records = [
    {'URL': url, 'EmailId': email}
    for url, emails in url_emails.items()
    for email in emails
]
# Explicit columns keep the schema even when no e-mail was found at all.
df = pd.DataFrame(records, columns=['URL', 'EmailId'])

In [10]:
# Role/department mailboxes (info@, admission@, ...) are treated as junk.
# The separator accepts both '@' and the obfuscated '[AT]' / '[at]' form, so
# obfuscated addresses picked up by the extractor are filtered as well.
pattern = re.compile(r'.*(principal|info|support|admission|enquiry|admin|feedback|contact|office|alumni|media|exams|technical|library|finance|tpo|membership|director|webmaster|registrar).*(@|\[(AT|at)\]).*')

# Flag the junk rows in one pass, then keep the rest. astype(bool) keeps the
# mask boolean even when the table is empty.
is_junk = df['EmailId'].map(lambda email: pattern.search(email) is not None).astype(bool)
# Reassigning (instead of drop(..., inplace=True) per value) makes the cell
# safe to re-run; the surviving rows keep their original index labels.
df = df[~is_junk]

In [11]:
# Write the filtered URL/EmailId table to CSV. No `index` argument is passed,
# so pandas' default applies and the row index is written as the first column.
# NOTE(review): the path begins with a backslash, so on Windows it resolves
# against the root of the current drive rather than the notebook's folder --
# confirm this is intended.
df.to_csv('\\output\\output.csv')