# This tutorial will show you how to scrape the web using Python

## The task is to get information about every faculty member in sociology from their department profiles. We must begin with a base URL from which we can access the profiles.

In [None]:
URL = "http://www.soc.cornell.edu/people/faculty/"

## There are a couple of packages we need to import

In [None]:
import requests
from bs4 import BeautifulSoup as BS

In [None]:
html = requests.get(URL)

In [None]:
html.content

In [None]:
soup = BS(html.content, "html.parser")

## We'll need to do these steps quite a lot, so it's useful to abstract them into a function

In [None]:
def getSoup(url, timeout=30):
    """Download a web page and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so one unresponsive page
            could stall the whole scrape.

    Returns:
        A BeautifulSoup object built from the response body using Python's
        built-in "html.parser".

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    return BS(response.content, "html.parser")

## BeautifulSoup provides some useful functions to parse the raw html

In [None]:
links = soup.findAll('a', href=True) #Finds all 'a' tags with an href attribute (i.e. all hyperlinks)

In [None]:
links

In [None]:
#Let's take a look at one of these items 
links[20]

In [None]:
type(links[20])

In [None]:
dir(links[20])

In [None]:
x = links[20]

In [None]:
x.contents

In [None]:
x['href']

## After experimenting with the object and determining what we want, we can then loop through all the objects returned by the query

In [None]:
# Keep only the hyperlinks whose target contains the faculty path.
profiles = [
    link['href']
    for link in links
    if "/people/faculty/" in link['href']
]

In [None]:
profiles

In [None]:
# We can remove the incorrect links (those ending in 'faculty/' rather than in
# an individual's page) by applying a conditional filter to profiles
profiles = [url for url in profiles if not url.endswith('faculty/')]

In [None]:
profiles

In [None]:
#Note that there are many duplicates in the list...
print(len(profiles))
print(len(set(profiles)))

In [None]:
# Drop duplicates; sorting gives the same order on every run, whereas the
# iteration order of a plain set of strings can change between sessions.
profiles = sorted(set(profiles))

## Now that we have a list of URLs, we can retrieve the information from each one by looping through the list and applying the function we created. The results can be saved in a dictionary.

In [None]:
from time import sleep

# Maps the last path segment of each profile URL (e.g. 'macy') to its parsed page.
profile_contents = {}
for p in profiles:
    print("Getting information from: ", p)
    sleep(1)  # Sleeping for a time interval so we're not querying too frequently
    # Use a separate name so the listing-page `soup` created earlier is not
    # overwritten; otherwise re-running earlier cells would parse a profile page.
    profile_soup = getSoup(p)
    # Strip any trailing '/' first so the last path segment is picked whether
    # or not the URL ends with a slash.
    name = p.rstrip('/').split('/')[-1]
    profile_contents[name] = profile_soup

In [None]:
print(profile_contents.keys())

In [None]:
#If we want to get the information for a particular professor we can look up their dictionary entry
macy = profile_contents['macy']
macy

In [None]:
macy.find('div', {'class': 'entry-content'})

In [None]:
content = macy.find('div', {'class': 'entry-content'})
content.text

In [None]:
content_refined = content.findAll('h4')

In [None]:
content_refined[0]

In [None]:
titles = content_refined[0].text

In [None]:
titles.split('PhD')

In [None]:
title_and_education = titles.split('PhD')

In [None]:
title = title_and_education[0]
education = title_and_education[1]
education = 'PhD'+education

In [None]:
title

In [None]:
education

## Let's tidy that up and make some functions we can reuse

In [None]:
def getFacultyInfo(soup):
    """Return the first <div class="entry-content"> element found in ``soup``.

    The result is whatever ``soup.find`` returns for that query.
    """
    wanted_attrs = {'class': 'entry-content'}
    return soup.find('div', wanted_attrs)

In [None]:
def getTitleAndEducation(info):
    """Split the text of the first <h4> in ``info`` into title and education.

    The text is split on the literal 'PhD': everything before the first
    occurrence is the title, and 'PhD' plus the text up to the next
    occurrence (or the end) is the education.

    Returns:
        A ``(title, education)`` tuple of strings.

    Raises:
        IndexError: If ``info`` has no <h4> or its text contains no 'PhD'.
    """
    pieces = info.findAll('h4')[0].text.split('PhD')
    return pieces[0], 'PhD' + pieces[1]

In [None]:
macy = getFacultyInfo(profile_contents['macy'])
macy_te = getTitleAndEducation(macy)
print(macy_te[0], macy_te[1])

In [None]:
heckathorn = getFacultyInfo(profile_contents['heckathorn'])
heckathorn_te = getTitleAndEducation(heckathorn)
print(heckathorn_te[0], heckathorn_te[1])

In [None]:
garip = getFacultyInfo(profile_contents['garip'])
garip_te = getTitleAndEducation(garip)
print(garip_te[0], garip_te[1])

In [None]:
garip

In [None]:
import string

def getTitleAndEducation2(info):
    """Return a cleaned ``(title, education)`` tuple from the first <h4> in ``info``.

    All ASCII punctuation is removed from the heading text first, so a degree
    written as 'Ph.D.' becomes 'PhD' before the text is split on it. Anything
    from the word 'Curriculum' onwards is dropped from the education string,
    and trailing whitespace is stripped from both parts.

    Raises:
        IndexError: If ``info`` has no <h4> or the cleaned text has no 'PhD'.
    """
    heading_text = info.findAll('h4')[0].text
    kept_chars = [ch for ch in heading_text if ch not in string.punctuation]
    pieces = ''.join(kept_chars).split('PhD')
    title = pieces[0].rstrip()
    # Cut off additional info that follows the degree (e.g. a CV link label).
    education = ('PhD' + pieces[1]).split('Curriculum')[0]
    return title, education.rstrip()

In [None]:
getTitleAndEducation2(garip)

## Now let's see if that works for all cases

In [None]:
# Try the improved parser (getTitleAndEducation2) on every downloaded profile
# to find the pages it cannot handle. No sleep is needed here: the pages are
# already in profile_contents, so no web requests are made.
for prof, page in profile_contents.items():
    print("Getting info for: ", prof)
    try:
        info = getFacultyInfo(page)
        te = getTitleAndEducation2(info)
        print(prof, te[0], te[1], '\n')
    except Exception as err:
        # Catch Exception rather than using a bare except so that interrupting
        # the kernel still stops the loop; show the error to help explain the failure.
        print("ERROR: Failed to get info from", prof, "-", repr(err))

## OK, so it looks like we got everybody's details except Kim Weeden's. Why? Can you fix the function to get hers too?

## We should probably get some more information. Complete this function to get the correct name for each faculty member

In [None]:
def getFacultyName(soup):
    """Return the text of the first <h1 class="entry-title"> element in ``soup``.

    Raises:
        IndexError: If the page has no matching heading.
    """
    return soup.findAll('h1', {'class': 'entry-title'})[0].text

In [None]:
# Print the name parsed from each downloaded profile page.
for page in profile_contents.values():
    print(getFacultyName(page))

## Now we can put it all together to get a Python object containing info from each page

In [None]:
# Maps each faculty member's displayed name to their parsed title and education.
faculty_info = {}
for prof, page in profile_contents.items():
    print("Getting info for: ", prof)
    try:
        name = getFacultyName(page)
        info = getFacultyInfo(page)
        te = getTitleAndEducation2(info)
    except Exception as err:
        # Catch Exception rather than using a bare except so that interrupting
        # the kernel still stops the loop; report the error so the failing
        # page can be diagnosed.
        print("ERROR: Failed to get info from", prof, "-", repr(err))
    else:
        # Only record an entry when all three parsing steps succeeded.
        print(te)
        faculty_info[name] = {'title': te[0], 'education': te[1]}
    

In [None]:
faculty_info

## OK, this looks more or less correct. Can you see any problems?

## Once you have the information you need, it's often good to convert it into a format that is easier to read and analyze. Here we use pandas to convert it to a dataframe.

In [None]:
import pandas as pd
df = pd.DataFrame.from_dict(faculty_info, orient='index')

In [None]:
df

## You also likely want to save the data somewhere. There are many different ways of doing this, for example in a database, a JSON file, or a CSV file. Here we use the pandas `to_csv` function to write it to a CSV file.

In [None]:
df.to_csv('../data/facultyinfo.csv',encoding='utf-8')