# Two Part Scraping
The first part uses the congress.gov/members webpage to get the name and political
affiliation of each congress member. Afterwards, the names and party affiliations are saved in a
CSV file. 

The second part runs a Google image search for each name in the CSV data and saves the image found for each member in 
a desktop folder for reference during training. 



In [100]:
# import libraries
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2


from bs4 import BeautifulSoup
import time 
import os
import json

Set PATH to the data directory where you want the data saved.

In [None]:
# Root output directory: congress.csv is written directly inside it and the
# downloaded images go into per-party sub-folders of it.
PATH="/path/to/datafolder"

## Build csv file 

The code below makes several requests to the congress.gov website and parses the page data

In [79]:
def getNamesFromPage(soup):
    """Return the member names linked from a congress.gov members page.

    Args:
        soup: Parsed page exposing ``find_all('a')``; every returned element
            must offer ``get('href')`` and a ``text`` attribute.

    Returns:
        list: The link text of each anchor whose href contains
        ``https://www.congress.gov/member/``, in page order and without
        duplicates.
    """
    names = []
    seen = set()  # constant-time duplicate check; `names` keeps page order
    for link in soup.find_all('a'):
        url = link.get('href')
        # Anchors without an href yield None, which cannot be searched with
        # `in`; skip them instead of raising TypeError.
        if not url or "https://www.congress.gov/member/" not in url:
            continue
        name = link.text
        if name not in seen:
            seen.add(name)
            names.append(name)
    return names
            
def getPartyFromPage(soup):
    """Return one party label per party-bearing ``result-item`` span.

    Args:
        soup: Parsed page exposing ``find_all('span', attrs=...)``; every
            returned element must offer a ``text`` attribute.

    Returns:
        list: ``"Republican"``, ``"Democrat"`` or ``"Independent"`` for each
        span whose text mentions a party, in page order. Spans that mention
        no party contribute nothing.
    """
    parties = []
    for span in soup.find_all('span', attrs={'class': 'result-item'}):
        text = span.text
        # 'Independent' is tested first so that "Independent Democrat" gives
        # exactly one "Independent" entry. Emitting several labels for a
        # single span would misalign the list with the names list.
        if 'Independent' in text:
            parties.append("Independent")
        elif 'Republican' in text:
            parties.append("Republican")
        elif 'Democrat' in text:
            parties.append("Democrat")
    return parties

def getCongressData(pages=23, delay=1):
    """Scrape member names and parties from the congress.gov members listing.

    Args:
        pages: Number of listing pages to request, starting at page 1.
        delay: Seconds to sleep after each request.

    Returns:
        tuple: ``(fullNames, fullParties)``, two lists of equal length where
        entry ``k`` of each list describes the same member.
    """
    fullNames = []
    fullParties = []
    # opens each page, to get name and party information
    for page in range(1, pages + 1):
        url = "https://www.congress.gov/members?page=" + str(page)
        req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
        # Parse while the response is open, then release the connection.
        with urllib2.urlopen(req) as con:
            soup = BeautifulSoup(con, 'html.parser')
        names = getNamesFromPage(soup)
        # The original author's note says party entries come back duplicated;
        # keep every other one.
        parties = getPartyFromPage(soup)[::2]

        # Only accept a page when every name can be paired with a party.
        if len(parties) == len(names):
            fullNames.extend(names)
            fullParties.extend(parties)
        else:
            # Report the dropped page so missing data does not go unnoticed.
            print(f"skipping page {page}: {len(names)} names but "
                  f"{len(parties)} parties")
        time.sleep(delay)
    return fullNames, fullParties
                             

In [80]:
# Scrape the members listing once; both lists are reused by the CSV export
# and by saveImages() further down.
names, parties = getCongressData() 

13


Page data is ready to be saved at this point

In [90]:
import csv
# newline='' lets the csv module control line endings (otherwise blank rows
# can appear on Windows); utf-8 keeps non-ASCII names intact on any platform.
# The file is opened in append mode, so re-running this cell adds the rows again.
with open(f"{PATH}/congress.csv", 'a', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # One row per member: name, party.
    writer.writerows(zip(names, parties))

## Scrape and save images

Note: a handful of images will throw an error; while I was experimenting, these were just saved by hand.

In [110]:

# adapted from http://stackoverflow.com/questions/20716842/python-download-images-from-google-image-search
def _partyDirectory(party):
    """Return the folder under PATH that holds images for ``party``."""
    if 'Democrat' in party:
        return PATH+'/dems'
    if 'Republican' in party:
        return PATH+'/reps'
    if 'Independent' in party:
        return PATH+'/ind'
    return PATH+'/unknown'


def _fetchImageBytes(img):
    """Download ``img`` and return its raw bytes.

    A plain request is tried first. If that raises, the download is retried
    once with an explicit User-Agent header; an error from the retry
    propagates to the caller.
    """
    try:
        with urllib2.urlopen(img) as response:
            return response.read()
    except Exception:
        req = urllib2.Request(img, headers={'User-Agent': "Magic Browser"})
        with urllib2.urlopen(req) as response:
            return response.read()


def saveImages():
    """Download one Google image result per member into per-party folders.

    Reads the module-level ``names`` and ``parties`` lists pairwise. For each
    member it runs a Google image search and saves the first result as
    ``<party>_<name words joined by '_'>.<ext>`` in the party's folder under
    ``PATH``. The extension comes from the result metadata and falls back to
    ``jpg`` when it is empty.

    Download or write failures for an image are printed and skipped. A
    failure of the search request itself is not caught and propagates.
    """
    from urllib.parse import quote_plus

    header = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}

    for name, party in zip(names, parties):
        saveDirectory = _partyDirectory(party)
        os.makedirs(saveDirectory, exist_ok=True)

        words = name.split()
        fn = party + '_' + '_'.join(words)
        # Percent-encode each word so punctuation and non-ASCII characters in
        # a name still produce a valid request URL.
        query = '+'.join(quote_plus(word) for word in words)
        url = "https://www.google.co.in/search?q="+query+"&source=lnms&tbm=isch"
        request = urllib2.Request(url, headers=header)
        with urllib2.urlopen(request) as page:
            soup = BeautifulSoup(page, 'html.parser')

        # Only the first result is used, so only its metadata is parsed.
        # If the page has no "rg_meta" divs, nothing is saved for this member.
        for meta in soup.find_all("div", {"class": "rg_meta"})[:1]:
            info = json.loads(meta.text)
            # "ou" holds the image link and "ity" the image type.
            img, Type = info["ou"], info["ity"]
            target = os.path.join(saveDirectory, fn + "." + (Type or "jpg"))
            try:
                raw_img = _fetchImageBytes(img)
                # Write for every extension, including the jpg fallback.
                with open(target, 'wb') as f:
                    f.write(raw_img)
            except Exception as e:
                print("could not load : "+img)
                print(e)


In [111]:
# Run the image download; failures are printed as "could not load : <url>".
saveImages()

could not load : http://www.tulane.edu/~rice/tdp/pix/MaxCleland.jpg
HTTP Error 403: Forbidden
could not load : https://digital.content.ecu.edu/adore-djatoka/resolver?url_ver=Z39.88-2004&rft_id=http://150.216.68.252/ncgre000/00000017/00016333/00016333_ac_0001.jp2&svc_id=info:lanl-repo/svc/getRegion&svc_val_fmt=info:ofi/fmt:kev:mtx:jpeg2000&svc.format=image/jpeg&svc.level=4
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:833)>
could not load : https://www.ikn.army.mil/apps/IKNWMS/IKN_Websites/USAICoE/MI%20Corps%20Hall%20of%20Fame/images/HECHT,%20CHIC%20JACOB%20(SEN).jpg
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:833)>
could not load : https://history.redstone.army.mil/bios/jones_intro.jpg
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:833)>
could not load : https://digital.content.ecu.edu/adore-djatoka/resolver?url_ver=Z39.88-2004&rft_id=http://150.216.68.252/ncgre000/00000024/0002