In [33]:
import pandas as pd
import numpy as np
import urllib, urllib2, json
import requests
import re
from bs4 import BeautifulSoup


In [34]:
def request_current_page(my_title):
    """
    Find the content and time of last revision of a wikipedia page.
    Input
    ****************
    my_title: (string), title of the page to query
    
    Output
    **************
    (string), time of last revision
    (string), content of the page, encoded as UTF-8
    
    Raises
    **************
    KeyError, when the API answer holds no revision for the requested title
    
    """
    url = 'http://en.wikipedia.org/w/api.php/?'
    values = {'action' : 'query',
              'prop' : 'revisions',
              'titles' : my_title,
              'rvprop' : 'timestamp|content',
              'format' : 'json'}

    #generate query
    data = urllib.urlencode(values)
    req = url+data

    #query the api; close the connection even when reading or decoding fails
    response = urllib2.urlopen(req)
    try:
        json_page = json.loads(response.read())
    finally:
        response.close()

    #a single title is requested, so only the first entry of 'pages' is used
    page = list(json_page['query']['pages'].values())[0]
    if 'revisions' not in page:
        raise KeyError('no revision returned for page %r' % (my_title,))
    latest_revision = page['revisions'][0]
    revision_content = latest_revision[u'*']

    #the time of the returned revision
    revised_on = latest_revision[u'timestamp']
    return revised_on, revision_content.encode('utf-8')

def _table_cell_value(cell, is_name_cell, find_link_name):
    """Return the string stored in the dataframe for one <th>/<td> cell."""
    text = ''.join(cell.stripped_strings)
    if not (is_name_cell and cell.find('a')):
        return text
    link_name = cell.a['href'].replace('/wiki/','')
    if find_link_name:
        return link_name
    # alternate table syntax: append what follows the first '(' of the cell text
    return link_name+' ('+text.split('(')[1]


def parse_table(page,n1,n2, find_link_name = True):
    """
    Parse wikipedia table of the senate and house of representatives.
    Input
    ****************
    page: (string), URL of the page to download
    n1: (int), number of the table in order of appearance in the page (0-based)
    n2: (int), column where the name appears (0-based)
    find_link_name (bool), work around to distinguish a case where the table uses a different syntax:
        True, the name cell becomes the target of its first link without '/wiki/'
        False, the name cell becomes that link target followed by ' (' and the
        cell text found after its first '('
    
    Output
    **************
    (pandas dataframe), one row per table row after the first one; the first
    row gives the column names when it fits the data, otherwise default
    columns are used
    
    """
    r = requests.get(page)
    # fail here on an HTTP error instead of parsing an error page
    r.raise_for_status()
    sp = BeautifulSoup(r.content, "html.parser")
    parsed_table = sp.find_all('table')[n1]
    data = [[_table_cell_value(td, n==n2, find_link_name)
             for n,td in enumerate(row.find_all(['th','td']))]
            for row in parsed_table.find_all('tr')]
    try:
        df = pd.DataFrame(data[1:], columns=data[0])
    except (ValueError, AssertionError, IndexError):
        # header row missing or not matching the data width: keep the rows
        # without column names
        df = pd.DataFrame(data[1:])

    return df

def find_current_name(my_title):
    """
    Find current name of a given page.
    
    Input
    ***********
    my_title (string): page name
    Output
    *******************
    (string): current page name
    
    """
    
    
    my_title = urllib.unquote(str(my_title))
    print my_title,
    timestamp, revision_content = request_current_page(my_title)
    if 'REDIRECT' in revision_content:
        new_title = re.findall(r'\[\[(.*?)\]\]',revision_content)[0]
        print new_title
        return new_title.replace(' ','_')
    else:
        return my_title.replace(' ','_')


### Define dictionaries for states and parties

In [35]:
#Dictionary between states and their abbreviation
abb_page = "https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations"
# Take the first "wikitable" of the page and skip its first 12 rows
# (presumably header/non-state rows -- TODO confirm against the live page).
# Each distinct value of column 3 becomes a key, mapped to the first
# column-0 value found on a row carrying it.
abbrev = (
    pd.read_html(abb_page, encoding='utf-8', attrs={"class": "wikitable"})[0][12:]
    .groupby(3)[0]
    .first()
    .to_dict()
)

#Dictionary between parties and their abbreviation
# (the values are written to the output files through .map(parties))
parties = {'D':'Democratic','R':'Republican','I':'Independent'}

### Desired pages

In [38]:
# Base address of the wikipedia pages
url = "https://en.wikipedia.org/wiki/"
# Titles of the pages to parse: three House lists, then three Senate lists
pages = ['List_of_members_of_the_United_States_House_of_Representatives_in_the_113th_Congress_by_seniority',
         'List_of_members_of_the_United_States_House_of_Representatives_in_the_114th_Congress_by_seniority',
         'List_of_members_of_the_United_States_House_of_Representatives_in_the_115th_Congress_by_seniority',
         'List_of_United_States_Senators_in_the_113th_Congress_by_seniority',
         'List_of_United_States_Senators_in_the_114th_Congress_by_seniority',
         'List_of_United_States_Senators_in_the_115th_Congress_by_seniority']

# Per page: which table to read (0-based order of appearance) ...
number_of_the_table = [0,0,0,1,0,0]
# ... and which column of that table holds the member name (0-based)
name_column = [1,1,1,2,2,2]

# Per page: value passed as find_link_name to parse_table
links = [True, True, True, False, True, True]

# Output file names, one per page, used by the cells that write to ./data/.
# NOTE(review): `names` is read by later cells but is not defined anywhere in
# the visible notebook, so a fresh kernel stops with a NameError. These names
# are derived from the page titles -- replace them if other names are expected.
names = [title + '.csv' for title in pages]

pages = [url + title for title in pages]


### Run code (takes some time)

In [39]:
# Download and parse one table per page (one HTTP request each).
# The output file names are not needed for parsing, so they are not zipped in.
dfs = [parse_table(page, n1, n2, f)
       for page, n1, n2, f in zip(pages, number_of_the_table, name_column, links)]

### Adjust columns and write to file

In [None]:
# State: the part of the district label before the first '-', translated
# through the abbreviation dictionary. Only the three output columns are kept.
dfs[0] = (
    dfs[0]
    .assign(State=dfs[0]['District'].apply(lambda district: district.split('-')[0]).map(abbrev))
    .loc[:, ['Representative', 'Party', 'State']]
    .rename(columns={'Representative': 'WikiPageName'})
)
# Resolve every page name (one API request per row)
dfs[0]['WikiPageName'] = dfs[0]['WikiPageName'].apply(find_current_name)
dfs[0]['Party'] = dfs[0]['Party'].map(parties)
dfs[0].to_csv('./data/' + names[0], encoding='utf-8', index=False)

In [None]:
# State: the district label without its last non-breaking-space separated part.
# Only the three output columns are kept.
dfs[1] = (
    dfs[1]
    .assign(State=dfs[1]['District'].apply(lambda district: ' '.join(district.split(u'\xa0')[:-1])))
    .loc[:, ['Representative', 'Party', 'State']]
    .rename(columns={'Representative': 'WikiPageName'})
)
dfs[1]['Party'] = dfs[1]['Party'].map(parties)
# Resolve every page name (one API request per row)
dfs[1]['WikiPageName'] = dfs[1]['WikiPageName'].apply(find_current_name)
dfs[1].to_csv('./data/' + names[1], encoding='utf-8', index=False)

In [None]:
# State: the district label without its last non-breaking-space separated part.
# Only the three output columns are kept.
dfs[2] = (
    dfs[2]
    .assign(State=dfs[2]['District'].apply(lambda district: ' '.join(district.split(u'\xa0')[:-1])))
    .loc[:, ['Representative', 'Party', 'State']]
    .rename(columns={'Representative': 'WikiPageName'})
)
# Resolve every page name (one API request per row)
dfs[2]['WikiPageName'] = dfs[2]['WikiPageName'].apply(find_current_name)
dfs[2]['Party'] = dfs[2]['Party'].map(parties)
dfs[2].to_csv('./data/' + names[2], encoding='utf-8', index=False)

In [None]:
# This table stores name, party and state in one 'Senator (Party-State)' cell.
# Name: everything before the last ' ('
dfs[3]['Representative'] = dfs[3]['Senator (Party-State)'].apply(lambda x: ' ('.join(x.split(' (')[:-1]))
# State: text after the last '-', without the closing ')' and anything from '[' on
dfs[3]['State'] = dfs[3]['Senator (Party-State)'].apply(lambda x: x.split('-')[-1].replace(')','').split('[')[0])
dfs[3]['State'] = dfs[3]['State'].map(abbrev)
# Party: last character before the '-' inside the trailing parenthesis.
# Splitting on the last ' (' first keeps a hyphen in the page name from
# being mistaken for the party/state separator.
dfs[3]['Party'] = dfs[3]['Senator (Party-State)'].apply(lambda x: x.split(' (')[-1].split('-')[0][-1]).map(parties)
dfs[3] =  dfs[3][['Representative','Party','State']].rename(columns = {'Representative':'WikiPageName'})
# Resolve every page name (one API request per row)
dfs[3]['WikiPageName'] = dfs[3]['WikiPageName'].apply(find_current_name)
dfs[3].to_csv('./data/'+names[3], encoding = 'utf-8',index = False)

In [None]:
# Keep the senator, party and state columns; Party and State are written as
# found in the table (no dictionary is applied here).
dfs[4] = (
    dfs[4]
    .assign(Representative=dfs[4]['Senator'])
    .loc[:, ['Representative', 'Party', 'State']]
    .rename(columns={'Representative': 'WikiPageName'})
)
# Resolve every page name (one API request per row)
dfs[4]['WikiPageName'] = dfs[4]['WikiPageName'].apply(find_current_name)
dfs[4].to_csv('./data/' + names[4], encoding='utf-8', index=False)

In [None]:
# Keep the senator, party and state columns; Party and State are written as
# found in the table (no dictionary is applied here).
dfs[5] = (
    dfs[5]
    .assign(Representative=dfs[5]['Senator'])
    .loc[:, ['Representative', 'Party', 'State']]
    .rename(columns={'Representative': 'WikiPageName'})
)
# Resolve every page name (one API request per row)
dfs[5]['WikiPageName'] = dfs[5]['WikiPageName'].apply(find_current_name)
dfs[5].to_csv('./data/' + names[5], encoding='utf-8', index=False)

In [41]:
import networkx as nx

In [42]:
# Small undirected path graph 1 - 2 - 3 - 4, built directly from its edge list
G = nx.Graph([(1, 2), (2, 3), (3, 4)])

In [45]:
# Project G onto the node list [1, 2] and keep the resulting graph.
# NOTE(review): projected_graph is documented for one node set of a bipartite
# graph -- confirm that [1, 2] is the intended set.
projected_graph = nx.projected_graph(G,[1,2])

In [47]:
# Display the adjacency matrix of the projection as a dense array
nx.adjacency_matrix(projected_graph).toarray()

array([[0, 0, 1, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 1, 0, 0]])