In [9]:
import wikipedia
import numpy as np
import pandas as pd
import json
import re
import multiprocessing as mp
import threading
from tqdm import tqdm_notebook

In [2]:
wikipedia.WikipediaPage(title = "Tony Blair").categories

['1953 births',
 '20th-century Prime Ministers of the United Kingdom',
 '21st-century Prime Ministers of the United Kingdom',
 'AC with 17 elements',
 'Advocates of the European Union',
 'All BLP articles lacking sources',
 'All articles lacking reliable references',
 'All articles with unsourced statements',
 "Alumni of St John's College, Oxford",
 'Alumni of the Inns of Court School of Law',
 'Articles lacking reliable references from January 2016',
 'Articles with short description',
 'Articles with unsourced statements from August 2016',
 'Articles with unsourced statements from January 2016',
 'BLP articles lacking sources from January 2016',
 'British diplomats',
 'CS1 maint: BOT: original-url status unknown',
 'CS1 maint: Extra text: editors list',
 'Commission for Africa members',
 'Congressional Gold Medal recipients',
 'Converts to Roman Catholicism from Anglicanism',
 'Democratic socialists',
 'English Roman Catholics',
 'English autobiographers',
 "Fellows of St John's Coll

In [3]:
wikipedia.search('Thomas Boardman')

['Tom Boardman',
 'Tom Boardman, Baron Boardman',
 'The Exorcism of Emily Rose',
 'Robert II of Scotland',
 'Humphrey Boardman',
 'Boardman Books',
 'Christopher Boardman',
 'How Hill',
 'Joseph H. Boardman',
 '1967 Leicester South West by-election']

In [10]:
uk_pol = pd.read_csv('uk_pol_clean.csv')

In [11]:
speakers = pd.DataFrame(uk_pol.speaker.value_counts()).reset_index()
speakers.columns = ['speaker','speech_count']

In [14]:
speakers.head()

Unnamed: 0,speaker,speech_count
0,David Cameron,186
1,Theresa May,158
2,Gordon Brown,67
3,Matthew Hancock,62
4,George Osborne,58


In [7]:
def get_wiki_summary(name):
    try:
        wiki_summary = wikipedia.WikipediaPage(title=name).summary
    except:
        try:
            name = name +' (politician)'
            wiki_summary = wikipedia.WikipediaPage(title=name).summary
        except:
            try:
                name = name +' (British politician)'
                wiki_summary = wikipedia.WikipediaPage(title=name).summary
            except:
                try:
                    name = name +' (Labour politician)'
                    wiki_summary = wikipedia.WikipediaPage(title=name).summary
                except:
                    try:
                        potential_matches = wikipedia.search(name)
                        wiki_summary = []
                        for match in potential_matches:
                            match_summary = wikipedia.WikipediaPage(title=match).summary
                            if name in match_summary:
                                wiki_summary.append(match_summary)
                    except:
                        wiki_summary = 'No summary'

                        return wiki_summary

In [12]:
party_dict = {'Conservative Party':'Conservative',
              'Labour Party':'Labour',
              'Labour Co-operative':'Labour',
              'Trades Union':'Labour',
              'Liberal Democrat':'Lib Dem',
              'Liberal Party':'Lib Dem',
              'Green Party':'Greens',
              'Scottish National Party':'SNP',
              'Plaid Cymru':'Plaid Cymru',
              'Sinn Féin':'Sinn Féin',
              'Ulster Unionist Party':'UUP',
              'Democratic Unionist Party':'DUP',
              'UK Independence Party':'UKIP'}

def get_party(name):
    try:
        wiki_cats = wikipedia.WikipediaPage(title=name).categories
    except:
        try:
            longname = name +' (politician)'
            wiki_cats = wikipedia.WikipediaPage(title=longname).categories
        except:
            try:
                longname = name +' (British politician)'
                wiki_cats = wikipedia.WikipediaPage(title=longname).categories
            except:
                try:
                    longname = name +' (Labour politician)'
                    wiki_cats = wikipedia.WikipediaPage(title=longname).categories
                except:
                    try:
                        longname = name +' (Northern Ireland politician)'
                        wiki_cats = wikipedia.WikipediaPage(title=longname).categories
                    except:
                        wiki_cats = []
    wiki_cats_joined = ', '.join(wiki_cats)
    party_count=0
    party_name = 'No name'
    for ref, party in party_dict.items():
        if ref in wiki_cats_joined:
            if party != party_name:
                party_count+=1
            party_name = party
    return name, party_name, party_count

def get_parties(index_group, indexes, return_dict):
    results = []
    for ind in tqdm_notebook(indexes):
        name = speakers.iloc[ind,0]
        speech_count = speakers.iloc[ind,1]
        name, party_name, party_count = get_party(name)
        results.append([name, speech_count, party_name, party_count])
    return_dict[index_group]=results

In [13]:
def request_thread(index_groups):
    manager = mp.Manager()
    return_dict = manager.dict()
    threads = []
    for index_group, indexes in tqdm_notebook(index_groups.items()):
        thread = threading.Thread(name=index_group, 
                                  target=get_parties, 
                                  args=(index_group, indexes, return_dict))
        thread.start()
        threads.append(thread)

    for t in threads:
        t.join()

    return return_dict

In [10]:
speakers.to_csv('speakers.csv',index=False)

In [11]:
speakers = pd.read_csv('speakers.csv')

In [15]:
name_index_groups = range(speakers.shape[0],0,-50)
indexes = {group:[i for i in range(0,group) if group-i<51] for group in name_index_groups}

In [16]:
parties_dict = request_thread(indexes)

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [17]:
speaker_parties = pd.DataFrame([x for group in parties_dict.values() for x in group],columns=['name','speech_count','party','party_count'])

In [18]:
speaker_parties[speaker_parties.party_count==0].sort_values('speech_count',ascending=False).head()

Unnamed: 0,name,speech_count,party,party_count
8,Queen Elizabeth II,45,No name,0
75,King George VI,9,No name,0
73,Queen Victoria,9,No name,0
106,John Reid,6,No name,0
119,Michael Wilshaw,6,No name,0


In [19]:
speaker_parties[speaker_parties.party_count==0].speech_count.sum()

202

In [17]:
speaker_parties[speaker_parties.party_count==1].speech_count.sum()

2784

In [20]:
speaker_parties[speaker_parties.party_count==1].shape

(447, 4)

In [21]:
speaker_parties[speaker_parties.party_count>=2].speech_count.sum()

89

In [1]:
446+89

535

In [6]:
65+136

201

In [20]:
speaker_parties[speaker_parties.party_count==2]

Unnamed: 0,name,speech_count,party,party_count
41,William Ewart Gladstone,2,Lib Dem,2
44,Jonathan Aitken,2,UKIP,2
51,George Eustice,2,UKIP,2
62,Roy Jenkins,2,Lib Dem,2
83,Elizabeth Truss,18,Lib Dem,2
97,Vince Cable,13,Lib Dem,2
123,Winston Churchill,9,Lib Dem,2
143,Chris Huhne,7,Lib Dem,2
149,Peter Hain,6,Lib Dem,2
180,Mark Reckless,1,UKIP,2


In [21]:
speaker_parties[speaker_parties.party_count==1].party.value_counts()

Conservative    213
Labour          177
Lib Dem          31
SNP              11
DUP               5
Plaid Cymru       3
Greens            3
UUP               1
Sinn Féin         1
UKIP              1
Name: party, dtype: int64

In [22]:
speaker_parties[speaker_parties.party_count==1].groupby('party')['speech_count'].sum()

party
Conservative    1776
DUP               10
Greens             7
Labour           783
Lib Dem          162
Plaid Cymru        5
SNP               33
Sinn Féin          2
UKIP               5
UUP                1
Name: speech_count, dtype: int64

If we stick to MPs who have just returned one party then we still have 2772 speeches. I reckon it's worth doing this to keep the data clean. Might have to rethink if I drill down to just speeches that I've managed to pull the date from.

In [23]:
uk_pol.shape

(3077, 7)

In [24]:
uk_pol.head(1)

Unnamed: 0,title,description,text,speaker,year,subject,date
0,"Isherwood, Mark – 2006 Speech on Fuel Poverty",Below is the text of the speech made by Mark I...,The Chartered Institute of Housing Cymru state...,Mark Isherwood,2006,Fuel Poverty,2006-01-10


In [25]:
speaker_parties.head(1)

Unnamed: 0,name,speech_count,party,party_count
0,David Cameron,186,Conservative,1


In [26]:
party_list = pd.DataFrame()
party_list['speaker'] = speaker_parties.name
party_list['party'] = speaker_parties.apply(lambda x: x['party'] if x['party_count']==1 else 'No info', axis=1)

In [27]:
party_list[party_list.party!='No info'].shape

(446, 2)

In [28]:
party_list.to_csv('party_list_uk_pol.csv', index=False)

In [29]:
uk_pol_final = uk_pol.merge(party_list).drop(['title','description'],axis=1)

In [30]:
uk_pol_final[uk_pol_final.year==0]

Unnamed: 0,text,speaker,year,subject,date,party
1234,This is not a meeting that any of us would hav...,David Steel,0,"Steel, David – Donald Dewar",2000-10-13,Lib Dem


In [31]:
uk_pol_final.iloc[1234,2]=2000

In [32]:
uk_pol_final = uk_pol_final[uk_pol_final.party!='No info'].dropna()

In [33]:
uk_pol_final.party.value_counts()

Conservative    1724
Labour           670
Lib Dem          153
SNP               27
DUP               10
Greens             7
UKIP               5
Plaid Cymru        5
Sinn Féin          2
UUP                1
Name: party, dtype: int64

In [34]:
uk_pol_final.to_csv('uk_pol_final.csv', index=False)

In [35]:
uk_pol_final = pd.read_csv('uk_pol_final.csv')

Think I'm going to dump the 'year' column but want to check if it matches the year in the date column first.

In [36]:
uk_pol_final['year_check'] = uk_pol_final.date.map(lambda x: int(x[:4]))

In [37]:
uk_pol_final[uk_pol_final.year!=uk_pol_final.year_check]

Unnamed: 0,text,speaker,year,subject,date,party,year_check
137,I am pleased that I have been given the opport...,Jeff Rooker,1998,British Poultry Meat Federation,1988-04-29,Labour,1988
982,"Next week at Copenhagen, the EU will take some...",Jack Straw,2002,Critical Decisions for European Union,2012-12-05,Labour,2012
1067,"Thank you very much, Ben, and thank you everyo...",Nick Gibb,2015,Stonewall,2011-07-01,Conservative,2011
1134,"Thank you very much, Dominic, for that kind in...",Michael Gove,2012,Speech at BETT Show,2016-01-11,Conservative,2016
1450,It is a pleasure to be here today to celebrate...,Gordon Brown,2003,Speech at City Growth Strategies Forum,2013-10-08,Labour,2013
1494,"Thank you Penny [Thompson, GSCC CEO]. And than...",Paul Burstow,2012,Community Care Live Speech,2013-05-17,Lib Dem,2013
1542,Hillary Clinton: Some months ago so this is no...,William Hague,2012,Speech with Hillary Clinton,2010-05-14,Conservative,2010
1587,Introduction I’d like to begin by thanking Pla...,Stephen Hammond,2012,Place West London Conference,2013-10-22,Conservative,2013
1768,When the NHS was set up nearly 70 years ago Be...,Jeremy Hunt,2014,GPs,2015-06-19,Conservative,2015
2154,"Thank you, Mr Speaker; it is a great privilege...",Jo Cox,2015,Maiden Speech,2016-06-03,Labour,2016


Check differences (ignore when within one year unless obvious):<p>
speech 137 was in 1998 > change<br>
speech 983 was probably 2002 > change<br>
speech 1068 was 2011 > correct<br>
speech 1135 was 2012 > change<br>
speech 1451 was probably 2003 > change<br>
speech 2171 was 1998 > change<br>
speech 2328 was 2017 > correct

In [38]:
uk_pol_final.iloc[137,4] = '1998-04-29'
uk_pol_final.iloc[983,4] = '2002-12-05'
uk_pol_final.iloc[1135,4] = '2012-01-11'
uk_pol_final.iloc[1451,4] = '2003-10-08'
uk_pol_final.iloc[2171,4] = '1998-01-06'

In [39]:
uk_pol_final.drop(['year_check'],axis=1,inplace=True)

In [40]:
uk_pol_final.drop(['year'],axis=1,inplace=True)

In [41]:
uk_pol_final.head()

Unnamed: 0,text,speaker,subject,date,party
0,The Chartered Institute of Housing Cymru state...,Mark Isherwood,Fuel Poverty,2006-01-10,Conservative
1,Thank you for that kind welcome. I am hugely g...,Chuka Umunna,Universities UK,2011-12-02,Labour
2,Thank you for that introduction Michael and to...,Chuka Umunna,High Pay Commission,2012-01-12,Labour
3,"Thank you for that introduction, and thank you...",Chuka Umunna,Hub Westminster,2012-06-26,Labour
4,"Conference, thank you so very much for invitin...",Chuka Umunna,UCATT Conference,2012-05-28,Labour


In [42]:
uk_pol_final = uk_pol_final[['speaker','party','date','subject','text']]

In [43]:
uk_pol_final.head()

Unnamed: 0,speaker,party,date,subject,text
0,Mark Isherwood,Conservative,2006-01-10,Fuel Poverty,The Chartered Institute of Housing Cymru state...
1,Chuka Umunna,Labour,2011-12-02,Universities UK,Thank you for that kind welcome. I am hugely g...
2,Chuka Umunna,Labour,2012-01-12,High Pay Commission,Thank you for that introduction Michael and to...
3,Chuka Umunna,Labour,2012-06-26,Hub Westminster,"Thank you for that introduction, and thank you..."
4,Chuka Umunna,Labour,2012-05-28,UCATT Conference,"Conference, thank you so very much for invitin..."


In [44]:
uk_pol_final.date = pd.to_datetime(uk_pol_final.date)

In [45]:
uk_pol_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2604 entries, 0 to 2603
Data columns (total 5 columns):
speaker    2604 non-null object
party      2604 non-null object
date       2604 non-null datetime64[ns]
subject    2604 non-null object
text       2604 non-null object
dtypes: datetime64[ns](1), object(4)
memory usage: 101.8+ KB


Check for any speeches that are less than 100 characters:

In [46]:
length_test = uk_pol_final.text.map(lambda x: True if len(x)<100 else False)

In [47]:
uk_pol_final[length_test]

Unnamed: 0,speaker,party,date,subject,text
1152,Michael Gove,Conservative,2016-04-19,EU,Speech available as a PDF.


In [48]:
uk_pol_final.drop(1153,inplace=True)

In [49]:
uk_pol_final.to_csv('uk_pol_final.csv', index=False)

In [50]:
uk_pol_final = pd.read_csv('uk_pol_final.csv')

In [51]:
length_check = uk_pol_final.text.map(lambda x: len(x.split(' ')))

In [52]:
length_check.sort_values()[:20]

1152      5
850      57
1573     67
2300     68
2214     72
2320     83
2273    101
1941    131
1931    139
164     143
2313    145
2090    151
2120    151
2012    160
824     160
2295    164
833     171
2055    175
621     181
703     185
Name: text, dtype: int64

In [53]:
for speech in uk_pol_final.iloc[length_check.sort_values()[:20].index,4]:
    print(speech)
    print('--------------------------------------------')

Speech available as a PDF.
--------------------------------------------
Passover is a time of coming together, when Jewish communities commemorate the liberation of the people of Israel from slavery in ancient Egypt. It is a time to celebrate freedom as a basic human right. Pesach Sameach to all Jewish families both in the UK and around the world. I wish them a happy and peaceful holiday.
--------------------------------------------
Easter is a season of hope for all Christians. At this time of celebration my thoughts are with all those facing persecution, discrimination and denied the right to worship freely, particularly Christians in the Middle East. This Government has pledged to stand up for the right to live and to worship free from discrimination, and we will continue to work actively to make this a reality .
--------------------------------------------
I am sickened by the senseless loss of life in Barcelona today. The Foreign Office is working to establish if any British natio

Final check for duplicates:

In [54]:
uk_pol_final.describe(include='all')

Unnamed: 0,speaker,party,date,subject,text
count,2603,2603,2603,2603,2603
unique,439,10,1525,1883,2600
top,David Cameron,Conservative,2015-12-02,Maiden Speech,"Ladies and Gentlemen, Thank you for the opport..."
freq,178,1723,22,135,2


Turns out we've got a few duplicated speeches. Will drop these:

In [62]:
uk_pol_final[['text']].drop_duplicates().index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            2593, 2594, 2595, 2596, 2597, 2598, 2599, 2600, 2601, 2602],
           dtype='int64', length=2600)

In [64]:
uk_pol_final = uk_pol_final.iloc[uk_pol_final[['text']].drop_duplicates().index]

In [65]:
uk_pol_final.to_csv('uk_pol_final.csv', index=False)