In [3]:
import os
import pandas as pd
import numpy as np

In [6]:
###########  CONSTANTS  ###############
# Folder names that make up the local checkout location.
PROJECT_LEAD = "szaboildi"
PROJECT_NAME = "uk-pol-speech-classifier"

# Project root: <home>/code/<PROJECT_LEAD>/<PROJECT_NAME>
LOCAL_PATH = os.path.join(os.path.expanduser('~'), "code", PROJECT_LEAD, PROJECT_NAME)

###########  LOAD RAW DATA  ###############
# The raw corpus is stored as a feather file under <LOCAL_PATH>/raw_data.
raw_data_path = os.path.join(LOCAL_PATH, "raw_data", "Corp_HouseOfCommons_V2.feather")
data = pd.read_feather(raw_data_path)


In [7]:
# Filter and clean data
min_word_count = 400       # keep speeches with at least this many words
sample_size = 1000         # number of speeches drawn per party
parties_to_exclude = []    # party labels to drop even if they are big enough
random_state = 42          # seed so the undersample is the same on every run

# Keep only the needed columns and add the word count.
# `.assign` returns a new frame, which avoids the SettingWithCopyWarning that
# setting a column on a column-slice can trigger. The vectorised `.str`
# accessor splits on whitespace exactly like `str.split()` and replaces the
# row-wise `apply`; missing texts count as 0 words instead of raising.
data = data[["speaker", "party", "text"]]
data = data.assign(
    word_n_full=data["text"].str.split().str.len().fillna(0).astype(int))

# Filter for min word count
data = data[data["word_n_full"] >= min_word_count]

# Only select big enough parties
# (strictly more than `sample_size` speeches, so sampling without replacement
# below always has enough rows to draw from)
n_speeches_by_party = (
    data.groupby("party").size().reset_index(name="n_speeches")
    .sort_values("n_speeches", ascending=False).reset_index(drop=True))
big_parties = n_speeches_by_party[n_speeches_by_party.n_speeches > sample_size]["party"].tolist()
data = data[(data["party"].isin(big_parties)) & (~(data["party"].isin(parties_to_exclude)))]

# Undersample: draw `sample_size` speeches from every remaining party.
# Samples are collected in a list and concatenated once (growing a DataFrame
# with concat inside the loop copies the accumulated frame on every pass).
sampled_groups = [
    group_data.sample(n=sample_size, random_state=random_state)
    for _, group_data in data.groupby('party')
]
df_undersampled = (
    pd.concat(sampled_groups, axis=0).reset_index(drop=True)
    if sampled_groups else pd.DataFrame())  # no party left -> empty frame

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the undersampled speeches as a test set. The split is
# stratified on the party label and uses a fixed seed so it is repeatable.
data_train, data_test = train_test_split(
    df_undersampled,
    test_size=0.2,
    random_state=42,
    stratify=df_undersampled["party"],
)

In [10]:
# Undersample data_test
speeches_per_party = 20
grouped_data = data_test.groupby('party')
# For every party, draw `speeches_per_party` speeches at random (seeded) and
# collect the draws as a list holding one DataFrame per party.
smaller_data_test = [
    party_speeches.sample(n=speeches_per_party, random_state=42)
    for _, party_speeches in grouped_data
]

In [11]:
# Inspect the sampled groups: a list with one DataFrame per party.
smaller_data_test

[               speaker party  \
 230      Gerald Malone   Con   
 313  Iain Duncan Smith   Con   
 366    Stephen Hammond   Con   
 510       Peter Lilley   Con   
 460       Michael Gove   Con   
 454   Gillian Shephard   Con   
 701      Boris Johnson   Con   
 391       Geoffrey Cox   Con   
 776         Greg Clark   Con   
 504        Mark Harper   Con   
 71      Charles Wardle   Con   
 875       Andrew Tyrie   Con   
 241     Andrea Leadsom   Con   
 791        John Bercow   Con   
 191       Andrew Percy   Con   
 288    Stephen O'Brien   Con   
 418     Quentin Davies   Con   
 768         Hugo Swire   Con   
 524      Michael Ellis   Con   
 337       John Redwood   Con   
 
                                                   text  word_n_full  
 230  I am afraid that I do not have time. The hon. ...         1272  
 313  I join the Prime Minister in paying tribute to...          799  
 366  I am grateful to you, Mr Speaker, for selectin...          657  
 510  The hon. Gentle

In [12]:
# Stack the per-party samples into a single DataFrame; ignore_index=True
# discards the original row labels and renumbers the rows from 0.
df = pd.concat(smaller_data_test, ignore_index=True)

In [13]:
# Preview the combined test subset.
df

Unnamed: 0,speaker,party,text,word_n_full
0,Gerald Malone,Con,I am afraid that I do not have time. The hon. ...,1272
1,Iain Duncan Smith,Con,I join the Prime Minister in paying tribute to...,799
2,Stephen Hammond,Con,"I am grateful to you, Mr Speaker, for selectin...",657
3,Peter Lilley,Con,The hon. Gentleman has already intervened thre...,484
4,Michael Gove,Con,I thank the Secretary of State for his announc...,939
...,...,...,...,...
135,Martin Smyth,UUP,I appreciate the opportunity of making a brief...,633
136,Martin Smyth,UUP,I wish to raise several points. I agree with m...,615
137,James Molyneaux,UUP,I share the reservations of the right hon. Mem...,929
138,William Ross,UUP,It is evident that the Liberal party has learn...,495


In [23]:
#Save smaller_data_test to csv
path="~/code/uk-pol-speech-classifier/polclassifier/smaller_data_test.csv"
df.to_csv(path, index=False)

In [None]:
# Path of the reduced test set inside the project's "processed_data" folder.
path = os.path.join(LOCAL_PATH, "processed_data", "smaller_data_test.csv")

In [5]:
# Show the resolved path.
path

'/home/hailinh/code/szaboildi/uk-pol-speech-classifier/processed_data/smaller_data_test.csv'

In [6]:
# Read the stored test subset back from disk to check its contents.
# NOTE(review): the rows displayed below differ from `df` shown earlier in
# this notebook, so this file appears to come from a different sampling run
# or a different save location - confirm which file is the intended one.
pd.read_csv(path)

Unnamed: 0,speaker,party,text,word_n_full
0,Cheryl Gillan,Con,"That point was well made. I, too, was concerne...",480
1,Simon Hart,Con,"Like many hon. Members, I trawled through the ...",819
2,Graham Brady,Con,I have taken a lot of interventions and I shou...,462
3,Tom King,Con,I have already commented on the need for Membe...,962
4,Gary Streeter,Con,I was about to make the point that many Conser...,849
...,...,...,...,...
135,William Ross,UUP,The hon. Member for Yeovil (Mr. Ashdown) picke...,418
136,William Ross,UUP,Following on briefly from what the right hon. ...,459
137,David Trimble,UUP,I congratulate the hon. Member for Canterbury ...,1344
138,John Taylor,UUP,"With the leave of the House, I wish to reply t...",1018


In [14]:
# Sorted unique party labels present in the combined test subset.
np.unique(df["party"])

array(['Con', 'DUP', 'Lab', 'LibDem', 'PlaidCymru', 'SNP', 'UUP'],
      dtype=object)

In [15]:
# Rows of the filtered corpus whose party label is "LibDem".
libdem_data = data.loc[data["party"] == "LibDem"]

In [16]:
# Preview the LibDem-only rows.
libdem_data

Unnamed: 0,speaker,party,text,word_n_full
51,Paddy Ashdown,LibDem,No. The hon. Gentleman will discover why in a ...,527
53,Paddy Ashdown,LibDem,I am sorry to be discourteous to the hon. Memb...,629
55,Paddy Ashdown,LibDem,I hope that the hon. Gentleman will forgive me...,845
329,Robert Maclennan,LibDem,The two previous speeches have reversed the no...,2224
570,Archy Kirkwood,LibDem,It is always a pleasure to follow the hon. Mem...,1345
...,...,...,...,...
1955732,Norman Lamb,LibDem,I thank the hon. Gentleman for that. I will ca...,454
1955734,Norman Lamb,LibDem,I totally agree. The statistics that I am citi...,905
1955736,Norman Lamb,LibDem,I suspect that the right hon. Gentleman knows ...,414
1955755,Norman Lamb,LibDem,I thank the Minister for her response to the d...,619


In [17]:
# Rows of the filtered corpus whose party label is "Lab", shown for inspection.
lab_Data = data.loc[data["party"] == "Lab"]
lab_Data

Unnamed: 0,speaker,party,text,word_n_full
4,Neil Kinnock,Lab,I am sure that I speak for the majority of hon...,2768
6,Neil Kinnock,Lab,"The hon. Gentleman says, "" Appeal."" Of course ...",850
59,Audrey Wise,Lab,Nothing in the Gracious Speech or in the Prime...,1344
67,Geoffrey Lofthouse,Lab,The Queen's Speech outlines the Government's l...,1661
73,Michael Martin,Lab,In the past few weeks in Glasgow and in Scotla...,691
...,...,...,...,...
1956183,Emma Hardy,Lab,I agree with my hon. Friend that the Governmen...,1020
1956185,Stephanie Peacock,Lab,It is a pleasure to serve under your chairmans...,447
1956187,Mike Kane,Lab,It is an honour to serve under your chairmansh...,1164
1956207,Lisa Nandy,Lab,"I beg to move, That Sir Lindsay Hoyle do take ...",811
