In [3]:
import os
import pandas as pd

In [4]:
PROJECT_LEAD = "szaboildi"
PROJECT_NAME = "uk-pol-speech-classifier"

###########  CONSTANTS  ###############
# Root of the local project checkout: ~/code/<PROJECT_LEAD>/<PROJECT_NAME>.
# This has to be defined because a later cell builds the processed-data path
# with os.path.join(LOCAL_PATH, "processed_data", ...); while it was commented
# out, that cell raised NameError on a freshly restarted kernel.
LOCAL_PATH = os.path.join(
    os.path.expanduser('~'), "code", PROJECT_LEAD, PROJECT_NAME)

# Feather file the speeches are loaded from.
# NOTE(review): this path has no PROJECT_LEAD directory level, so it does not
# sit under LOCAL_PATH -- confirm which directory layout is the intended one.
raw_data_path = "~/code/uk-pol-speech-classifier/polclassifier/Corp_HouseOfCommons_V2.feather"
data = pd.read_feather(raw_data_path)


In [7]:
# Filter and clean data
min_word_count = 400      # speeches with fewer whitespace-separated words are dropped
sample_size = 1000        # number of speeches drawn per party
parties_to_exclude = []   # party labels to drop even if they are big enough
random_seed = 42          # fixed seed so the undersampling is reproducible on re-run

# Keep only the columns used downstream and add the word count.
# assign() builds a new frame, which avoids setting a column on a slice
# (SettingWithCopyWarning). The count is vectorised; a missing text counts as
# 0 words and is therefore removed by the minimum-length filter below.
data = data[["speaker", "party", "text"]].assign(
    word_n_full=lambda frame: (
        frame["text"].str.split().str.len().fillna(0).astype(int)
    )
)

# Filter for min word count
data = data[data["word_n_full"] >= min_word_count]

# Only select big enough parties: a party qualifies when it has strictly more
# than `sample_size` remaining speeches, so sampling without replacement works.
n_speeches_by_party = (
    data.groupby("party")
    .size()
    .reset_index(name="n_speeches")
    .sort_values("n_speeches", ascending=False)
    .reset_index(drop=True)
)
big_parties = n_speeches_by_party[n_speeches_by_party.n_speeches > sample_size]["party"].tolist()
data = data[(data["party"].isin(big_parties)) & (~(data["party"].isin(parties_to_exclude)))]

# Undersample: draw `sample_size` speeches from every remaining party in one
# seeded call instead of growing a DataFrame with concat inside a loop.
df_undersampled = (
    data.groupby("party")
    .sample(n=sample_size, random_state=random_seed)
    .reset_index(drop=True)
)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced sample as test data. The split is stratified on
# the party label and uses a fixed seed so it is the same on every run.
party_labels = df_undersampled["party"]
data_train, data_test = train_test_split(
    df_undersampled,
    test_size=0.2,
    stratify=party_labels,
    random_state=42,
)

In [11]:
# Undersample data_test
speeches_per_party = 20
grouped_data = data_test.groupby('party')

# One seeded random draw of `speeches_per_party` speeches for each party,
# collected as a list that holds one DataFrame per party.
smaller_data_test = [
    party_speeches.sample(n=speeches_per_party, random_state=42)
    for _, party_speeches in grouped_data
]

In [14]:
# Inspect the per-party samples: a list holding one sampled DataFrame per party
smaller_data_test

[               speaker party  \
 230      Cheryl Gillan   Con   
 313         Simon Hart   Con   
 366       Graham Brady   Con   
 510           Tom King   Con   
 460      Gary Streeter   Con   
 454       Mark Garnier   Con   
 701    Robert Goodwill   Con   
 391       Damian Green   Con   
 776        Anna Soubry   Con   
 504          John Glen   Con   
 71           Ed Vaizey   Con   
 875     George Osborne   Con   
 241      Graham Stuart   Con   
 791         Eric Forth   Con   
 191  James Brokenshire   Con   
 288        James Paice   Con   
 418        Andrew Rowe   Con   
 768   Kevin Hollinrake   Con   
 524      Mark Francois   Con   
 337       David Howell   Con   
 
                                                   text  word_n_full  
 230  That point was well made. I, too, was concerne...          480  
 313  Like many hon. Members, I trawled through the ...          819  
 366  I have taken a lot of interventions and I shou...          462  
 510  I have already 

In [21]:
# Stack the per-party samples into a single DataFrame with a fresh 0..n-1 index
df = pd.concat(
    objs=smaller_data_test,
    ignore_index=True,
)

In [22]:
# Display the combined test sample
df

Unnamed: 0,speaker,party,text,word_n_full
0,Cheryl Gillan,Con,"That point was well made. I, too, was concerne...",480
1,Simon Hart,Con,"Like many hon. Members, I trawled through the ...",819
2,Graham Brady,Con,I have taken a lot of interventions and I shou...,462
3,Tom King,Con,I have already commented on the need for Membe...,962
4,Gary Streeter,Con,I was about to make the point that many Conser...,849
...,...,...,...,...
135,William Ross,UUP,The hon. Member for Yeovil (Mr. Ashdown) picke...,418
136,William Ross,UUP,Following on briefly from what the right hon. ...,459
137,David Trimble,UUP,I congratulate the hon. Member for Canterbury ...,1344
138,John Taylor,UUP,"With the leave of the House, I wish to reply t...",1018


In [23]:
# Write the combined test sample to CSV without the index column.
# NOTE(review): a later cell reads smaller_data_test.csv from
# LOCAL_PATH/processed_data rather than from this directory -- confirm which
# location is the intended one.
path = "~/code/uk-pol-speech-classifier/polclassifier/smaller_data_test.csv"
df.to_csv(path_or_buf=path, index=False)

In [None]:
# Path of the saved test sample inside the project's processed_data folder.
# LOCAL_PATH must already be defined by an earlier cell when this one runs.
path = os.path.join(LOCAL_PATH, "processed_data", "smaller_data_test.csv")

In [5]:
# Show the resolved path
path

'/home/hailinh/code/szaboildi/uk-pol-speech-classifier/processed_data/smaller_data_test.csv'

In [6]:
# Read the CSV at `path` into a DataFrame and display it
pd.read_csv(path)

Unnamed: 0,speaker,party,text,word_n_full
0,Cheryl Gillan,Con,"That point was well made. I, too, was concerne...",480
1,Simon Hart,Con,"Like many hon. Members, I trawled through the ...",819
2,Graham Brady,Con,I have taken a lot of interventions and I shou...,462
3,Tom King,Con,I have already commented on the need for Membe...,962
4,Gary Streeter,Con,I was about to make the point that many Conser...,849
...,...,...,...,...
135,William Ross,UUP,The hon. Member for Yeovil (Mr. Ashdown) picke...,418
136,William Ross,UUP,Following on briefly from what the right hon. ...,459
137,David Trimble,UUP,I congratulate the hon. Member for Canterbury ...,1344
138,John Taylor,UUP,"With the leave of the House, I wish to reply t...",1018
