In [1]:
# Mount Google Drive into the Colab filesystem so the data files under
# /content/drive/MyDrive can be read and written by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Fetch NLTK's stop-word corpus and keep the English list as a set for
# fast membership tests. The download must happen before the word list is read.
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
import pandas as pd
from sklearn.pipeline import make_pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from sklearn.preprocessing import OneHotEncoder
# from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [4]:
# Load the speech corpus (House of Commons, judging by the file name) from Google Drive.
corpus_feather_path = "/content/drive/MyDrive/Colab Notebooks/LeWagon_Project/PoliticalSpeeches/Data/Corp_HouseOfCommons_V2.feather"
uk_data = pd.read_feather(corpus_feather_path)

In [5]:
# Number of whitespace-separated words in each speech.
# Vectorized string ops replace the row-wise `apply(axis=1)`: `str.split()` with no
# separator already ignores leading/trailing whitespace, so the counts are identical
# to `len(text.strip().split())` without a Python-level call per row.
uk_data["word_n_full"] = uk_data["text"].str.split().str.len()
uk_data.columns

Index(['date', 'agenda', 'speechnumber', 'speaker', 'party', 'party.facts.id',
       'chair', 'terms', 'text', 'parliament', 'iso3country', 'word_n_full'],
      dtype='object')

In [40]:
# Keep only speeches longer than 400 words, restricted to the columns used downstream.
is_long_speech = uk_data["word_n_full"] > 400
uk_data_thin = uk_data.loc[is_long_speech, ["speaker", "party", "text", "word_n_full"]]

# Party label of every retained speech.
y = uk_data_thin["party"]

In [41]:
# Number of speeches drawn per party in the undersampling step.
sample_size = 1000

# Speech count per party, largest first.
n_speeches_by_party = (
    uk_data_thin.groupby("party")
    .size()
    .reset_index(name="n_speeches")
    .sort_values("n_speeches", ascending=False)
    .reset_index(drop=True)
)

# A party is "big" when it has strictly more than `sample_size` speeches,
# i.e. enough to draw `sample_size` of them without replacement.
has_enough_speeches = n_speeches_by_party.n_speeches > sample_size
big_parties = n_speeches_by_party.loc[has_enough_speeches, "party"].tolist()

# Restrict the corpus to speeches given by those parties.
from_big_party = uk_data_thin["party"].isin(big_parties)
uk_data_thin_big_parties = uk_data_thin[from_big_party]

In [42]:
# Display the per-party speech counts (sorted in descending order of n_speeches).
n_speeches_by_party

Unnamed: 0,party,n_speeches
0,Lab,103019
1,Con,101488
2,LibDem,21386
3,SNP,6080
4,DUP,2575
5,PlaidCymru,1632
6,UUP,1244
7,SDLP,822
8,Independent,569
9,GPEW,266


In [43]:
# Undersample: draw the same number of speeches (`sample_size`) from every big
# party so that the classes are balanced.
# A fixed seed makes the draw -- and therefore the exported feature/target
# files -- reproducible after a kernel restart.
RANDOM_SEED = 42

# Collect one sample per party and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on every iteration.
sampled_groups = [
    group_data.sample(n=sample_size, random_state=RANDOM_SEED)
    for _, group_data in uk_data_thin_big_parties.groupby('party')
]

# ignore_index=True yields a fresh 0..n-1 index, as the former reset_index(drop=True) did.
uk_data_thin_big_parties_undersampled = pd.concat(
    sampled_groups, axis=0, ignore_index=True)
uk_data_thin_big_parties_undersampled.head()

Unnamed: 0,speaker,party,text,word_n_full
0,Michael Alison,Con,Clauses 20 and 21 impose important new duties ...,1216
1,Ann Widdecombe,Con,"This has been a short but interesting debate, ...",505
2,Jackie Doyle-Price,Con,I congratulate my hon. Friend the Member for C...,1023
3,Nigel Evans,Con,I am grateful to have this opportunity to take...,464
4,Stephen Dorrell,Con,I want to deal with other emergency services. ...,538


In [44]:
def _middle_window(row):
    """Return the speech text limited to its central 600 words.

    Speeches of at most 600 words are returned unchanged; longer ones are cut
    to the 300 words on either side of the midpoint and re-joined with single
    spaces.
    """
    if row.word_n_full <= 600:
        return row.text
    midpoint = row.word_n_full // 2
    words = row.text.split()
    return " ".join(words[midpoint - 300:midpoint + 300])


# Untruncated speech texts.
texts_full = uk_data_thin_big_parties_undersampled["text"]

# Truncated speech texts. NOTE: despite the name, the cutoff is 600 words, not 500.
texts_500 = uk_data_thin_big_parties_undersampled[["text", "word_n_full"]].apply(
    _middle_window, axis=1)

In [45]:
# uk_data_thin_big_parties_undersampled["no_stop_text"] = uk_data_thin_big_parties_undersampled.apply(
#     lambda x: " ".join([word for word in x.text.split() if word not in stop_words]),
#     axis=1)
# uk_data_thin_big_parties_undersampled["word_n_no_stop"] = uk_data_thin_big_parties_undersampled.apply(
#     lambda x: len(x["no_stop_text"].split()),
#     axis=1)

In [46]:
# uk_data_thin_150_clean = uk_data_thin[uk_data_thin.word_n_no_stop >= 150]
# uk_data_thin_150_clean.shape

In [47]:
# texts_no_stop_1200 = uk_data_thin_150_clean[["no_stop_text", "word_n_no_stop"]].apply(
#     lambda x: x.no_stop_text if x.word_n_no_stop <= 1200
#     else " ".join(
#         x.no_stop_text.split()[
#             (x.word_n_no_stop//2)-600:(x.word_n_no_stop//2)+600]), axis=1)

In [48]:
# texts_1200_small = texts_no_stop_1200[:50000]

In [49]:
# TF-IDF vectorizer configuration:
#   min_df=5           -> drop terms that occur in fewer than 5 speeches
#   max_df=0.85        -> drop terms that occur in more than 85% of speeches
#   max_features=10000 -> keep at most the 10,000 most frequent remaining terms
#   stop_words         -> scikit-learn's built-in English stop-word list
tf_idf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.85,
                                    max_features=10000, stop_words="english")

# Fit on the 600-word-truncated texts (`texts_500`) rather than the full speeches.
# The features are exported as "..._600cutoff.csv", but the vectorizer was
# previously fitted on the untruncated "text" column, leaving the truncation
# unused and the file name misleading.
# The sparse result is densified because it is later wrapped in a DataFrame.
weighted_words = tf_idf_vectorizer.fit_transform(texts_500).toarray()

weighted_words.shape

(7000, 10000)

## Transforming the target

In [50]:
# One-hot encode the target: one indicator column per party.
encoder = OneHotEncoder()

# The encoder expects 2-D input, hence the double brackets around 'party'.
party_column = uk_data_thin_big_parties_undersampled[['party']]
one_hot_encoded = encoder.fit_transform(party_column)

# Densify the encoded matrix and label the columns "party_<name>".
party_column_names = encoder.get_feature_names_out(['party'])
one_hot_encoded_df = pd.DataFrame(one_hot_encoded.toarray(),
                                  columns=party_column_names)

one_hot_encoded_df

Unnamed: 0,party_Con,party_DUP,party_Lab,party_LibDem,party_PlaidCymru,party_SNP,party_UUP
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
6995,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6996,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6997,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6998,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [51]:
# Write the one-hot encoded party targets straight to Google Drive as CSV
# (the DataFrame index is written as the first, unnamed column).
target_csv_path = "/content/drive/MyDrive/Colab Notebooks/LeWagon_Project/PoliticalSpeeches/Data/target_1000sample_400min_600cutoff.csv"
one_hot_encoded_df.to_csv(target_csv_path)

In [52]:
# Write the dense TF-IDF feature matrix straight to Google Drive as CSV.
# Columns are the integer positions of the matrix columns, not the vocabulary terms.
features_csv_path = "/content/drive/MyDrive/Colab Notebooks/LeWagon_Project/PoliticalSpeeches/Data/features_1000sample_400min_600cutoff.csv"
pd.DataFrame(weighted_words).to_csv(features_csv_path)