In [3]:
import os
import pandas as pd
import numpy as np

In [6]:
PROJECT_LEAD = "szaboildi"
PROJECT_NAME = "uk-pol-speech-classifier"

###########  CONSTANTS  ###############
LOCAL_PATH = os.path.join(
    os.path.expanduser('~'), "code", PROJECT_LEAD, PROJECT_NAME)
raw_data_path = os.path.join(
            LOCAL_PATH, "raw_data", "Corp_HouseOfCommons_V2.feather")
data = pd.read_feather(raw_data_path)


In [7]:
# Filter and clean data
data = data[["speaker", "party", "text"]]
min_word_count=400
sample_size=1000
parties_to_exclude=[]

# Filter for min word count
data["word_n_full"] = data.apply(lambda row: len(row["text"].strip().split()), axis=1)
data = data[data["word_n_full"] >= min_word_count]

# Only select big enough parties
n_speeches_by_party = data.groupby("party").size().reset_index(name="n_speeches").\
sort_values("n_speeches", ascending=False).reset_index(drop=True)
big_parties = n_speeches_by_party[n_speeches_by_party.n_speeches > sample_size]["party"].tolist()
data = data[(data["party"].isin(big_parties)) & (~(data["party"].isin(parties_to_exclude)))]

# Undersample
df_undersampled = pd.DataFrame()

for group_name, group_data in data.groupby('party'):
    sampled_data = group_data.sample(sample_size)
    df_undersampled = pd.concat([df_undersampled, sampled_data], axis=0)

df_undersampled.reset_index(drop=True, inplace=True)

In [9]:
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(df_undersampled, test_size=0.2, random_state=42, stratify=df_undersampled["party"])

In [10]:
# Undersample data_test
speeches_per_party = 20
grouped_data = data_test.groupby('party')
smaller_data_test = []
for party, group in grouped_data:
    # Select randomly 20 speeches per party
    sampled_group = group.sample(n=speeches_per_party, random_state=42)
    # Add selected speeches to list
    smaller_data_test.append(sampled_group)

In [11]:
smaller_data_test

[               speaker party  \
 230      Gerald Malone   Con   
 313  Iain Duncan Smith   Con   
 366    Stephen Hammond   Con   
 510       Peter Lilley   Con   
 460       Michael Gove   Con   
 454   Gillian Shephard   Con   
 701      Boris Johnson   Con   
 391       Geoffrey Cox   Con   
 776         Greg Clark   Con   
 504        Mark Harper   Con   
 71      Charles Wardle   Con   
 875       Andrew Tyrie   Con   
 241     Andrea Leadsom   Con   
 791        John Bercow   Con   
 191       Andrew Percy   Con   
 288    Stephen O'Brien   Con   
 418     Quentin Davies   Con   
 768         Hugo Swire   Con   
 524      Michael Ellis   Con   
 337       John Redwood   Con   
 
                                                   text  word_n_full  
 230  I am afraid that I do not have time. The hon. ...         1272  
 313  I join the Prime Minister in paying tribute to...          799  
 366  I am grateful to you, Mr Speaker, for selectin...          657  
 510  The hon. Gentle

In [12]:
df = pd.concat(smaller_data_test, ignore_index=True)

In [13]:
df

Unnamed: 0,speaker,party,text,word_n_full
0,Gerald Malone,Con,I am afraid that I do not have time. The hon. ...,1272
1,Iain Duncan Smith,Con,I join the Prime Minister in paying tribute to...,799
2,Stephen Hammond,Con,"I am grateful to you, Mr Speaker, for selectin...",657
3,Peter Lilley,Con,The hon. Gentleman has already intervened thre...,484
4,Michael Gove,Con,I thank the Secretary of State for his announc...,939
...,...,...,...,...
135,Martin Smyth,UUP,I appreciate the opportunity of making a brief...,633
136,Martin Smyth,UUP,I wish to raise several points. I agree with m...,615
137,James Molyneaux,UUP,I share the reservations of the right hon. Mem...,929
138,William Ross,UUP,It is evident that the Liberal party has learn...,495


In [23]:
#Save smaller_data_test to csv
path="~/code/uk-pol-speech-classifier/polclassifier/smaller_data_test.csv"
df.to_csv(path, index=False)

In [None]:
path = os.path.join(LOCAL_PATH, "processed_data", "smaller_data_test.csv")

In [5]:
path

'/home/hailinh/code/szaboildi/uk-pol-speech-classifier/processed_data/smaller_data_test.csv'

In [6]:
pd.read_csv(path)

Unnamed: 0,speaker,party,text,word_n_full
0,Cheryl Gillan,Con,"That point was well made. I, too, was concerne...",480
1,Simon Hart,Con,"Like many hon. Members, I trawled through the ...",819
2,Graham Brady,Con,I have taken a lot of interventions and I shou...,462
3,Tom King,Con,I have already commented on the need for Membe...,962
4,Gary Streeter,Con,I was about to make the point that many Conser...,849
...,...,...,...,...
135,William Ross,UUP,The hon. Member for Yeovil (Mr. Ashdown) picke...,418
136,William Ross,UUP,Following on briefly from what the right hon. ...,459
137,David Trimble,UUP,I congratulate the hon. Member for Canterbury ...,1344
138,John Taylor,UUP,"With the leave of the House, I wish to reply t...",1018


In [14]:
np.unique(df["party"])

array(['Con', 'DUP', 'Lab', 'LibDem', 'PlaidCymru', 'SNP', 'UUP'],
      dtype=object)

In [15]:
libdem_data = data[data['party'] == "LibDem"]

In [16]:
libdem_data

Unnamed: 0,speaker,party,text,word_n_full
51,Paddy Ashdown,LibDem,No. The hon. Gentleman will discover why in a ...,527
53,Paddy Ashdown,LibDem,I am sorry to be discourteous to the hon. Memb...,629
55,Paddy Ashdown,LibDem,I hope that the hon. Gentleman will forgive me...,845
329,Robert Maclennan,LibDem,The two previous speeches have reversed the no...,2224
570,Archy Kirkwood,LibDem,It is always a pleasure to follow the hon. Mem...,1345
...,...,...,...,...
1955732,Norman Lamb,LibDem,I thank the hon. Gentleman for that. I will ca...,454
1955734,Norman Lamb,LibDem,I totally agree. The statistics that I am citi...,905
1955736,Norman Lamb,LibDem,I suspect that the right hon. Gentleman knows ...,414
1955755,Norman Lamb,LibDem,I thank the Minister for her response to the d...,619


In [17]:
lab_Data = data[data['party'] == "Lab"]
lab_Data

Unnamed: 0,speaker,party,text,word_n_full
4,Neil Kinnock,Lab,I am sure that I speak for the majority of hon...,2768
6,Neil Kinnock,Lab,"The hon. Gentleman says, "" Appeal."" Of course ...",850
59,Audrey Wise,Lab,Nothing in the Gracious Speech or in the Prime...,1344
67,Geoffrey Lofthouse,Lab,The Queen's Speech outlines the Government's l...,1661
73,Michael Martin,Lab,In the past few weeks in Glasgow and in Scotla...,691
...,...,...,...,...
1956183,Emma Hardy,Lab,I agree with my hon. Friend that the Governmen...,1020
1956185,Stephanie Peacock,Lab,It is a pleasure to serve under your chairmans...,447
1956187,Mike Kane,Lab,It is an honour to serve under your chairmansh...,1164
1956207,Lisa Nandy,Lab,"I beg to move, That Sir Lindsay Hoyle do take ...",811


# Debugging

In [108]:
import numpy as np

import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

import os
import pandas as pd
from colorama import Fore, Style

import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import OneHotEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim.downloader as api
from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

from sklearn.svm import SVC

def clean_text(text:str):
    # remove whitespace
    text = text.strip()
    # lowercase characters
    text = text.lower()
    # remove numbers
    text = "".join([l for l in text if not l.isdigit()])
    # remove punctuation
    text = "".join([l for l in text if l not in string.punctuation])

    # remove double spaces
    text = text.replace("  ", " ")

    # tokenize
    tokens = word_tokenize(text)

    # # remove stopwords - we're doing this in the
    # stop_words = set(stopwords.words("english"))
    # tokens = [w for w in tokens if w not in stop_words]

    # lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # join tokens
    cleaned_text = " ".join(lemmatized_tokens)

    return cleaned_text

def preprocess_text_col(
    df, max_word_count=600, extract_from="middle"):
    """
    df: dataframe to process
    max_word_count: amount of text to truncate long speeches to
    extract_from: where to get the truncated speech from for long speeches
        possible values: "start", "middle", "end"
        only "middle" is implemented so far
    min_df: hyperparameter for the TfidfVectorizer
    max_df: hyperparameter for the TfidfVectorizer
    max_features: hyperparameter for the TfidfVectorizer
    """
    # Truncating
    if extract_from == "middle":
        df["text"] = df.apply(
            lambda x: x.text if x.word_n_full <= max_word_count
            else " ".join(x.text.split()[
                (x.word_n_full//2)-(max_word_count//2):(x.word_n_full//2)+max_word_count//2]),
            axis=1)

    # Clean truncated text
    clean_texts = df["text"].apply(clean_text)

    return clean_texts

def pred_sklearn(model, X, speech: str = None) -> np.ndarray:

    """ Let's make a prediction using the latest ML model """

    # Create X_pred dataframe consisting of speech text and word count
    word_n_full = len(speech.strip().split())

    X_pred = pd.DataFrame(dict(
        text=[speech],
        word_n_full=[word_n_full],
    ))

    print("✅ Input string converted to dataframe, now preprocessing...\n")

    # Preprocess the input data
    X_processed = preprocess_text_col(X_pred)

    # Vectorise the processed text... HOW?


    tf_idf_vectorizer = TfidfVectorizer(
            min_df=1, max_df=0.99, max_features=10000,
            stop_words="english")
    tf_idf_vectorizer.fit(X)
    X_vectorized = tf_idf_vectorizer.transform(X_processed).toarray()

    print("... and vectorizing! ✅ \n")


    # Load model functionality specific to ML models
    model = model
    assert model is not None

    y_pred = model.predict(X_vectorized)

    print(f"✅ And the winner is ... {y_pred}")

    # Predict probabilities if the model supports it
    if hasattr(model, 'predict_proba'):
        y_prob = model.predict_proba(X_vectorized)
        print(f"✅ Probability estimates: {y_prob}")
    else:
        print("❌ Model does not support probability estimates.")
        y_prob = None

    return y_pred, y_prob

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hailinh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/hailinh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hailinh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
import pandas as pd
# Load  data
X = pd.read_csv("/home/hailinh/code/szaboildi/uk-pol-speech-classifier/processed_data/features_1000sample_400min_600cutoff_tfidf.csv")
y=pd.read_csv("/home/hailinh/code/szaboildi/uk-pol-speech-classifier/processed_data/target_1000sample_400min_600cutoff_tfidf.csv")

In [54]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6996,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6997,0.0,0.0,0.086956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
model = SVC(kernel="linear", C=1.32, probability=True)
model.fit(X, y)

  y = column_or_1d(y, warn=True)


In [59]:
texts=pd.read_csv("/home/hailinh/code/szaboildi/uk-pol-speech-classifier/raw_data/features_text_1000sample_400min_600cutoff.csv")
X_text=texts["text"]

In [109]:
y_pred, y_prob = pred_sklearn(model=model, X=X_text, speech="This is the first time that I have been accused of calling the shots on Government legislation. I know, although I will try to get through this speech without any further references to by-elections in Glasgow. My entry into this debate comes from my promotion last year of the Trade Union Rights and Freedoms Bill, which not only was supported by the TUC unanimously at congress, but received the support of action required by the Labour party conference. My Bill resulted from our experience in my constituency with Gate Gourmet. A large number of my constituents, largely Asian women, who were working for Gate Gourmet, a company that received contracts outsourced from British Airways, went to work one morning and were herded into a shed. A manager then came forward and sacked them by megaphone for refusing to sign up to new terms of work that would undermine their conditions, cut their pay and even threaten their pension status. At that time, we were looking for new legislation that would provide protection to those workers. However, some hon. Members will also recall that other members of the Transport and General Workers Union at the airport spontaneously came out on strike in solidarity. The airport is like a mining village. People work in different sections of the airport and for different companies. Many worked for companies to which contracts had been outsourced by British Airways and had worked with each other in the past. Indeed, many were members of the same family working in different areas, so there was a natural feeling of spontaneous support for the workers, largely women, who had been sacked so brutally by Gate Gourmet. The workers therefore came out on strike. As a result, the TGWU was threatened with tens of millions of pounds-the figure quoted was about Â£42 million-in fines by the aviation industry, which threatened to break the union. Eventually, the workers had to go back to work. Although the union did its best, some individuals suffered severe detriment, in both wages and conditions. Indeed, some of them are still not even back at work, despite all the union's hard work and best endeavours. It was in that climate that I expected the Employment Bill to reflect the concerns of trade union members, as well as members of the community throughout, and to reflect the Trade Union Rights and Freedoms Bill, which we sought to promote last year, because such incidents continue. Only last week we had a debate with the Minister present-I am grateful for his help in the discussions that we are having-because the Bakers, Food and Allied Workers Union had come forward about the Lyndale group. For three years, Lyndale had been planning to restructure, but without consultation with the trade unions. Lyndale went into administration, but within hours established itself as a new company trading in exactly the same way, laid off 700 workers, avoided its responsibilities for redundancy payments and, again, frogmarched some workers off its site. That is no way in which to treat workers in this day and age. The reason why companies can treat workers so is that we have fewer trade union rights now than we had in 1906, after the Taff Vale judgment. We still do not have the right to strike embedded in law and we no longer have the right to solidarity action. As a result, the reality of work for many people is that they are exploited and feel unprotected, so I was hoping for a more ambitious Bill than this one. I would like to set out those areas in which some of us will seek to amend the Bill in Committee and on Report to improve the situation, so that people can be represented properly by their trade unions, and so that industrial harmony can break out. Respect for trade union rights has been fundamental for co-operation between workers and employers for two centuries now, and has produced an industrial climate that has been beneficial to both sides of the industry, as well as to the community overall. I would like to go through those elements of the Bill that I would like to be amended and which I hope will form the agenda for later debate. Other Members on our Benches have issues that they would like to raise for amendment, too. The first issue that I want to discuss is industrial action. There is currently no right to strike in law in this country, although we have the right to be protected for a limited period from employers' attempts to threaten actions on breach of contract. In their most recent legislation on the issue, the Government improved the situation by extending the period of protection from eight to 12 weeks, but the right to strike is still not embedded in law, as it is across Europe. Furthermore, no protection is provided after 12 weeks, and I would like the Bill to be amended so that employees are given protection for an indefinite period when they go on strike. They should also have the right to take action if an employer takes action against them, unless the employer can prove that the detriment to, or the sacking of, a member of staff is not related to the industrial action. Even though people are protected for 12 weeks and action can be taken if they are sacked or suffer detriment, the Bill also fails to amend current law under which there is no effective way for them to ensure that they are reinstated. Their trade union may introduce a reinstatement order on their behalf following action by their employer, but such orders are made in only 0.2 per cent. of the cases that are brought, and few are implemented. I would therefore like the Bill to be amended to make effective the protection that people have when they take industrial action. The Bill also relates to agencies. We successfully introduced legislation to prevent agencies from being used to bring in workers to break strikes, but a loophole continues to exist, and agencies are still used to bring in strike-breaking workers. Indeed, in the Gate Gourmet case, the employer prepared well in advance by recruiting agency workers. I would like us to use a provision from the Trade Union Rights and Freedoms Bill to amend this Bill by placing a duty on the employer to inform the agency when a dispute takes place. At the moment, agencies often argue that they are unaware of such disputes. Unfortunately, the Bill also fails to consider ballot arrangements. The balloting process that trade unions currently have to undertake is complex and in many ways invites litigation and injunctions from employers. The Government need to examine a simplified notice system, which would benefit all sides. The employer should be given notice of future industrial action, but the amount of information required should be significantly reduced. For example, we have seen a number of disputes in which injunctions have been taken out against individual trade unions for not giving full information regarding the number, names and addresses of employees or the places where they work. Such requirements no longer reflect the modern workplace, where employees are moved from time to time, making it difficult to keep track of them, particularly when there is an element of outsourcing and contracts have been delegated to individual companies. I would like the Government to reconsider the issue, because it is in everyone's interests that we introduce a simplified procedure for giving notice to the employer, which requires less information about the number of employees who will participate in the dispute. In that way, we would overcome the unnecessary difficulties and, often, conflicts that embitter disputes as a result of the notice requirements in existing legislation. We recently had an extensive debate in Westminster Hall on the minimum wage at which the Financial Secretary to the Treasury was present. We hoped that the Bill would address a range of the issues that were raised, and although I welcome the fact that it addresses some of them, there are many outstanding ones. I still have a number of concerns-other hon. Members mentioned them during that Westminster Hall debate-including the Government's failure to reconsider the youth rate of the minimum wage. It is perplexing for many of us that the youth rate discriminates against younger workers. There are currently three national minimum wage rates based on age; workers between the ages of 16 and 17 receive Â£3.40 an hour, workers between the ages of 18 and 21 get Â£4.60 and workers aged over 22 get Â£5.52 an hour. What that means for the 16 to 17-year-olds is an annual wage of Â£6,630; and for the 18 to 21-year-olds, it is Â£8,970. I believe that those are poverty wages, and I find it almost impossible to understand how anyone, particularly an 18 to 20-year-old, could survive on a wage of Â£8,970 a year. Even for the over-22s, the annual minimum wage is only Â£10,764. These are poverty pay rates. My hon. Friend the Member for North Ayrshire and Arran (Ms Clark) laid early-day motion 329 before the House on the issue of the national minimum wage for young workers, and it was signed by more than 100 Labour Members. It called on the Governmentâ€œto take steps to remove the age discrimination in the national minimum wage and establish one rate for all workers irrespective of age.â€I regret that that principle has not been embodied in the Bill. The hon. Member for Huntingdon (Mr. Djanogly) referred to seafarers-again, in the recent Westminster Hall debate, we also asked that the Bill consider their position. I know that my hon. Friend the Member for North Ayrshire and Arran will also dwell on that subject. Speaking as someone who has steered debates on seafarers through the House for the past five years or longer, I have to say that I am extremely disappointed that the Bill does not even look at the current problem. I have discussed the issue with the Clerks, however, and found out that it will be in order to amend the Bill to deal with the matter later. Some Members will know the background to that debate because they have participated in it with me over the years. For a number of years, the Race Relations Act 1976 exempted shipping, so shipowners were able to discriminate in the payment of their workers on the basis of race. The European Union asked the Government to look again at the matter and there was a long campaign, organised by the National Union of Rail, Maritime and Transport Workers, in response to which the Government committed themselves to review the issue of discrimination on racial grounds, which meant that two seafarers doing exactly the same job on a British flagship were being paid differential rates based on their race. Most of us found that morally abhorrent, as it was dramatically perpetuating the exploitation of workers on low rates of pay. We campaigned and the Government responded, telling us that they would reform the law, but the reform that they introduced was unacceptable-the Government amended the legislation so that it could no longer discriminate on grounds of race, but discrimination on grounds of nationality continued. Most of us find that distinction almost impossible to fathom. We thought we had gained at least one concession when a former Minister gave a commitment to the Regulatory Reform Committee that all workers on UK flagships and ships working in British waters would be paid the minimum wage. I felt that that was a breakthrough, a concession and a victory that had resulted from our long-standing campaign. What we did not know then was that British waters would be defined not as territorial waters, as is generally accepted, but as Britain's internal waters. As a result, large numbers of workers-even on ferries, which we would usually consider to be within British waters-are not being paid the minimum wage, as employers continue to discriminate on the basis of nationality. I hope that in later stages of our consideration, we will be able to amend the Bill in order to secure fairness and equity for all workers, who should get the rate for the job, based on the nature of the job itself, not on their country of origin. In his introduction, the Minister spoke about the enforcement of the minimum wage. Many of us welcomed the statements made some time ago by the then Chancellor of the Exchequer, now the Prime Minister, about the allocation of additional funding of Â£3 million for the purpose of enforcement, but as we said in the Westminster Hall debate, it does not look as though much of that money has actually filtered through to the appointment of staff. I would welcome the Minister's pledge to reconsider that matter and the powers given to individual officers working for the enforcement team. Third-party actions also warrant further examination. At present, enforcement takes place when individuals demonstrate that they are being paid less than the minimum wage. In some circumstances that requires an act of courage, especially when exploitative employers seek to intimidate their work forces. Perhaps the Minister will consider tabling an amendment allowing third-party actions. That would enable trade unions and other organisations to represent individuals or groups of workers and bring cases relating to failure to pay the minimum wage, so that the system would no longer need to rely on individuals who can so easily be victimised and intimidated. The Minister mentioned voluntary workers and cadet force adult volunteers, but I believe that the Bill may allow us to look beyond those groups. We have received representations from the National Union of Journalists and the Performers Alliance, which includes Equity, the Musicians' Union and the Writers' Guild of Great Britain. According to the NUJ, many people, mostly in newspapers but across the media, are required to work voluntarily not just for a few hours a week or a few weeks but, in some cases, for between six and 12 months in order to get on to the ladder even to be considered for a permanent position. As a result, they fall outside any legislation that would protect them from exploitation and ensure that they were paid the minimum wage at some stage in their careers. I hope that the Minister will examine the issue of volunteers in those sectors and propose amendments, because we may not have another opportunity to tackle it for some time. The Performers Alliance unions also raised the issue of agency fees and their effect on the minimum wage. We know that the Government have helped greatly through their moves to protect workers from an agency which, at one point, was charging high fees and deducting them from wages, with the result that many workers' pay fell below the level of the minimum wage. However, there is still a loophole. Some agents have reconstructed themselves as publishers, so that workers can be charged a publication fee simply to be listed in a directory. They are being exploited by having to pay a fee upfront, which is deducted from their pay so that, again, it falls below the minimum wage. Perhaps that too could be amended in the Bill. The hon. Member for Broxbourne (Mr. Walker), who is no longer present, mentioned tips. We have been promised movement on that for a long time. A private Member's Bill was drafted in the hope that the Government could incorporate its provisions in future legislation. I think the time has come to give full protection to workers who depend on gratuities, so that those amounts are not deducted from their minimum wage. At present there are a number of ways in which employers can avoid the current legislation, with the result that workers do not receive the full reward for their good service. That too was raised in the Adjournment debate a few weeks ago, and again we hope that the Minister will be able to incorporate provisions from the private Member's Bill in this legislation. I hope that another issue raised by Members will be incorporated. The Bill provides powers of enforcement in regard to non-payment of the minimum wage, but we believe that the provision for the exchange of information between agencies should include the opportunity to deal with non-payment of holiday money, and with other payments not made by employers. The Bill should contain a right of protection, so that those officers who deal with the enforcement of the minimum wage can also deal with the non-payment of holiday pay and other payments that employees should have received but the employer has avoided paying. I also hope the Minister will consider the representations we have received from a number of unions-I am thinking in particular of the civil service union, the Public and Commercial Services Union-on workplace environmental representatives. We were hoping that the role that such representatives play in their companies and their work sectors would be recognised in this Bill. Environmental representatives are like health and safety representatives in that they want to participate in the development of the policy of their company-or agency, or Government Department-with regard to improving the environmental standards of their workplace. At present, they undertake that role while some employers, including Government Departments, recognise them and some do not. They play a vital role in identifying areas where the environmental standards and performance of their employers can be improved, but they gain no statutory recognition and as a result no facilities in assisting them other than those that have been acquired voluntarily by representations to their employers. I would like the Government to look at the formal recognition of workplace environmental representatives, particularly on the day when we have received an Environmental Audit Committee report on the lack of progress, to put it diplomatically, especially of Government Departments in achieving the Government's own environmental targets. Finally, let me turn to the part of the Bill that deals with the ASLEF judgment. A range of problems have been identified in the concessions the Government have so far made in the other place. I believe they will impede the implementation of the spirit of the Government proposals, particularly with regard to the detail of what is required of a union in identifying what political party the member they are dealing with belongs to and how that can change over time. Other fairly onerous requirements will undermine the implementation of the European Court of Justice's decision on the matter. We will look at amendments as this Bill proceeds through Committee and on Report, but I think there is agreement in every part of the House that it is most important that we ensure that trade unions have the freedom not to have to accept as members those who hold offensive views and act offensively. That leads me on to another issue to do with our public services, which I think may be addressed in this part of the proposed legislation. We already have rules and regulations that prevent British National party members from serving in the police and prison services, but we do not have those rules for BNP members who work in other sections and Government Departments, such as the Department for Work and Pensions, where they are meant to be serving a multicultural community. I would like the Government to look at from what other areas of service beyond the police and the Prison Service we should bar the employment of BNP members or bar their continued membership of the BNP, because I believe that holding those views and being a member of that party infects their role in serving a multicultural community. We should do everything we can and employ the full legislative force to prevent BNP members from operating in those sectors and thereby undermining the ethos of fair and equal service to the public. I hope to work through those issues during the Bill's progress because, by doing so, I believe we have the opportunity to implement employment legislation that addresses some of the key issues that face many of our work force today. Mention has been made of the Warwick agreement and the discussions taking place at present about Warwick mark 2. On that, I wish to repeat my disappointment that we have not had the opportunity within this Employment Bill to assert in law, as is the case in the rest of Europe, the basic human right of someone to withdraw their labour, including by secondary action-sympathetic action-in solidarity with other workers. Until we can secure that right once again, people will continue to be exploited; there will be bad employers who seek to undermine their wages and conditions, and the balance between employer and employee will still be out of kilter. I urge the Government to look again at this Bill to see whether we can include once again in British law this protection and the basic human right to withdraw one's labour")

✅ Input string converted to dataframe, now preprocessing...

... and vectorizing! ✅ 





✅ And the winner is ... ['Con']
✅ Probability estimates: [[0.30261353 0.03648093 0.3037644  0.08037255 0.08932465 0.14282415
  0.04461979]]




In [110]:
y_pred.tolist()

['Con']

In [111]:
print(y_prob), print(type(y_prob)), print(y_prob.shape)

[[0.30261353 0.03648093 0.3037644  0.08037255 0.08932465 0.14282415
  0.04461979]]
<class 'numpy.ndarray'>
(1, 7)


(None, None, None)

In [112]:
np.max(y_prob[0])

0.3037643974812754

In [107]:
y_prob[0]

IndexError: invalid index to scalar variable.

In [104]:
print(dict(party=y_pred[0], probability=y_prob))

{'party': 'Con', 'probability': 0.30261352868602004}


In [90]:
if isinstance(y_prob, np.ndarray):
    if y_prob.size == 1:
        y_prob = float(y_prob[0])
    else:
        print("The numpy array y_prob should contain only one element.")

In [85]:
if y_prob is not None:
        print(dict(party=y_pred[0], probability=round(y_prob*100, 2)))
else:
        print(dict(party=y_pred[0]))

{'party': 'Con', 'probability': 30.26}


In [94]:
 y_prob = y_prob.tolist()

AttributeError: 'float' object has no attribute 'tolist'

# File sample with cut out text

In [2]:
import pandas as pd

df = pd.read_csv("/home/hailinh/code/szaboildi/uk-pol-speech-classifier/processed_data/smaller_data_test.csv")
df

Unnamed: 0,speaker,party,text,word_n_full
0,Cheryl Gillan,Con,"That point was well made. I, too, was concerne...",480
1,Simon Hart,Con,"Like many hon. Members, I trawled through the ...",819
2,Graham Brady,Con,I have taken a lot of interventions and I shou...,462
3,Tom King,Con,I have already commented on the need for Membe...,962
4,Gary Streeter,Con,I was about to make the point that many Conser...,849
...,...,...,...,...
135,William Ross,UUP,The hon. Member for Yeovil (Mr. Ashdown) picke...,418
136,William Ross,UUP,Following on briefly from what the right hon. ...,459
137,David Trimble,UUP,I congratulate the hon. Member for Canterbury ...,1344
138,John Taylor,UUP,"With the leave of the House, I wish to reply t...",1018


In [3]:
def cut_middle_600(text):

    text = text.strip().split()

    if len(text) <= 600:
        return text  # Return the original text if it's shorter than 600 characters
    else:
        start_index = (len(text) // 2) - 300  # Calculate the start index of the middle 600 characters
        return "..." + " ".join(text[start_index:start_index + 600]) + "..."  # Add "..." around the middle 600 characters

# Apply the function to the 'text' column
df['sample_text'] = df['text'].apply(cut_middle_600)

In [12]:
df["sample_text"]

0      [That, point, was, well, made., I,, too,, was,...
1      ...That idea seems to be taking root, and in f...
2      [I, have, taken, a, lot, of, interventions, an...
3      ...public inquiry or royal commission will hav...
4      ...an honest attempt to assert the primacy of ...
                             ...                        
135    [The, hon., Member, for, Yeovil, (Mr., Ashdown...
136    [Following, on, briefly, from, what, the, righ...
137    ...and other hon. Members, to please read the ...
138    ...behaving thoroughly unpleasantly towards ea...
139    ...some familiar problems. One of the great di...
Name: sample_text, Length: 140, dtype: object

In [6]:
#Save smaller_data_test to csv
path="/home/hailinh/code/szaboildi/uk-pol-speech-classifier/processed_data/smaller_data_sample_text.csv"
df.to_csv(path, index=False)

In [11]:
import os
PROJECT_LEAD = "szaboildi"
PROJECT_NAME = "uk-pol-speech-classifier"
LOCAL_PATH = os.path.join(
    os.path.expanduser('~'), "code", PROJECT_LEAD, PROJECT_NAME)
data_path = os.path.join(LOCAL_PATH, "processed_data", "smaller_data_sample_text.csv")
data = pd.read_csv(data_path)

In [14]:
# Filter DataFrame for tokenized texts
df_tokenized = df[df['sample_text'].apply(lambda x: isinstance(x, list))]

# Show the filtered DataFrame
print(df_tokenized)

                    speaker       party  \
0             Cheryl Gillan         Con   
2              Graham Brady         Con   
11           George Osborne         Con   
12            Graham Stuart         Con   
14        James Brokenshire         Con   
23              Jim Shannon         DUP   
26              Ian Paisley         DUP   
27              Jim Shannon         DUP   
29             Sammy Wilson         DUP   
31              Nigel Dodds         DUP   
33             Sammy Wilson         DUP   
44             Ben Bradshaw         Lab   
45              Steve Pound         Lab   
48            Jeremy Corbyn         Lab   
51             Jon Ashworth         Lab   
60            Sandra Gidley      LibDem   
61               Paul Rowen      LibDem   
62               David Laws      LibDem   
64             Norman Baker      LibDem   
65              David Heath      LibDem   
66               David Laws      LibDem   
67               David Laws      LibDem   
68         

In [15]:
def tokenize_if_needed(text):
    if isinstance(text, list):
        return text  # Already tokenized
    return text.split()  # Tokenize by splitting

# Apply the tokenization function to ensure all texts are tokenized
df['text'] = df['text'].apply(tokenize_if_needed)

# Filter DataFrame for tokenized texts
df_tokenized = df[df['text'].apply(lambda x: isinstance(x, list))]

# Function to cut the middle 600 words
def cut_middle_600(words):
    if len(words) <= 600:
        return " ".join(words)  # Return the original text if it's shorter than 600 words
    else:
        start_index = (len(words) // 2) - 300  # Calculate the start index of the middle 600 words
        # Ensure the start index is non-negative
        start_index = max(start_index, 0)
        return "..." + " ".join(words[start_index:start_index + 600]) + "..."  # Add "..." around the middle 600 words

# Apply the function to the filtered DataFrame
df_tokenized['sample_text'] = df_tokenized['text'].apply(cut_middle_600)

In [17]:
df = df_tokenized
df

Unnamed: 0,speaker,party,text,word_n_full,sample_text
0,Cheryl Gillan,Con,"[That, point, was, well, made., I,, too,, was,...",480,"That point was well made. I, too, was concerne..."
1,Simon Hart,Con,"[Like, many, hon., Members,, I, trawled, throu...",819,"...That idea seems to be taking root, and in f..."
2,Graham Brady,Con,"[I, have, taken, a, lot, of, interventions, an...",462,I have taken a lot of interventions and I shou...
3,Tom King,Con,"[I, have, already, commented, on, the, need, f...",962,...public inquiry or royal commission will hav...
4,Gary Streeter,Con,"[I, was, about, to, make, the, point, that, ma...",849,...an honest attempt to assert the primacy of ...
...,...,...,...,...,...
135,William Ross,UUP,"[The, hon., Member, for, Yeovil, (Mr., Ashdown...",418,The hon. Member for Yeovil (Mr. Ashdown) picke...
136,William Ross,UUP,"[Following, on, briefly, from, what, the, righ...",459,Following on briefly from what the right hon. ...
137,David Trimble,UUP,"[I, congratulate, the, hon., Member, for, Cant...",1344,"...and other hon. Members, to please read the ..."
138,John Taylor,UUP,"[With, the, leave, of, the, House,, I, wish, t...",1018,...behaving thoroughly unpleasantly towards ea...


In [18]:
# Filter DataFrame for tokenized texts
df_tokenized = df[df['sample_text'].apply(lambda x: isinstance(x, list))]
df_tokenized

Unnamed: 0,speaker,party,text,word_n_full,sample_text


In [19]:
#Save smaller_data_test to csv
path="/home/hailinh/code/szaboildi/uk-pol-speech-classifier/processed_data/smaller_data_sample_text.csv"
df.to_csv(path, index=False)