This model analyses the House of Commons Oral and Written Questions data set to determine topics for categorising voters to an MP.

In [None]:
import pandas as pd
import json

In [None]:
import json

# Load the questions, keyed by party name. The encoding is given explicitly so
# the file is decoded the same way regardless of the platform's default locale.
# NOTE(review): '/content/...' looks like a Colab-specific absolute path —
# confirm, and consider making it configurable.
with open('/content/questions_by_party.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Show a compact summary (questions per party) rather than dumping every
# question into the cell output.
print({party: len(questions) for party, questions in data.items()})

{'Conservative': ['If her Department will make an assessment of the potential merits of excluding charity lotteries from the proposed gambling levy.', 'To ask the hon. Member for South West Bedfordshire, representing the Church Commissioners, what steps the Church of England is taking to encourage church choirs to engage with local schools.', 'If she will make a statement on her departmental responsibilities.', 'To ask the hon. Member for South West Bedfordshire, representing the Church Commissioners, what recent assessment the Church of England has made of the potential merits of recruiting additional ordained Ministers into Parish Ministry in the Diocese of Exeter.', 'If she will make a statement on her departmental responsibilities.', 'What assessment her Department has made of the potential impact of the gambling white paper on the horse racing sector.', 'To ask the hon. Member for South West Bedfordshire, representing the Church Commissioners, what steps the Church of England is t

In [None]:
# Flatten {party: [question, ...]} into one record per question so it can be
# loaded straight into a DataFrame.
transformed_data = [
    {"Party": party, "Question": question}
    for party, questions in data.items()
    for question in questions
]

# Preview only the first few records; printing the full list floods the output.
print(f"{len(transformed_data)} records; first 3:")
print(transformed_data[:3])

[{'Party': 'Conservative', 'Question': 'If her Department will make an assessment of the potential merits of excluding charity lotteries from the proposed gambling levy.'}, {'Party': 'Conservative', 'Question': 'To ask the hon. Member for South West Bedfordshire, representing the Church Commissioners, what steps the Church of England is taking to encourage church choirs to engage with local schools.'}, {'Party': 'Conservative', 'Question': 'If she will make a statement on her departmental responsibilities.'}, {'Party': 'Conservative', 'Question': 'To ask the hon. Member for South West Bedfordshire, representing the Church Commissioners, what recent assessment the Church of England has made of the potential merits of recruiting additional ordained Ministers into Parish Ministry in the Diocese of Exeter.'}, {'Party': 'Conservative', 'Question': 'If she will make a statement on her departmental responsibilities.'}, {'Party': 'Conservative', 'Question': 'What assessment her Department has 

In [None]:
# Build a DataFrame from the flattened records (one row per question, with the
# "Party" and "Question" keys as columns).
df = pd.DataFrame(transformed_data)

# Rich (truncated) notebook rendering of the frame.
display(df)

Unnamed: 0,Party,Question
0,Conservative,If her Department will make an assessment of t...
1,Conservative,To ask the hon. Member for South West Bedfords...
2,Conservative,If she will make a statement on her department...
3,Conservative,To ask the hon. Member for South West Bedfords...
4,Conservative,If she will make a statement on her department...
...,...,...
4459,Workers Party of Britain,If she will have discussions with the Secretar...
4460,Workers Party of Britain,If she will make a statement on her department...
4461,Workers Party of Britain,What recent discussions he has had with Avanti...
4462,Workers Party of Britain,If he will make a statement on his departmenta...


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stop-word corpus; it is read later via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# scikit-learn's built-in English stop words, materialised as a list.
stop_words = [*ENGLISH_STOP_WORDS]

# TF-IDF features over the question text. Terms must occur in at least 2
# questions (min_df) and in no more than 95% of them (max_df).
vectorizer = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
    stop_words=stop_words,
)
X = vectorizer.fit_transform(df['Question'])

In [None]:
# LDA for topic modelling
n_topics = 5
# A fixed seed makes the topics reproducible across kernel restarts, matching
# the seeded gensim model trained later in this notebook.
# NOTE(review): X holds TF-IDF weights; LDA is usually fit on raw term counts
# (e.g. CountVectorizer) — consider whether that is the better input here.
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)

# NMF
nmf = NMF(n_components=n_topics, random_state=42)
nmf.fit(X)

In [None]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Fitted decomposition model exposing ``components_``, iterated
            one topic (row of per-term weights) at a time.
        feature_names: Indexable collection mapping a term index to its text.
        num_top_words: Number of top-weighted terms to print per topic.

    Prints a ``Topic N:`` header (1-based) followed by the terms on one line,
    ordered from highest to lowest weight.
    """
    for position, weights in enumerate(model.components_, start=1):
        print(f"Topic {position}:")
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked_ids = weights.argsort()[::-1][:num_top_words]
        top_terms = [feature_names[term_id] for term_id in ranked_ids]
        print(" ".join(top_terms))

num_top_words = 15
feature_names = vectorizer.get_feature_names_out()

# Both models were fit on the same matrix, so they share one vocabulary and
# can be reported through the same helper.
for heading, fitted_model in (("LDA Topics:", lda), ("\nNMF Topics:", nmf)):
    print(heading)
    display_topics(fitted_model, feature_names, num_top_words)

LDA Topics:
Topic 1:
steps taking recent discussions department support colleagues cabinet northern reduce ireland veterans assessment help church
Topic 2:
list official engagements wednesday living cost january february july march november 13 increases 22 december
Topic 3:
departmental responsibilities statement make intensive grassroots gambling industries decarbonise charity ending vulnerable ticketing hotels administrations
Topic 4:
steps taking department assessment recent increase support help potential progress sector discussions impact state people
Topic 5:
steps taking department assessment recent support help improve energy number adequacy impact increase implications children

NMF Topics:
Topic 1:
departmental responsibilities statement make policy merits government public potential autumn portfolio ownership ministers assessment economy
Topic 2:
wednesday engagements list official january february march july 13 november 22 15 september december 28
Topic 3:
steps taking depa

In [None]:
# Alternative approach using gensim below.
# Open question: should the topics relate to the environment, health care, social issues, the economy and international relations?
import gensim
from gensim import corpora
from gensim.models import LdaModel

In [None]:
# Keep the NLTK stop words under their own name. The earlier TF-IDF cell binds
# `stop_words` to a scikit-learn list and passes it to TfidfVectorizer; reusing
# that name here would make a re-run of that cell silently pick up this set.
nltk_stop_words = set(stopwords.words('english'))


def preprocess(text):
    """Tokenise ``text`` with gensim and drop NLTK English stop words.

    Args:
        text: Raw question text.

    Returns:
        List of tokens produced by ``gensim.utils.simple_preprocess`` that are
        not in ``nltk_stop_words``.
    """
    tokens = gensim.utils.simple_preprocess(text)
    return [token for token in tokens if token not in nltk_stop_words]


# Store the token list for every question alongside the original text.
df['processed'] = df['Question'].apply(preprocess)

In [None]:
# Vocabulary built from the tokenised questions.
dictionary = corpora.Dictionary(df['processed'])

# Prune the vocabulary using the same document-frequency bounds as the
# TF-IDF vectoriser (no_below=2, no_above=0.95).
dictionary.filter_extremes(no_above=0.95, no_below=2)

# Bag-of-words representation of every question.
corpus = list(map(dictionary.doc2bow, df['processed']))

In [None]:
num_topics = 5

# Train the gensim LDA model; the fixed random_state keeps runs repeatable.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    random_state=42,
)

# Each entry is an (index, formatted-terms) pair; print one per line.
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.058*"discussions" + 0.055*"official" + 0.055*"list" + 0.055*"engagements" + 0.055*"wednesday" + 0.043*"cabinet" + 0.043*"colleagues" + 0.039*"recent" + 0.021*"scotland" + 0.020*"government"')
(1, '0.128*"steps" + 0.112*"taking" + 0.047*"department" + 0.043*"support" + 0.039*"help" + 0.017*"increase" + 0.015*"improve" + 0.014*"reduce" + 0.012*"ensure" + 0.011*"taken"')
(2, '0.294*"make" + 0.267*"statement" + 0.264*"departmental" + 0.006*"rights" + 0.005*"students" + 0.005*"climate" + 0.004*"human" + 0.004*"legal" + 0.004*"change" + 0.003*"case"')
(3, '0.099*"made" + 0.093*"assessment" + 0.056*"recent" + 0.027*"potential" + 0.027*"impact" + 0.026*"adequacy" + 0.025*"department" + 0.014*"progress" + 0.014*"policies" + 0.014*"level"')
(4, '0.038*"discussions" + 0.032*"whether" + 0.027*"commission" + 0.025*"recent" + 0.022*"state" + 0.020*"electoral" + 0.019*"secretary" + 0.018*"committee" + 0.017*"hon" + 0.016*"ask"')


In [None]:
def display_topics(model, num_words):
    """Print every topic of a gensim topic model.

    NOTE: this shares its name with the scikit-learn helper defined earlier in
    the notebook and replaces it once this cell has run.

    Args:
        model: Model exposing ``print_topics(num_topics, num_words)`` that
            returns ``(index, formatted_terms)`` pairs.
        num_words: Number of terms to include for each topic.
    """
    # -1 requests all topics. num_words is forwarded explicitly; previously it
    # was dropped, so the library default was used whatever the caller asked.
    for idx, topic in model.print_topics(num_topics=-1, num_words=num_words):
        print('Topic: {} \nWords: {}'.format(idx, topic))

# Print the topics of the gensim LDA model, passing 10 as num_words.
display_topics(lda_model, 10)

Topic: 0 
Words: 0.058*"discussions" + 0.055*"official" + 0.055*"list" + 0.055*"engagements" + 0.055*"wednesday" + 0.043*"cabinet" + 0.043*"colleagues" + 0.039*"recent" + 0.021*"scotland" + 0.020*"government"
Topic: 1 
Words: 0.128*"steps" + 0.112*"taking" + 0.047*"department" + 0.043*"support" + 0.039*"help" + 0.017*"increase" + 0.015*"improve" + 0.014*"reduce" + 0.012*"ensure" + 0.011*"taken"
Topic: 2 
Words: 0.294*"make" + 0.267*"statement" + 0.264*"departmental" + 0.006*"rights" + 0.005*"students" + 0.005*"climate" + 0.004*"human" + 0.004*"legal" + 0.004*"change" + 0.003*"case"
Topic: 3 
Words: 0.099*"made" + 0.093*"assessment" + 0.056*"recent" + 0.027*"potential" + 0.027*"impact" + 0.026*"adequacy" + 0.025*"department" + 0.014*"progress" + 0.014*"policies" + 0.014*"level"
Topic: 4 
Words: 0.038*"discussions" + 0.032*"whether" + 0.027*"commission" + 0.025*"recent" + 0.022*"state" + 0.020*"electoral" + 0.019*"secretary" + 0.018*"committee" + 0.017*"hon" + 0.016*"ask"


In [None]:
!pip install pyLDAvis
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from nltk.corpus import stopwords
import nltk
import pyLDAvis.gensim_models
import pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [None]:
# Score the gensim LDA model with the 'c_v' coherence measure, computed from
# the tokenised questions and the pruned dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['processed'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

print(f'Coherence Score: {coherence_lda}')

  and should_run_async(code)


Coherence Score: 0.45657970591837166


In [None]:
import pyLDAvis.gensim_models as gensimvis

# Enable pyLDAvis's notebook integration before building the visualisation.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the gensim model, its corpus and dictionary.
vis = gensimvis.prepare(lda_model, corpus, dictionary)

# Last expression of the cell: its value is rendered as the cell output.
pyLDAvis.display(vis)

  and should_run_async(code)
