# Analysis of PM Lee's COVID-19 Speeches

## Data Wrangling and Cleaning

### 1. Import modules

In [9]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from multi_rake import Rake
import bar_chart_race as bcr

In [4]:
# Load the speeches CSV, then parse the "<day> <word> <year>" fragment that
# each title carries into a proper datetime column.
df = pd.read_csv('../Data/pm_lee_covid_speeches.csv')
title_dates = df['title'].str.extract(r'(\d{1,2}\s\w+\s202[0-2])', expand=False)
df['date'] = pd.to_datetime(title_dates)
df

Unnamed: 0,title,speech,date
0,PM Lee Hsien Loong on the COVID-19 situation i...,\nWe have faced the new coronavirus (nCoV) sit...,2020-02-08
1,PM Lee Hsien Loong on the COVID-19 situation i...,"\nMy fellow Singaporeans, good evening.\n\nFiv...",2020-03-12
2,PM Lee Hsien Loong on the COVID-19 situation i...,\n\nMy fellow Singaporeans\nGood afternoon\n\n...,2020-04-03
3,PM Lee Hsien Loong on the COVID-19 situation i...,\n\nMy fellow Singaporeans\n\nI spoke to you o...,2020-04-10
4,PM Lee Hsien Loong on the COVID-19 situation i...,\nFellow Singaporeans & Residents\n\nI am spea...,2020-04-21
5,National Broadcast by PM Lee Hsien Loong on 7 ...,\nMy fellow Singaporeans\n\nGood evening\n\nOu...,2020-06-07
6,PM Lee Hsien Loong on the COVID-19 situation i...,\nMy fellow Singaporeans\n\nWe are coming to a...,2020-12-14
7,PM Lee Hsien Loong on the COVID-19 'New Normal...,\nMy fellow Singaporeans\n\nFor the last three...,2021-05-31
8,Update on the COVID-19 Situation in Singapore ...,"\nProtect the Vulnerable, Secure our Future\nM...",2021-10-09
9,Speech by PM Lee Hsien Loong on COVID-19: A Ne...,"\nMy fellow Singaporeans,\n\nOur fight against...",2022-03-24


In [5]:
def remove_punct(text, punctuation="``±!@#$%^&*()+?:;”“’<>.,-—"):
    """Return ``text`` with every character listed in ``punctuation`` removed.

    Args:
        text (str): The string to clean.
        punctuation (str): Characters to delete. The default is the set this
            notebook strips before tokenizing; it does not contain the
            straight apostrophe or the straight double quote, so those are kept.

    Returns:
        str: A copy of ``text`` without any of the ``punctuation`` characters.
            Every other character, including whitespace, is left untouched.
    """
    # One translate() pass deletes all listed characters at once, instead of
    # re-scanning the whole string with replace() for every punctuation hit.
    return text.translate(str.maketrans("", "", punctuation))

# Take the first speech as a sample input for trying out pre_process
t = df.loc[0, 'speech']

def pre_process(text):
    """Normalize a raw speech into a lower-cased, space-separated token string.

    Steps: strip the punctuation handled by ``remove_punct``, turn newlines
    into spaces, lower-case the text, tokenize it with NLTK's
    ``word_tokenize`` and re-join the tokens with single spaces. Stop-word
    removal and lemmatization are not applied.

    Args:
        text (str): Raw speech text.

    Returns:
        str: The cleaned text, tokens separated by single spaces.
    """
    text = remove_punct(text)
    text = text.replace("\n", " ")
    text = text.lower()
    return ' '.join(word_tokenize(text))
    
# Smoke-test the cleaning pipeline on the sample speech
t = pre_process(t)    

In [6]:
# Store a cleaned copy of every speech next to the raw text
df['text_processed'] = df['speech'].apply(pre_process)
df

Unnamed: 0,title,speech,date,text_processed
0,PM Lee Hsien Loong on the COVID-19 situation i...,\nWe have faced the new coronavirus (nCoV) sit...,2020-02-08,we have faced the new coronavirus ncov situati...
1,PM Lee Hsien Loong on the COVID-19 situation i...,"\nMy fellow Singaporeans, good evening.\n\nFiv...",2020-03-12,my fellow singaporeans good evening five weeks...
2,PM Lee Hsien Loong on the COVID-19 situation i...,\n\nMy fellow Singaporeans\nGood afternoon\n\n...,2020-04-03,my fellow singaporeans good afternoon update o...
3,PM Lee Hsien Loong on the COVID-19 situation i...,\n\nMy fellow Singaporeans\n\nI spoke to you o...,2020-04-10,my fellow singaporeans i spoke to you on tv la...
4,PM Lee Hsien Loong on the COVID-19 situation i...,\nFellow Singaporeans & Residents\n\nI am spea...,2020-04-21,fellow singaporeans residents i am speaking to...
5,National Broadcast by PM Lee Hsien Loong on 7 ...,\nMy fellow Singaporeans\n\nGood evening\n\nOu...,2020-06-07,my fellow singaporeans good evening our fight ...
6,PM Lee Hsien Loong on the COVID-19 situation i...,\nMy fellow Singaporeans\n\nWe are coming to a...,2020-12-14,my fellow singaporeans we are coming to a full...
7,PM Lee Hsien Loong on the COVID-19 'New Normal...,\nMy fellow Singaporeans\n\nFor the last three...,2021-05-31,my fellow singaporeans for the last three week...
8,Update on the COVID-19 Situation in Singapore ...,"\nProtect the Vulnerable, Secure our Future\nM...",2021-10-09,protect the vulnerable secure our future my fe...
9,Speech by PM Lee Hsien Loong on COVID-19: A Ne...,"\nMy fellow Singaporeans,\n\nOur fight against...",2022-03-24,my fellow singaporeans our fight against covid...


In [7]:
# Build a speech-by-phrase matrix of RAKE scores: one row per speech (indexed
# by its date), one column per phrase that appears in any speech's first ten
# RAKE results, and 0 wherever a phrase was not among a speech's ten.
rake = Rake()  # a single extractor is reused for every speech

top_phrases = []
for speech in df['speech']:
    keywords = rake.apply(speech)  # (phrase, score) pairs
    top_phrases.append(dict(keywords[:10]))

# Constructing the frame once from per-speech dicts replaces the chain of
# outer merges on column 0. That chain produced clashing `1_x` / `1_y` suffix
# columns from the fourth speech onwards, which recent pandas rejects with a
# MergeError, and it left the scores in an object-dtype frame after the transpose.
d2 = pd.DataFrame(top_phrases, index=pd.Index(df['date'], name='date'))
d2 = d2.fillna(0)
d2

Unnamed: 0_level_0,ministerial task force,personal protective equipment,advanced research capabilities,large public events,observe personal hygiene,natural human reactions,team nila volunteers,stockpiled adequate supplies,avoid crowded places,quarantined close contacts,...,highest vaccination rates,plans moving forward,made tremendous sacrifices,drastically streamline testing,solidarity remains crucial,everyone’s collective effort,‘freedom day’ approach,enjoy larger gatherings,social responsibility —,protect singaporeans’ livelihoods
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-08,9.0,9.0,9.0,9.0,9.0,9.0,9.0,8.5,8.5,8.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-03-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-06-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-14,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-05-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-10-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-03-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,9.0,9.0,9.0,8.5,8.5,8.5,8.5,8.0


In [10]:
# Animate the per-speech RAKE scores in `d2` as a bar chart race
# (one frame group per speech date, showing the 10 highest-scoring phrases).
bcr.bar_chart_race(
    df=d2,
    n_bars=10,
    fixed_order=False,
    fig_kwargs={
        'figsize': (26, 15),
        'dpi': 120,
        'facecolor': '#F8FAFF'
    },
    period_length=1500,
    title={
        'label': "Key Phrases from PM Lee's COVID-19 Speeches",
        'size': 52,
        'weight': 'bold',
        'pad': 40
    },
    period_label={
        'x': 0.95, 
        'y': 0.15,
        'ha': 'right',
        'va': 'center',
        'size': 72,
        'weight': 'semibold'
    },
    bar_label_font={
        'size': 27
    },
    tick_label_font={
        'size': 27
    },
    bar_kwargs={
        'alpha': 0.99,
        'lw': 0  # was `'lw'L 0`, a typo that made the whole cell a SyntaxError
    },
    
#     filename="../video.mp4"
)

In [69]:
# df['speech'].str.len()
# df['text_processed'].str.len()

In [71]:
# Plain Python list of the cleaned speeches, one string per speech
docs = df['text_processed'].tolist()
# docs

In [72]:
from sklearn.feature_extraction.text import CountVectorizer

In [73]:
# Term counts per speech (rows = speeches, columns = vocabulary terms).
# NOTE(review): max_df=0.85 presumably drops terms that occur in more than
# 85% of the speeches — confirm against the scikit-learn documentation.
cv = CountVectorizer(max_df=0.85)
word_count_vector = cv.fit_transform(docs)
word_count_vector

<10x2458 sparse matrix of type '<class 'numpy.int64'>'
	with 4853 stored elements in Compressed Sparse Row format>

In [76]:
# Peek at the first ten terms stored in the fitted vocabulary
list(cv.vocabulary_)[:10]

['faced',
 'coronavirus',
 'ncov',
 'ministerial',
 'task',
 'force',
 'advised',
 'dpm',
 'heng',
 'swee']

In [77]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Fit a TF-IDF transformer (smoothed IDF) on the term counts held in word_count_vector
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF matrix over 1- to 3-word n-grams of the cleaned speeches.
# NOTE(review): max_df=0.5 presumably excludes n-grams present in more than
# half of the speeches — confirm against the scikit-learn documentation.
vectorizer = TfidfVectorizer(use_idf=True, max_df=0.5, min_df=1, ngram_range=(1,3))
vectors = vectorizer.fit_transform(df['text_processed'])
vectors

<10x19294 sparse matrix of type '<class 'numpy.float64'>'
	with 21662 stored elements in Compressed Sparse Row format>

In [80]:
# Invert the fitted vocabulary so a matrix column index looks up its token
dict_of_tokens = {column: token for token, column in vectorizer.vocabulary_.items()}
# dict_of_tokens

In [182]:
# tfidf_vectors = []  # all deoc vectors by tfidf
# for row in vectors:
#     tfidf_vectors.append({dict_of_tokens[column]:value for (column,value) in zip(row.indices,row.data)})


In [181]:
# print("The number of document vectors = ", len(tfidf_vectors), "\nThe dictionary of document[0]: ", tfidf_vectors[0])

In [180]:
# doc_sorted_tfidfs =[]  # list of doc features each with tfidf weight
# #sort each dict of a document
# for dn in tfidf_vectors:
#     newD = sorted(dn.items(), key=lambda x: x[1], reverse=True)
#     newD = dict(newD)
#     doc_sorted_tfidfs.append(newD)
    
# doc_sorted_tfidfs

RAKE

In [207]:
# 'chief guards officer' in df.loc[3, 'speech'].lower()
# print(df.loc[3, 'speech'])

In [260]:
# RAKE results for the speech at row 6; keep the first ten (phrase, score) rows
keywords6 = rake.apply(df.loc[6, 'speech'])
df6 = pd.DataFrame(keywords6).head(10)
df6

Unnamed: 0,0,1
0,antigen rapid tests,9.0
1,resume larger gatherings,9.0
2,ministerial task force,9.0
3,visit someone’s home,9.0
4,health sciences authority,9.0
5,systematically creating opportunities,9.0
6,global economic dislocation,8.75
7,distributing tracetogether tokens,8.5
8,clinical trial data,8.5
9,global aviation hub,8.25


In [264]:
# `df0` was never defined anywhere in the notebook (it only lived in the
# kernel's hidden state), so rebuild it here: the first ten RAKE
# (phrase, score) rows of the speech at row 0.
df0 = pd.DataFrame(rake.apply(df.loc[0, 'speech'])[:10])

# Outer-join on the phrase column (0) to see which phrases the two speeches
# share; the score columns come back as 1_x (row 0) and 1_y (row 6).
pd.merge(df0, df6, how='outer', on=0)

Unnamed: 0,0,1_x,1_y
0,ministerial task force,9.0,9.0
1,personal protective equipment,9.0,
2,advanced research capabilities,9.0,
3,large public events,9.0,
4,observe personal hygiene,9.0,
5,natural human reactions,9.0,
6,team nila volunteers,9.0,
7,stockpiled adequate supplies,8.5,
8,avoid crowded places,8.5,
9,quarantined close contacts,8.333333,


In [253]:
from multi_rake import Rake
rake = Rake(max_words=3)

all_phrases = []
d = {}

# For every speech, print its first five RAKE phrases with their scores and
# record each phrase's score rounded to one decimal. A phrase that was
# already recorded is flagged with "***" and its stored score is overwritten.
for idx in range(len(df)):
    keywords = rake.apply(df.loc[idx, 'speech'])

    for phrase, score in keywords[:5]:
        print(phrase, score)

        if phrase not in all_phrases:
            all_phrases.append(phrase)
        else:
            print("***", phrase)
        d[phrase] = round(score, 1)

    print()
pd.DataFrame(d, index=[0])
# print(all_phrases)
# print(len(all_phrases))

ministerial task force 9.0
personal protective equipment 9.0
advanced research capabilities 9.0
large public events 9.0
observe personal hygiene 9.0

kuala lumpur recently 9.0
saint peter’s square 9.0
high blood pressure 9.0
create additional capacity 9.0
made direct appeals 9.0

pre-empt escalating infections 9.0
press conference immediately 9.0
disrupt economic sectors 9.0
global supply chain 9.0
social distancing extremely 9.0

chief guards officer 9.0
public service agencies 9.0
favourite tv programmes 9.0
foreign worker dormitories 8.5
paying close attention 8.5

active treatment receive 9.0
maintaining key infrastructure 9.0
remained largely contained 9.0
multi-ministry task force 9.0
popular wet markets 9.0

contact tracing significantly 9.0
outbreak remains firmly 9.0
maintain personal hygiene 9.0
largest fiscal intervention 9.0
quick weekend trips 9.0

antigen rapid tests 9.0
resume larger gatherings 9.0
ministerial task force 9.0
*** ministerial task force
visit someone’s hom

Unnamed: 0,ministerial task force,personal protective equipment,advanced research capabilities,large public events,observe personal hygiene,kuala lumpur recently,saint peter’s square,high blood pressure,create additional capacity,made direct appeals,...,complicated flow charts,secured vaccine supplies,needed icu treatment,needed oxygen supplementation,badly needed beds,plans moving forward,migrant worker dormitories,made tremendous sacrifices,drastically streamline testing,solidarity remains crucial
0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,...,9.0,8.5,8.5,8.5,8.5,9.0,9.0,9.0,9.0,9.0


In [241]:
col = []
# Map each of the first 30 (phrase, score) pairs in `keywords` to its score,
# rounded to one decimal place, and show them as a single-row frame.
d = {phrase: round(score, 1) for phrase, score in keywords[:30]}

pd.DataFrame(d, index=[0])

Unnamed: 0,ministerial task force,personal protective equipment,advanced research capabilities,large public events,observe personal hygiene,natural human reactions,team nila volunteers,stockpiled adequate supplies,avoid crowded places,quarantined close contacts,...,ample supplies,avoid touching,extra mile,government’s response,lie ahead,medical facilities,national centre,trained doctors,important differences,hubei province
0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,8.5,8.5,8.3,...,4.5,4.5,4.5,4.0,4.0,4.0,4.0,4.0,4.0,4.0


In [None]:
# Toy example: build a small frame from a dict of equal-length lists
data = dict(col_1=[3, 2, 1, 0], col_2=['a', 'b', 'c', 'd'])
pd.DataFrame.from_dict(data)

In [220]:
len(keywords)

232

In [231]:
# pd.DataFrame(keywords)
# .pivot(columns=0, values=1)

Unnamed: 0,0,1
0,pre-empt escalating infections,9.0
1,press conference immediately,9.0
2,disrupt economic sectors,9.0
3,global supply chain,9.0
4,social distancing extremely,9.0
...,...,...
227,transmission,1.0
228,protect,1.0
229,seniors,1.0
230,ill,1.0


In [135]:
help(Rake)

Help on class Rake in module multi_rake.algorithm:

class Rake(builtins.object)
 |  Rake(min_chars=3, max_words=3, min_freq=1, language_code=None, stopwords=None, lang_detect_threshold=50, max_words_unknown_lang=2, generated_stopwords_percentile=80, generated_stopwords_max_len=3, generated_stopwords_min_freq=2)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, min_chars=3, max_words=3, min_freq=1, language_code=None, stopwords=None, lang_detect_threshold=50, max_words_unknown_lang=2, generated_stopwords_percentile=80, generated_stopwords_max_len=3, generated_stopwords_min_freq=2)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  apply(self, text, text_for_stopwords=None)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



TextRank

In [174]:
from summa import keywords

In [177]:
# TextRank keywords with scores for the raw text of the speech at row 9; print the first 30
TR_keywords = keywords.keywords(df.loc[9, 'speech'], scores=True)
print(TR_keywords[:30])

[('vaccinated', 0.2475221539865503), ('vaccine', 0.2475221539865503), ('national vaccination', 0.1601561675153647), ('worker', 0.14917952228556874), ('measures', 0.14689201760477072), ('measured', 0.14689201760477072), ('covid', 0.13894791912910842), ('new', 0.12914126949785804), ('personal', 0.1284812292911113), ('persons', 0.1284812292911113), ('day', 0.1272284038515296), ('days', 0.1272284038515296), ('needed', 0.12623963525949566), ('need', 0.12623963525949566), ('needing', 0.12623963525949566), ('healthcare workers', 0.12433623923144828), ('majority', 0.1156438027397464), ('safely', 0.11556832043566007), ('safe', 0.11556832043566007), ('travel', 0.1142871454920944), ('travellers', 0.1142871454920944), ('tested', 0.113921177746588), ('testing', 0.113921177746588), ('test', 0.113921177746588), ('case', 0.11121670107137734), ('singapore', 0.10332993531769606), ('major turning', 0.10056707464288946), ('singaporeans', 0.10018546866430139), ('omicron', 0.09923042222372601), ('wave', 0.0

In [178]:
# Same extraction on the cleaned text of the speech at row 9, for comparison with the raw-text run
TR_keywords = keywords.keywords(df.loc[9, 'text_processed'], scores=True)
print(TR_keywords[:30])

[('vaccinated', 0.24565986293593872), ('vaccine', 0.24565986293593872), ('national vaccination', 0.15895033493115093), ('worker', 0.1480505496026945), ('measured', 0.14578678110738694), ('covid', 0.1379070035544217), ('personal', 0.12750910148147382), ('persons', 0.12750910148147382), ('day', 0.1262742767153898), ('days', 0.1262742767153898), ('needed', 0.12528565060243532), ('need', 0.12528565060243532), ('needing', 0.12528565060243532), ('healthcare workers', 0.12339527752683288), ('omicron', 0.11973949163239495), ('omicrons', 0.11973949163239495), ('majority', 0.11491495644636918), ('safely', 0.11469439605914179), ('safe', 0.11469439605914179), ('travel', 0.11344448861100938), ('travellers', 0.11344448861100938), ('new wave', 0.11324489918823424), ('testing', 0.11305903597467334), ('test', 0.11305903597467334), ('case', 0.11040607800353987), ('stringent measures', 0.10442216182349584), ('singapore', 0.10254843721482261), ('major turning', 0.10004616478597864), ('singaporeans', 0.099

In [168]:
# Raw text of the speech at row 9 again, this time with summa's deaccent option switched on
TR_keywords = keywords.keywords(df.loc[9, 'speech'], scores=True, deaccent=True)
print(TR_keywords[:30])

[('vaccinated', 0.2475221539865503), ('vaccine', 0.2475221539865503), ('national vaccination', 0.1601561675153647), ('worker', 0.14917952228556874), ('measures', 0.14689201760477072), ('measured', 0.14689201760477072), ('covid', 0.13894791912910842), ('new', 0.12914126949785804), ('personal', 0.1284812292911113), ('persons', 0.1284812292911113), ('day', 0.1272284038515296), ('days', 0.1272284038515296), ('needed', 0.12623963525949566), ('need', 0.12623963525949566), ('needing', 0.12623963525949566), ('healthcare workers', 0.12433623923144828), ('majority', 0.1156438027397464), ('safely', 0.11556832043566007), ('safe', 0.11556832043566007), ('travel', 0.1142871454920944), ('travellers', 0.1142871454920944), ('tested', 0.113921177746588), ('testing', 0.113921177746588), ('test', 0.113921177746588), ('case', 0.11121670107137734), ('singapore', 0.10332993531769606), ('major turning', 0.10056707464288946), ('singaporeans', 0.10018546866430139), ('omicron', 0.09923042222372601), ('wave', 0.0

In [169]:
keywords.keywords?

In [171]:
keywords.get_graph(df.loc[9, 'speech'])

<summa.graph.Graph at 0x19556afea30>

In [103]:
help(keywords.keywords)

Help on function keywords in module summa.keywords:

keywords(text, ratio=0.2, words=None, language='english', split=False, scores=False, deaccent=False, additional_stopwords=None)



In [106]:
from keybert import KeyBERT

In [107]:
# Create the KeyBERT extractor on top of the 'all-mpnet-base-v2' model
# (the progress bars below show its files being downloaded on this run)
kw_model = KeyBERT(model='all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [108]:
help(KeyBERT.extract_keywords)

Help on function extract_keywords in module keybert._model:

extract_keywords(self, docs: Union[str, List[str]], candidates: List[str] = None, keyphrase_ngram_range: Tuple[int, int] = (1, 1), stop_words: Union[str, List[str]] = 'english', top_n: int = 5, min_df: int = 1, use_maxsum: bool = False, use_mmr: bool = False, diversity: float = 0.5, nr_candidates: int = 20, vectorizer: sklearn.feature_extraction.text.CountVectorizer = None, highlight: bool = False, seed_keywords: List[str] = None) -> Union[List[Tuple[str, float]], List[List[Tuple[str, float]]]]
    Extract keywords/keyphrases
    
    NOTE:
        I would advise you to iterate over single documents as they
        will need the least amount of memory. Even though this is slower,
        you are not likely to run into memory errors.
    
    Multiple Documents:
        There is an option to extract keywords for multiple documents
        that is faster than extraction for multiple single documents.
    
        However...this

In [111]:
# Ask KeyBERT for 20 key phrases of one to three words from the speech at row 9.
# Note: this rebinds `keywords`, which until now referred to the
# `summa.keywords` module imported for the TextRank cells.
keywords = kw_model.extract_keywords(
    df.loc[9, 'speech'],
    keyphrase_ngram_range=(1, 3),
    stop_words='english',
    highlight=True,
    top_n=20,
)

# Keep only the phrases and drop their scores
keywords_list = list(dict(keywords))

print(keywords_list)

['singaporeans fight covid', 'fight covid 19', 'living covid 19', 'watchful covid 19', 'moving forward covid', 'covid 19 journey', 'covid 19 resilience', 'emerge stronger pandemic', 'approach declared pandemic', 'pandemic control', 'battled covid 19', 'remain watchful covid', 'covid 19 bring', 'support let pandemic', 'covid 19', 'covid 19 reached', 'fight covid', 'pandemic united people', 'covid 19 patients', 'way fight covid']
