In [27]:
# Author: Ulya Bayram, Ph.D.
# ulyabayram@gmail.com
# Demo for the APSA2020 invited talk titled "What NLP and Machine Learning Can Reveal from The Political Texts"

import nltk

# Fetch the NLTK data packages used by the cells below: tokenizer models, the
# stopword list, WordNet, and the POS-tagger model.
# nltk.download() signals failure by returning False instead of raising, so the
# results are checked here rather than silently discarded.
NLTK_RESOURCES = ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger')

failed_downloads = [resource for resource in NLTK_RESOURCES if not nltk.download(resource)]
if failed_downloads:
    # Warn instead of raising: the data may already be installed locally
    # (e.g. when working offline), in which case the later cells still work.
    print('WARNING: could not download NLTK resources:', failed_downloads)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# Excerpt from the August 19, 2020 speech of Donald Trump, 45th President of the U.S.
# Built from adjacent string literals, which Python joins into one single-line string.
speech_ = (
    "We're also using the full power of the federal government to defeat, as you know, the China virus."
    " New cases have declined in 80 percent of the jurisdictions in the past week.  Eighty percent.  New Zealand,"
    " by the way, had a big outbreak.  And other countries that were held up to try and make us look not as good as"
    " we should look — because we’ve done an incredible job — but they're having a lot of outbreaks, but they'll be"
    " able to put them out and we put them out."
)

In [4]:
# Show the raw excerpt before any preprocessing is applied.
print(speech_)

We're also using the full power of the federal government to defeat, as you know, the China virus. New cases have declined in 80 percent of the jurisdictions in the past week.  Eighty percent.  New Zealand, by the way, had a big outbreak.  And other countries that were held up to try and make us look not as good as we should look — because we’ve done an incredible job — but they're having a lot of outbreaks, but they'll be able to put them out and we put them out.


In [13]:
# 1---- Normalization of the text: lowercase conversion, punctuation removal

# Lowercase conversion. Note that `speech_` is overwritten with the normalized text,
# so the cells below operate on the lowercased, comma-free version.
speech_ = speech_.lower()

print('Lowercase conversion result:')
print(speech_)
print('\n\n')  # blank lines separating the two results in the cell output

# Manual removal of punctuation: only the comma is selected for removal here.
speech_ = speech_.replace(',', '')

print('Punctuation (only comma) removal result:')
print(speech_)

['-', '-', '-', '-', '-', '-', '-', '-', '-', '-']
Lowercase conversion result:
we're also using the full power of the federal government to defeat, as you know, the china virus. new cases have declined in 80 percent of the jurisdictions in the past week.  eighty percent.  new zealand, by the way, had a big outbreak.  and other countries that were held up to try and make us look not as good as we should look — because we’ve done an incredible job — but they're having a lot of outbreaks, but they'll be able to put them out and we put them out.


["we're also using the full power of the federal government to defeat, as you know, the china virus.", 'new cases have declined in 80 percent of the jurisdictions in the past week.', 'eighty percent.', 'new zealand, by the way, had a big outbreak.', "and other countries that were held up to try and make us look not as good as we should look — because we’ve done an incredible job — but they're having a lot of outbreaks, but they'll be able to put

In [None]:
# 2----- Tokenization of the text: Split by sentences
# Keep the sentence list in a variable so it can be inspected after printing.
sentences_speech = nltk.sent_tokenize(speech_)
print(sentences_speech)

In [14]:
# Tokenization of the text: split the normalized speech at word boundaries.
# `tokens_speech` is the token list consumed by the filtering step that follows.
tokens_speech = nltk.tokenize.word_tokenize(speech_)
print(tokens_speech)

['we', "'re", 'also', 'using', 'the', 'full', 'power', 'of', 'the', 'federal', 'government', 'to', 'defeat', ',', 'as', 'you', 'know', ',', 'the', 'china', 'virus', '.', 'new', 'cases', 'have', 'declined', 'in', '80', 'percent', 'of', 'the', 'jurisdictions', 'in', 'the', 'past', 'week', '.', 'eighty', 'percent', '.', 'new', 'zealand', ',', 'by', 'the', 'way', ',', 'had', 'a', 'big', 'outbreak', '.', 'and', 'other', 'countries', 'that', 'were', 'held', 'up', 'to', 'try', 'and', 'make', 'us', 'look', 'not', 'as', 'good', 'as', 'we', 'should', 'look', '—', 'because', 'we', '’', 've', 'done', 'an', 'incredible', 'job', '—', 'but', 'they', "'re", 'having', 'a', 'lot', 'of', 'outbreaks', ',', 'but', 'they', "'ll", 'be', 'able', 'to', 'put', 'them', 'out', 'and', 'we', 'put', 'them', 'out', '.']


In [16]:
# Drop every token that contains a non-alphabetical character. This removes the
# remaining punctuation tokens, and also any token containing digits or apostrophes.
list_of_words = list(filter(str.isalpha, tokens_speech))
print(list_of_words)

['we', 'also', 'using', 'the', 'full', 'power', 'of', 'the', 'federal', 'government', 'to', 'defeat', 'as', 'you', 'know', 'the', 'china', 'virus', 'new', 'cases', 'have', 'declined', 'in', 'percent', 'of', 'the', 'jurisdictions', 'in', 'the', 'past', 'week', 'eighty', 'percent', 'new', 'zealand', 'by', 'the', 'way', 'had', 'a', 'big', 'outbreak', 'and', 'other', 'countries', 'that', 'were', 'held', 'up', 'to', 'try', 'and', 'make', 'us', 'look', 'not', 'as', 'good', 'as', 'we', 'should', 'look', 'because', 'we', 've', 'done', 'an', 'incredible', 'job', 'but', 'they', 'having', 'a', 'lot', 'of', 'outbreaks', 'but', 'they', 'be', 'able', 'to', 'put', 'them', 'out', 'and', 'we', 'put', 'them', 'out']


In [20]:
# 3----- Stopword removal

# What are stopwords? Print NLTK's English stopword list to show them.
list_of_stopwords = nltk.corpus.stopwords.words('english')
print(list_of_stopwords)
print('\n\n')  # blank lines separating the stopword list from the filtered words

# Remove the stopwords from the text.
# Membership is tested against a set so each lookup does not scan the whole list.
stopword_set = set(list_of_stopwords)
list_remaining_words = [current_word for current_word in list_of_words if current_word not in stopword_set]
print(list_remaining_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [24]:
# 4----- Lemmatization

# Map every remaining word to its lemma with the WordNet lemmatizer.
# NOTE(review): lemmatize() is called without a part-of-speech argument; NLTK documents
# the default as noun, so verb forms may pass through unchanged - confirm this is intended.
wnl = nltk.stem.WordNetLemmatizer()

list_lemmatized_words = list(map(wnl.lemmatize, list_remaining_words))
print(list_lemmatized_words)

['also', 'using', 'full', 'power', 'federal', 'government', 'defeat', 'know', 'china', 'virus', 'new', 'case', 'declined', 'percent', 'jurisdiction', 'past', 'week', 'eighty', 'percent', 'new', 'zealand', 'way', 'big', 'outbreak', 'country', 'held', 'try', 'make', 'u', 'look', 'good', 'look', 'done', 'incredible', 'job', 'lot', 'outbreak', 'able', 'put', 'put']


In [25]:
# 5----- Stemming

# The stemmer gets its own name so that it does not overwrite `wnl`, the WordNet
# lemmatizer created in the lemmatization step; re-running cells in any order then
# never leaves `wnl` pointing at an object without a lemmatize() method.
porter_stemmer = nltk.stem.PorterStemmer()

# Stem the stopword-free words (not the lemmatized ones) so both techniques
# start from the same input.
list_stemmed_words = [porter_stemmer.stem(current_word) for current_word in list_remaining_words]
print(list_stemmed_words)

['also', 'use', 'full', 'power', 'feder', 'govern', 'defeat', 'know', 'china', 'viru', 'new', 'case', 'declin', 'percent', 'jurisdict', 'past', 'week', 'eighti', 'percent', 'new', 'zealand', 'way', 'big', 'outbreak', 'countri', 'held', 'tri', 'make', 'us', 'look', 'good', 'look', 'done', 'incred', 'job', 'lot', 'outbreak', 'abl', 'put', 'put']


In [28]:
# 6----- POS (Part-of-Speech) Tagging

# nltk.pos_tag() called without a `tagset` argument returns Penn Treebank tags
# (e.g. 'NN', 'JJ', 'VBD'), not the coarse universal tags.
# NOTE(review): the input here is lowercased and has its stopwords removed, so the
# tagger sees less context than it would on the original sentences; consider tagging
# tokens of the raw text if tag accuracy matters.
tagged_words = nltk.pos_tag(list_remaining_words)
print(tagged_words)

# Penn Treebank tags (the tags printed by the call above), common subset:
# Tag     Meaning
# ------------------------------------------------------------------------
# JJ      adjective
# NN      noun, singular or mass
# NNS     noun, plural
# RB      adverb
# IN      preposition or subordinating conjunction
# PRP     personal pronoun
# VB      verb, base form
# VBD     verb, past tense
# VBG     verb, gerund or present participle
# VBN     verb, past participle
# VBP     verb, non-3rd person singular present
# VBZ     verb, 3rd person singular present
#
# Universal tagset, for reference. These coarse tags are only produced by
# nltk.pos_tag(words, tagset='universal'), which additionally requires
# nltk.download('universal_tagset').
# https://www.nltk.org/book/ch05.html
# Tag     Meaning                   English Examples
# ------------------------------------------------------------------------
# ADJ     adjective                 new, good, high, special, big, local
# ADP     adposition                on, of, at, with, by, into, under
# ADV     adverb                    really, already, still, early, now
# CONJ    conjunction               and, or, but, if, while, although
# DET     determiner, article       the, a, some, most, every, no, which
# NOUN    noun                      year, home, costs, time, Africa
# NUM     numeral                   twenty-four, fourth, 1991, 14:24
# PRT     particle                  at, on, out, over per, that, up, with
# PRON    pronoun                   he, their, her, its, my, I, us
# VERB    verb                      is, say, told, given, playing, would
# .       punctuation marks         . , ; !
# X       other                     ersatz, esprit, dunno, gr8, univeristy

[('also', 'RB'), ('using', 'VBG'), ('full', 'JJ'), ('power', 'NN'), ('federal', 'JJ'), ('government', 'NN'), ('defeat', 'NN'), ('know', 'VBP'), ('china', 'VBZ'), ('virus', 'JJ'), ('new', 'JJ'), ('cases', 'NNS'), ('declined', 'VBD'), ('percent', 'JJ'), ('jurisdictions', 'NNS'), ('past', 'IN'), ('week', 'NN'), ('eighty', 'RB'), ('percent', 'JJ'), ('new', 'JJ'), ('zealand', 'NN'), ('way', 'NN'), ('big', 'JJ'), ('outbreak', 'NN'), ('countries', 'NNS'), ('held', 'VBD'), ('try', 'RB'), ('make', 'VB'), ('us', 'PRP'), ('look', 'VB'), ('good', 'JJ'), ('look', 'NN'), ('done', 'VBN'), ('incredible', 'JJ'), ('job', 'NN'), ('lot', 'NN'), ('outbreaks', 'VBZ'), ('able', 'JJ'), ('put', 'NN'), ('put', 'VBD')]
