-
Notifications
You must be signed in to change notification settings - Fork 3
/
1.Raghuram_Rajan_speech_text_analysis.py
369 lines (207 loc) · 8.78 KB
/
1.Raghuram_Rajan_speech_text_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
#!/usr/bin/env python
# coding: utf-8
# ## Project: RBI Governor Speech Texts - Sentiment Analysis
#
# ### Introduction
# RBI Governor Speech Texts Sentiment Analysis is the project on web scraping, text pre-processing and normalization, data visualization and sentiment analysis using data provided by [RBI](https://www.rbi.org.in/Scripts/BS_ViewSpeeches.aspx). Used various python tools and libraries to perform sentiment analysis over a speech texts by RBI governor.
# In[1]:
from bs4 import BeautifulSoup # extracting speech text from HTML doc
import nltk # for pre-processing text
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from textblob import TextBlob # for sentiment analysis
import string
from collections import Counter
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
from wordcloud import WordCloud
import seaborn as sns
sns.set_context('notebook')
# ### Extracting text from markup like HTML document formats for each speech


def _clean_markup_text(text):
    """Normalize whitespace in text pulled out of an HTML document.

    Breaks the text into lines, splits each line into phrases, strips each
    phrase, drops empty phrases, and joins the survivors with newlines.

    NOTE(review): the split is on a single space, so every word lands on its
    own line; the usual form of this boilerplate splits on a double space
    ("  ") to separate multi-headline lines — confirm which was intended.
    Downstream word tokenization is unaffected either way.
    """
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    # drop blank lines
    return '\n'.join(chunk for chunk in chunks if chunk)


def _extract_speech(path):
    """Return cleaned speech text from one saved RBI speech HTML page.

    Cleans the text of every element carrying class 'tablecontent2'; the
    cleaned text of the LAST matching element is returned, mirroring the
    original per-page loop which overwrote its result on every iteration.
    """
    # Context manager guarantees the handle is closed; the original
    # open(...).read() leaked the file handle.
    with open(path, encoding="utf8") as htmlfile:
        # Explicit parser avoids bs4's GuessedAtParserWarning and makes the
        # parse reproducible across environments.
        soup = BeautifulSoup(htmlfile.read(), "html.parser")
    speech = ''
    for element in soup.find_all(attrs={'class': 'tablecontent2'}):
        speech = _clean_markup_text(element.text.strip())
    return speech


# In[14]:
speech_1 = _extract_speech('RBI_governor_speech/Raghuram_rajan/Reserve Bank of India - Speeches_1.htm')
print(speech_1)
# In[15]:
speech_2 = _extract_speech('RBI_governor_speech/Raghuram_rajan/Reserve Bank of India - Speeches_2.htm')
print(speech_2)
# In[16]:
speech_3 = _extract_speech('RBI_governor_speech/Raghuram_rajan/Reserve Bank of India - Speeches_3.htm')
print(speech_3)
# In[17]:
# Stitch the three cleaned speech texts into a single corpus, one speech
# after another, separated by newlines.
raghuram_rajan_speeches = "\n".join([speech_1, speech_2, speech_3])
print(raghuram_rajan_speeches)
# ## Text Analysis Operations using NLTK
# In[18]:
# Split the combined corpus into sentences.
tokenized_text = sent_tokenize(raghuram_rajan_speeches)
print(tokenized_text)
# In[20]:
# Split the combined corpus into individual word tokens.
tokenized_word = word_tokenize(raghuram_rajan_speeches)
print(tokenized_word)
# In[21]:
# Frequency distribution over the raw word tokens.
fdist = FreqDist(tokenized_word)
print(fdist)
# In[22]:
fdist.most_common(5)
# In[23]:
# Plot the 30 most frequent tokens (non-cumulative counts).
fdist.plot(30, cumulative=False)
plt.show()
# In[24]:
# Stopwords ("is", "am", "the", ...) are noise: they dominate frequency
# counts without carrying meaning for the analysis.
stop_words = set(stopwords.words("english"))
print(stop_words)
# In[25]:
# Removing stopwords. NLTK's stopword list is entirely lowercase, so test
# membership case-insensitively — the original exact-match comparison let
# capitalized stopwords at sentence starts ("The", "But", ...) through.
filtered_word = [w for w in tokenized_word if w.lower() not in stop_words]
print("Tokenized Words:", tokenized_word[:100])
print("Filtered Words:", filtered_word[:100])
# In[26]:
# Frequency distribution after stopword removal.
fdist = FreqDist(filtered_word)
print(fdist)
# In[27]:
fdist.plot(30, cumulative=False)
plt.show()
# In[28]:
# Strip punctuation from every token. str.maketrans with a third argument
# builds a table mapping each character of string.punctuation to deletion,
# so translate() removes them all in one C-level pass.
table = str.maketrans('', '', string.punctuation)
stripped = [token.translate(table) for token in filtered_word]
print(stripped[:200])
# In[29]:
# Keep only purely alphabetic tokens — drops numbers, empty strings left
# behind by the translation, and mixed symbol tokens.
filtered_words = [word for word in stripped if word.isalpha()]
print(filtered_words[:100])
# In[30]:
# Frequency distribution after punctuation removal and the alpha filter.
fdist = FreqDist(filtered_words)
print(fdist)
# In[31]:
fdist.plot(30, cumulative=False)
plt.show()
# #### Much better after removing stopwords and punctuation!
# #### Lexicon Normalization
#
# Lexicon normalization tackles another kind of noise: derivationally
# related forms of a word (connection, connected, connecting) are reduced
# to a common root word ("connect").
# In[33]:
# Lexicon Normalization — Stemming: chop each word down to its stem.
ps = PorterStemmer()
stemmed_words = [ps.stem(token) for token in filtered_words]
print("Filtered Words:", filtered_words[:100])
print("Stemmed Words:", stemmed_words[:100])
# In[34]:
# Lemmatization: map each word to its dictionary base form — unlike
# stemming this yields real words.
lem = WordNetLemmatizer()
lemma_words = [lem.lemmatize(token) for token in filtered_words]
print("Filtered Words:", filtered_words[:100])
print("Lemmatize Words:", lemma_words[:100])
# #### Much better after performing Lemmatization
# In[37]:
print('Total number of words after text pre-processing :', len(lemma_words))
# ### WordCloud for Raghuram Rajan Speech
# #### Visualize word importance by frequency from Raghuram Rajan's speech
# #### using a WordCloud — a quick view of the themes and audience he addresses
# In[43]:
# Join the normalized words back into one text. The original used
# str(lemma_words), which rendered the Python list repr — brackets, quotes
# and commas — into the text handed to WordCloud (and reused later for
# sentiment scoring). A space-joined string is the intended input.
filtered_speech_words = ' '.join(lemma_words)
wordcloud = WordCloud(width=1000, height=500,
                      random_state=21, max_font_size=110).generate(filtered_speech_words)
plt.figure(figsize=(18, 15))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# #### Great! We can say his speech is mostly focused on topics - Economy growth, Market, Public sector bank, Government, Loan, Debt etc.
# #### Now lets plot most occurring words from Raghuram Rajan's speech
# In[44]:
# Count every unique normalized word.
speech_word_counts = Counter(lemma_words)
# In[49]:
# Bar-plot the 20 most frequently occurring words. Compute the top-20 list
# once and unzip it into parallel word/count sequences.
top_twenty = speech_word_counts.most_common(20)
rr_common_words = [pair[0] for pair in top_twenty]
rr_common_counts = [pair[1] for pair in top_twenty]
# Using background style
plt.style.use('dark_background')
plt.figure(figsize=(18, 12))
sns.barplot(x=rr_common_words, y=rr_common_counts)
plt.title('Most Common Words used by Raghuram Rajan')
plt.show()
# In[50]:
# Print a two-column table of the 25 most common words and their counts.
print("25 most common words:\nWord\t\tCount")
for word, count in speech_word_counts.most_common(25):
    print("{}\t\t{}".format(word, count))
# ### Get Sentiment scores from Raghuram Rajan's speech
# In[52]:
# TextBlob ships a pre-trained sentiment model; build a blob over the
# pre-processed speech text and read its sentiment property.
speech_text_object = TextBlob(filtered_speech_words)
speech_text_object.sentiment
# TextBlob.sentiment returns a (polarity, subjectivity) pair: polarity is a
# float in [-1.0, 1.0]; subjectivity is a float in [0.0, 1.0], where 0.0 is
# very objective and 1.0 is very subjective.
# #### What these scores say is that Raghuram Rajan's speech text is fairly subjective (opinionated) but very neutral in polarity (not phrased in a negative or positive way)
# ### Plot the words by their sentiment from Raghuram Rajan's speech
# In[61]:
plt.figure(figsize=(18,15))
# Draw each word on the chart at its own sentiment coordinates: positive
# words land to the right, negative to the left, opinionated words near the top.
for word in lemma_words:
    word_sentiment = TextBlob(word).sentiment
    plt.text(word_sentiment.polarity,      # x coordinate
             word_sentiment.subjectivity,  # y coordinate
             word)                         # the text to draw
# Fix the axis ranges to the full sentiment domain.
plt.xlim(-1, 1)
plt.ylim(0, 1)
# Dashed vertical line marks neutral polarity.
plt.axvline(0, color='red', linestyle='dashed')
# Label the chart (the original y-label misspelled "purely" as "purly").
plt.title('Sentiment analysis of words from Raghuram Rajan speech\n')
plt.xlabel('Polarity (Negative or Positive)')
plt.ylabel('Subjectivity (0 - purely objective, 1 - purely subjective)')
# display
plt.show()
# ## Thank You.
# In[ ]: