### Exploratory Data Analysis of Dr. Martin Luther King Jr.'s 1963 "I Have a Dream" speech
For more info on this speech, see https://en.wikipedia.org/wiki/I_Have_a_Dream

### Source & Load Corpus

In [None]:
# Open the speech text for reading; readable() reports whether the stream can be read from
fp = open(
    'MLK_I_Have_A_Dream.txt',
    mode='r',
    encoding='ISO-8859-1',
)
print(fp.readable())

In [None]:
# read() with no size argument returns the text from the current pointer position to the end of the file
print(fp.read())

In [None]:
# tell() returns the current position of the text file pointer
# (for text files the value is an opaque marker meant to be passed back to seek())
fp.tell()

In [None]:
# seek(0) repositions the file pointer to the beginning of the file.
# The optional second argument (whence) selects the reference point:
# 0 = start of file (default), 1 = current position, 2 = end of file.
# readline() returns one line from the file and moves the pointer to the next line
fp.seek(0)
print(fp.readline())

In [None]:
# readline() reads from the current pointer position, so repeated calls walk through the file line by line
print(fp.readline())

In [None]:
# readlines() returns the remaining lines (from the current pointer position) as a list of strings
print(fp.readlines())

In [None]:
# Rewind to the start of the file and read the entire text into `speech`
fp.seek(0)
speech = fp.read()
print(speech)

In [None]:
# Now that the text is held in the variable `speech`, the file object can be closed
fp.close()

### Clean/preprocess text

In [None]:
# Replace punctuation, special characters, newlines and tabs with a single space each
speech = speech.translate(
    str.maketrans({symbol: ' ' for symbol in '!#$%&@?,.:;+-*/=<>"\'()[\\]{|}~\n\t'})
)
speech

In [None]:
# Lowercase everything so that counting is case-insensitive (e.g. "Freedom" and "freedom" match)
speech = speech.lower()
speech

In [None]:
# Tokenize text: split() with no arguments splits on any run of whitespace
word_list = speech.split()
word_list

In [None]:
# Number of tokens currently in word_list
len(word_list)

In [None]:
# Keep only tokens that are at least three characters long
word_list = [token for token in word_list if len(token) >= 3]
word_list

In [None]:
# Number of tokens currently in word_list
len(word_list)

In [None]:
# Load NLTK's English stop-word list (the 'stopwords' corpus is downloaded first)
import nltk
from nltk import corpus
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words

In [None]:
# Remove stop words from corpus.
# A set gives constant-time membership tests; stop_words itself is left untouched as a list.
stop_word_lookup = set(stop_words)
word_list = [w for w in word_list if w not in stop_word_lookup]
word_list

In [None]:
# Number of tokens currently in word_list
len(word_list)

### Data Visualizations

In [None]:
# We want to draw a barplot of the most frequent words in this speech
# to do this, we must create a dictionary with word counts for each word, and sort the dictionary by word counts
dict = {}                            # Initialize dictionary
for w in word_list:
    dict[w] = dict.get(w, 0) + 1
# get() returns the value of a key (word) in a dictionary if it exists;
# if the key is missing, it returns a value 0
dict

In [None]:
len(dict)

In [None]:
# Since dictionaries are unordered, to sort the word counts from high 
# to low, we must convert the dictionary into a list (of tuples)
word_freq = []
for key, value in dict.items():
    word_freq.append((value, key))
word_freq
# items() returns a list of tuples with key-value pairs in a dictionary
# Keys and values are reversed in list word_freq for ease of sorting 

In [None]:
# Order the (count, word) tuples from the highest count to the lowest
word_freq = sorted(word_freq, reverse=True)
word_freq

In [None]:
# Keep the 20 highest-count entries
top20_words = word_freq[:20]
top20_words

In [None]:

# Convert the list of (count, word) tuples into two tuples for plotting:
# zip(*...) unpacks the pairs so that all counts land in `values` and all words in `labels`
values, labels = zip(*top20_words)
values

In [None]:
# Words matching the counts in `values`, in the same order
labels

In [None]:
# Bar chart of the most frequent words, with slanted tick labels
import matplotlib.pyplot as plt
plt.bar(x=labels, height=values)
plt.xticks(rotation=65)
plt.xlabel('Words')
plt.ylabel('Count')
plt.title('Barplot of Top 20 Most Frequent Words')

### Wordcloud
Install the wordcloud library from the Anaconda command prompt (note: running pip through `python -m` installs the package into the Python interpreter that the command invokes, which should be the same one your notebook kernel uses).

python -m pip install wordcloud

In [None]:
# Install wordcloud on Anaconda command prompt; WordCloud requires a text corpus as input
# python -m pip install wordcloud

from wordcloud import WordCloud
text_corpus = ' '.join(word_list)
# collocations must be the boolean False: the string 'FALSE' is non-empty and therefore
# truthy, which leaves bigram (collocation) detection switched on
wordcloud = WordCloud(
    width=500,
    height=500,
    background_color='white',
    collocations=False,
    min_font_size=16,
).generate(text_corpus)
plt.imshow(wordcloud)
plt.show()

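Question: How is it that "freedom" is the most frequent word in the barplot, but appears in a smaller font in the word cloud?

Ans: By default (`collocations=True`), wordcloud counts frequent bigrams such as "let freedom" as terms of their own and lowers the counts of the single words they contain, which reduces the frequency of the word "freedom". Passing the boolean `collocations=False` turns this off; any non-empty string, including `'FALSE'`, is truthy and leaves it on.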

In [None]:
# Create n-grams using NLTK
# n-grams is a way of preserving sequence (and meaning) of words
from nltk.util import ngrams
# NOTE: this rebinds word_list to every token of the cleaned speech (stop words and
# short words included); later cells that read word_list see this version
word_list = speech.split()
# Bigrams are sequences of two consecutive tokens, so n must be 2
bigrams = list(ngrams(word_list, 2))
bigrams

In [None]:
# Build trigrams (three consecutive tokens) and join each one into a single space-separated string
trigrams = [' '.join(gram) for gram in ngrams(word_list, 3)]
trigrams

### Sentiment Analysis

In [None]:
# The VADER lexicon must be available locally before the analyzer can be created
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

print(analyzer.polarity_scores(speech))

In [None]:
# Score text_corpus, the string built by joining word_list with spaces for the word cloud
print(analyzer.polarity_scores(text_corpus))

In [None]:
# Print the polarity scores of each token on its own
for w in word_list:
    print(w, analyzer.polarity_scores(w))

In [None]:
# Tally word-level sentiment: how many tokens carry a positive or negative score,
# and the mean compound score over the tokens whose compound score is non-zero
negative_words = 0
positive_words = 0
pol_words = 0
sum_pol = 0.0

for w in word_list:
    pol = analyzer.polarity_scores(w)
    if pol["compound"] != 0:
        pol_words += 1
        sum_pol += pol["compound"]
    if pol["neg"] > 0:
        negative_words += 1
    if pol["pos"] > 0:
        positive_words += 1

# Guard against division by zero when no token has a non-zero compound score
mean_pol = sum_pol / pol_words if pol_words else 0.0

print("Positive words: ", positive_words)
print("Negative words: ", negative_words)
print("Polarity ", mean_pol)

### Insights

In [None]:
# So what have we learned from the above analysis of Dr. King's "I Have a Dream" speech?