# Task 1.5 - Text Mining

## Importing Data and Libraries

In [None]:
#Import Libraries

from textblob import TextBlob
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')  # Use a non-interactive backend
import nltk
import nltk
import re
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from collections import Counter
sns.set()

In [None]:
# Read the scraped text file into one flat string

# NOTE(review): no encoding is passed, so the platform default applies and
# errors='ignore' silently drops undecodable bytes - confirm the file's encoding.
with open('20th_Century_Scrape.txt', 'r', errors='ignore') as file:
    # Re-joining the newline-separated pieces with spaces puts the text on one line
    data = ' '.join(file.read().split('\n'))

## Tokenization

In [None]:
# Sentence tokenization: split the flattened text in `data` with NLTK's
# sent_tokenize and print the result for inspection

from nltk.tokenize import sent_tokenize
tokenized_sent = sent_tokenize(data)
print(tokenized_sent) 

In [None]:
# Word tokenization: split `data` into word/punctuation tokens with NLTK's
# word_tokenize; `tokenized_word` is the input for the frequency counts below

from nltk.tokenize import word_tokenize
tokenized_word = word_tokenize(data)
print(tokenized_word) 

In [None]:
# Checking frequency distribution: count every raw token (stop words and
# punctuation are still included at this point) and print the FreqDist

from nltk.probability import FreqDist
dist_words = FreqDist(tokenized_word)
print(dist_words) 

In [None]:
# Finding 10 most common words: (token, count) pairs for the ten most frequent raw tokens

dist_words.most_common(10)

In [None]:
# Frequency Distribution Plot: the 10 most frequent raw tokens, plotted as
# per-token counts (cumulative = False)

plt.figure(figsize=(8, 3))
dist_words.plot(10,cumulative = False)
plt.show()

## Remove Stop Words

In [None]:
# Defining stopwords: load NLTK's English stop-word list into a set so the
# membership tests in the filtering step are fast

from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
print(stop_words)

In [None]:
# Removing stopwords in words

# NLTK's English stop-word list is lowercase, so each token is lowercased for
# the membership test only; otherwise capitalised stop words such as "The" or
# "And" slip through. Tokens that are kept retain their original casing.
filtered_words = [word for word in tokenized_word if word.lower() not in stop_words]

In [None]:
# Display the tokens that remain after stop-word removal
filtered_words

In [None]:
# Create a new FreqDist for filtered_words (stop words removed, punctuation
# tokens still present) and print it

dist_words_filter = FreqDist(filtered_words)
print(dist_words_filter)

In [None]:
# Frequency Distribution Plot: the 10 most frequent tokens after stop-word removal

plt.figure(figsize=(8, 3))
dist_words_filter.plot(10, cumulative = False)
plt.show()

In [None]:
# Display the filtered frequency distribution
dist_words_filter

In [None]:
# Substitute every non-letter character (punctuation, digits) with a space

# The tokens are joined with spaces instead of passing str(filtered_words):
# the list repr adds brackets and quotes and backslash-escapes some characters
# (e.g. '\xa0'), and the letters of those escapes would survive the
# substitution as bogus tokens.
sans_punc = re.sub("[^a-zA-Z]",  # Search for all non-letters
                   " ",          # Replace all non-letters with spaces
                   " ".join(filtered_words))

In [None]:
# Word tokenization of the letters-only text produced by the substitution above

tokenized_word_2 = word_tokenize(sans_punc)
print(tokenized_word_2)

In [None]:
# Create a new FreqDist over the tokens left after stop-word and punctuation removal

dist_words_filter_2 = FreqDist(tokenized_word_2)

In [None]:
# Frequency Distribution Plot: the 30 most frequent cleaned tokens

plt.figure(figsize=(8, 3))
dist_words_filter_2.plot(30, cumulative = False)
plt.show()

## Before and after removing stopwords/punctuation
##### The results of the text mining are far more useful and telling after removing stopwords and punctuation. More discernible words, such as proper nouns and the names of countries that were notorious for their involvement in various wars, now appear.

## POS Tags list with textblob

In [None]:
# Extra tokens to exclude before POS tagging
# (presumably the single letters are fragments left after punctuation removal - confirm)
new_stopwords = "And Then n t s The".split()

In [None]:
# Keep every cleaned token that is not one of the extra stop words
filtered = [token for token in tokenized_word_2 if token not in new_stopwords]

In [None]:
%%time
# Hand TextBlob plain space-separated text. str(filtered) would pass the list
# repr instead, making the brackets, quotes and commas part of the text that
# is tokenized and POS-tagged.
text = TextBlob(" ".join(filtered))

In [None]:
# Display the TextBlob object
text

In [None]:
# Part-of-speech tags from TextBlob; loaded below into a two-column (word, tag) DataFrame
tags_list = text.tags

In [None]:
# Display the tagged tokens
tags_list

In [None]:
# Two-column frame built from the tag pairs, labelled in a single chained step
df_text = pd.DataFrame(tags_list).set_axis(['Words', "Word type"], axis=1)

In [None]:
# Preview the first rows of the word/tag table
df_text.head()

In [None]:
# Count rows per POS tag, keeping the tag as an ordinary column;
# after this step the 'Words' column holds the number of tokens per tag
df_t = df_text.groupby('Word type', as_index=False).count()

In [None]:
# Preview the per-tag counts
df_t.head()

In [None]:
# The 20 POS tags with the highest token counts ('Words' holds the count here)
top20 = df_t.nlargest(20, 'Words')

In [None]:
# Bar chart of token counts (x) for each of the top 20 POS tags (y)
plt.figure(figsize=(10, 5))
with sns.dark_palette("xkcd:blue", 20):
    tag_axes = sns.barplot(data=top20, x="Words", y="Word type", saturation=0.9)
    tag_axes.set_title("20th Century - top 20 word types used")

## Three bar plots for nouns, verbs and adjectives

In [None]:
def word_analysis(word_type, tags=None):
    """Plot the ten most frequent words whose POS tag contains ``word_type``.

    Args:
        word_type: Tag fragment matched by substring against each tag, so
            "NN" also selects tags such as "NNS" and "NNP".
        tags: Iterable of ``(word, tag)`` pairs. Defaults to the
            notebook-level ``tags_list``.

    Returns:
        None. Prints a status line and draws a seaborn bar plot; nothing is
        drawn when no tag matches.
    """
    if tags is None:
        # The original read an undefined name (`my_list`); the tagged tokens
        # of this notebook live in `tags_list`.
        tags = tags_list
    word_type = str(word_type)
    matches = [row for row in tags if word_type in row[1]]
    print("filtered for " + word_type)
    if not matches:
        # Nothing to count or plot for this tag.
        return
    df = pd.DataFrame(matches, columns=["Word", "Occurences"])
    # "Occurences" holds the tag, so counting it per word gives the word frequency.
    counts = df.groupby('Word').count().reset_index()
    ranked = counts.sort_values(by=['Occurences'], ascending=False)
    top10 = ranked.nlargest(10, 'Occurences')
    plt.figure(figsize=(10, 5))
    sns.barplot(x="Word", y="Occurences", palette="rocket", saturation=0.9, data=top10).set_title("20th Century - most frequently used " + word_type + " type word")

### Nouns

In [None]:
# Nouns: select rows tagged NN, NNS or NNP, then count occurrences per word
# NOTE(review): NNPS is not selected - confirm this is intended
noun_rows = df_text['Word type'].isin(["NN", "NNS", "NNP"])
df = df_text[noun_rows].set_axis(["Word", "Occurences"], axis=1)
# "Occurences" holds the tag, so counting it per word gives the word frequency
x = df.groupby('Word', as_index=False).count()
y = x.sort_values(by='Occurences', ascending=False)
top10_n = y.nlargest(10, 'Occurences')

In [None]:
# Display the ten most frequent nouns
top10_n

In [None]:
# Bar chart of the ten most frequent nouns
plt.figure(figsize=(10, 5))
with sns.dark_palette("xkcd:blue", 10):
    noun_axes = sns.barplot(data=top10_n, x="Word", y="Occurences", saturation=0.9)
    noun_axes.set_title("20th Century - most frequently used nouns")

In [None]:
# Save the figure top_nouns
# NOTE(review): plt.savefig writes whichever figure is current; this assumes the
# nouns figure is still the active one when this cell runs - confirm
plt.savefig('top_nouns.png', bbox_inches='tight')

### Verbs

In [None]:
# Verbs: select rows tagged VB or VBD, then count occurrences per word
# NOTE(review): other verb tags (VBG, VBN, VBP, VBZ) are not selected - confirm this is intended
verb_rows = df_text['Word type'].isin(["VB", "VBD"])
df_v = df_text[verb_rows].set_axis(["Word", "Occurences"], axis=1)
# "Occurences" holds the tag, so counting it per word gives the word frequency
x = df_v.groupby('Word', as_index=False).count()
y = x.sort_values(by='Occurences', ascending=False)
top10_v = y.nlargest(10, 'Occurences')

In [None]:
# Display the ten most frequent verbs
top10_v

In [None]:
# Bar chart of the ten most frequent verbs
plt.figure(figsize=(10, 5))
with sns.dark_palette("xkcd:blue", 10):
    verb_axes = sns.barplot(data=top10_v, x="Word", y="Occurences", saturation=0.9)
    verb_axes.set_title("20th Century - most frequently used verbs")

In [None]:
# Save the figure top_verbs
# NOTE(review): plt.savefig writes whichever figure is current; this assumes the
# verbs figure is still the active one when this cell runs - confirm
plt.savefig('top_verbs.png', bbox_inches='tight')

### Adjectives

In [None]:
# Adjectives: select rows tagged JJ, then count occurrences per word
# NOTE(review): comparative/superlative tags (JJR, JJS) are not selected - confirm this is intended
adjective_rows = df_text['Word type'].eq("JJ")
df_a = df_text[adjective_rows].set_axis(["Word", "Occurences"], axis=1)
# "Occurences" holds the tag, so counting it per word gives the word frequency
x = df_a.groupby('Word', as_index=False).count()
y = x.sort_values(by='Occurences', ascending=False)
top10_a = y.nlargest(10, 'Occurences')

In [None]:
# Bar chart of the ten most frequent adjectives
plt.figure(figsize=(10, 5))
with sns.dark_palette("xkcd:blue", 10):
    adjective_axes = sns.barplot(data=top10_a, x="Word", y="Occurences", saturation=0.9)
    adjective_axes.set_title("20th Century - most frequently used adjectives")

In [None]:
# Save the figure top_adj
# NOTE(review): plt.savefig writes whichever figure is current; this assumes the
# adjectives figure is still the active one when this cell runs - confirm
plt.savefig('top_adj.png', bbox_inches='tight')

## Frequency each country is mentioned

In [None]:
# Rebuild one space-separated string from the filtered tokens
listToStr = ' '.join(map(str, filtered))

print(listToStr)

In [None]:
# Count how often each word occurs in the cleaned text; the country aliases
# are looked up in this Counter later on

# Every maximal run of word characters is one token
all_counts = Counter(re.findall(r'\w+', listToStr))

In [None]:
# Display the token counts
all_counts

In [None]:
# Load the country list; the first CSV column is used as the index
countries = pd.read_csv("countries_list_20th_century_1.5.csv", index_col = 0)

In [None]:
# Preview the raw country list
countries.head()

In [None]:
# Trim leading/trailing whitespace so the exact-match replacements below can hit
countries['country_name'] = countries['country_name'].str.strip()

In [None]:
# Preview the country list after stripping whitespace
countries.head()

In [None]:
# Replace names with aliases

country_aliases = {
    'United States': 'USA',
    'United States of America': 'USA',
    'America': 'USA',
    'U.S.': 'USA',
    'United Kingdom': 'UK',
    'Great Britain': 'UK',
    'Britain': 'UK',
}
# Whole-value replacement: only names that equal a key exactly are changed
countries['country_name'] = countries['country_name'].replace(country_aliases)

In [None]:
# Alias = the last space-separated word of each name (single-word names are
# kept unchanged; a multi-word name keeps only its final word)
countries['country_alias'] = countries['country_name'].apply(lambda x: x.rsplit(' ', 1)[-1])

In [None]:
# Plain Python list of the aliases to look up in the token counts
countries_list = countries['country_alias'].to_list()

In [None]:
# Look up each alias in the token counts (a Counter yields 0 for tokens it
# never saw); aliases that occur more than once in the list collapse into one key
dict_of_counts = {d : all_counts[d] for d in countries_list}

In [None]:
# Display the alias -> mention count mapping
dict_of_counts

In [None]:
# Wrap each count in a one-element list so the mapping becomes a single-row
# DataFrame with one column per country alias

dct = {alias: [mentions] for alias, mentions in dict_of_counts.items()}
df = pd.DataFrame(dct)

In [None]:
# Display the single-row frame (one column per alias)
df

In [None]:
# Turn the single row into one row per alias; reset_index moves the aliases
# out of the index into a column named 'index'
df = df.transpose().reset_index()

In [None]:
# Check the column dtypes of the reshaped frame
df.dtypes

In [None]:
# Display the reshaped frame (one row per alias)
df

In [None]:
# Give the two columns descriptive names: 'index' holds the aliases and
# column 0 holds the counts (modified in place)
df.rename(columns = {"index":"country_name", 0:"Times mentioned"}, inplace = True)

In [None]:
# Display the frame with its final column names
df

In [None]:
# Number of (rows, columns) in the country-mentions frame
df.shape

In [None]:
# One very large canvas (100 x 300 inches) for the country chart
plt.figure(figsize=(100, 300))

# Dark palette of 27 colours built from "#79C"
palette = sns.dark_palette("#79C", 27)

# Rank the countries by mention count before plotting
ranked_mentions = df.sort_values("Times mentioned", ascending=False)

# Draw the chart with the palette active, then label it
with sns.color_palette(palette):
    ax = sns.barplot(
        data=ranked_mentions,
        x="Times mentioned",
        y="country_name",
        saturation=0.9,
    )
    ax.set_title("20th Century - most frequently mentioned countries", fontsize=100)
    ax.set_xlabel("Times mentioned", fontsize=50)
    ax.set_ylabel("Country Name", fontsize=50)

    # Same tick label size on both axes
    for axis_name in ("x", "y"):
        ax.tick_params(axis=axis_name, labelsize=50)

### Thoughts on the Bar Chart

There is still further cleaning to be done. There are some unusable entries in the countries list, such as "States", "Islands" and "North", which could refer to any number of countries or perhaps aren't referring to countries at all. I also noticed that countries with two words in the name, such as United States and United Kingdom, do not appear in this graph as often as I would expect.

## Save Final Bar Chart

In [None]:
# country mentions: write the chart to a PNG file
# NOTE(review): plt.savefig writes whichever figure is current; this assumes the
# country chart is still the active figure when this cell runs - confirm
plt.savefig('20th_century_countries_mentions.png', bbox_inches='tight')