## Task 5

1.	Download Alice in Wonderland by Lewis Carroll from Project Gutenberg's website 
    http://www.gutenberg.org/files/11/11-0.txt
2.	Perform any necessary preprocessing on the text, including converting to lower case, 
    removing stop words, numbers / non-alphabetic characters, lemmatization.
3.	Find Top 10 most important (for example, in terms of TF-IDF metric) words 
    from each chapter in the text (not "Alice"); how would you name each chapter 
    according to the identified tokens?
4.	Find the Top 10 most used verbs in sentences with Alice. What does Alice do most often?
5.	*(not necessary) Find Top 100 most used verbs in sentences with Alice. 
    Get word vectors using a pre-trained word2vec model and visualize them. 
    Compare the words using embeddings.

In [1]:
import nltk
import string
import math
import pandas as pd

from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from string import punctuation
from os import listdir

In [2]:
# Fetch the NLTK data packages this notebook relies on; a package that is
# already installed is only reported as up to date.
nltk.download("wordnet")  # WordNet data behind WordNetLemmatizer
nltk.download("stopwords")  # word lists returned by stopwords.words('english')
nltk.download("punkt")  # sentence/word tokenizer models (sent_tokenize)
nltk.download("averaged_perceptron_tagger")  # model used by nltk.pos_tag

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sultan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sultan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sultan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sultan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Load text into memory
# "utf-8-sig" reads ordinary UTF-8 and additionally discards a leading byte-order
# mark if the file has one, so the first word is not prefixed with an invisible
# "\ufeff" character that would make it fail the later isalpha() filter.
with open("./11-0.txt", "r", encoding="utf-8-sig") as f_src:
    text = f_src.read()

In [4]:
# Clean text
def clean_text(txt):
    """Split ``txt`` into cleaned word tokens.

    Steps, in order:
      1. Dashes that join two words ("--", en dash, em dash) are replaced by a
         space so the neighbouring words remain separate tokens.
      2. The text is split on whitespace.
      3. Every non-alphanumeric character is removed from each token. For ASCII
         input this is the same as deleting ``string.punctuation``; it also
         covers typographic quotes and other non-ASCII punctuation.
      4. Tokens that are not purely alphabetic afterwards are dropped.
      5. English stop words are dropped, compared case-insensitively so that
         capitalised forms such as "There" are removed as well.
      6. Tokens of three characters or fewer are dropped.

    The casing of the surviving tokens is left untouched.

    Args:
        txt: Text to tokenize.

    Returns:
        list[str]: The cleaned tokens in their original order.
    """
    # Build the lookup set once, before the per-token loop.
    stop_words = set(stopwords.words('english'))

    for dash in ("--", "\u2013", "\u2014"):
        txt = txt.replace(dash, " ")

    tokens = []
    for raw_token in txt.split():
        word = "".join(ch for ch in raw_token if ch.isalnum())
        if not word.isalpha():
            # Empty after stripping, or contains digits.
            continue
        if word.lower() in stop_words:
            continue
        if len(word) > 3:
            tokens.append(word)
    return tokens

In [5]:
# Turn the whole book into clean tokens: clean_text() strips punctuation and drops
# non-alphabetic tokens, English stop words and words of three letters or fewer.
# Casing is not changed here; lower-casing happens in a later cell.
words = clean_text(text)

In [None]:
# Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
# The WordNet lookup is case-sensitive and its entries are lower case, so a
# capitalised token (e.g. the first word of a sentence) would come back
# unchanged. Lower-case each token before asking for its verb lemma (pos='v').
lem_words = [wordnet_lemmatizer.lemmatize(word.lower(), pos='v') for word in words]

In [None]:
# Lowercase
# Normalise every lemma to lower case, then preview the first ten.
lem_words = list(map(str.lower, lem_words))
print(lem_words[:10])

In [None]:
# Chapters
# Split the token stream on the marker token "chapter". Tokens that appear before
# the first marker are collected in a list that is never stored, so they are
# discarded.
# NOTE(review): every occurrence of the token "chapter" opens a new entry, so a
# table of contents or running text containing that word would add extra
# "chapters" - check the printed count against the book.
chapters = []
tmp_chap = []
for token in lem_words:
    if token != "chapter":
        tmp_chap.append(token)
        continue
    # Marker reached: start a fresh chapter and register it right away.
    tmp_chap = []
    chapters.append(tmp_chap)

print(f"Total count of chapters: {len(chapters)}")

print("First 3 words per chapter:")
for chapter_words in chapters:
    print(chapter_words[:3])

In [None]:
# TF
def get_tf(word, f_text):
    """Return the term frequency of ``word`` in ``f_text``.

    Term frequency is the number of occurrences of ``word`` divided by the total
    number of tokens in ``f_text``.

    Args:
        word: Token to look up.
        f_text: Sequence of tokens (for example one chapter as a list of str).

    Returns:
        float: Relative frequency in the range [0, 1]; 0.0 when ``f_text`` is
        empty, instead of raising ZeroDivisionError.
    """
    if not f_text:
        return 0.0
    # Count directly; building a Counter of the whole text on every call only to
    # read a single entry is wasted work.
    return f_text.count(word) / float(len(f_text))

In [None]:
# IDF
def get_idf(word, chaps):
    """Return the inverse document frequency of ``word`` across ``chaps``.

    The value is log(number of chapters / number of chapters containing
    ``word``), using the natural logarithm. When no chapter contains the word
    (including when ``chaps`` is empty) the result is 0.

    Args:
        word: Token to look up.
        chaps: Collection of chapters, each a container of tokens.

    Returns:
        float: The IDF value, or the integer 0 when the word occurs nowhere.
    """
    containing = sum(1.0 for chap in chaps if word in chap)
    if containing == 0:
        return 0
    return math.log(len(chaps) / containing)
    

### Note

I would name each chapter after the first three words found for it.

In [None]:
# Top-10 TF-IDF in chapters
# For every chapter build a Counter mapping word -> TF-IDF score (rounded to five
# decimals), leaving out the token "alice".
chapters_list = list()
for chapter in chapters:
    vocab_chapters = Counter()
    # dict.fromkeys() yields each distinct word once, in order of first
    # appearance. The score of a word is the same for every occurrence, so
    # scoring it once gives the same table (and the same tie order in
    # most_common) without the repeated work.
    for word in dict.fromkeys(chapter):
        if word != "alice":
            score = get_tf(word, chapter) * get_idf(word, chapters)
            vocab_chapters[word] = round(score, 5)
    chapters_list.append(vocab_chapters)

# enumerate() supplies the chapter number directly. Looking it up with
# chapters_list.index() returns the first *equal* Counter, which mislabels any
# chapter whose score table equals an earlier one (e.g. two empty chapters).
for chapter_number, chapter_scores in enumerate(chapters_list, start=1):
    print("Chapter " + str(chapter_number))
    print(pd.DataFrame(chapter_scores.most_common(10), columns=["Word", "TF-IDF"]))
    print()
    

### Note

Alice most often says something, goes somewhere and thinks about something.

In [None]:
# Verbs with 'Alice'
# Count the lemmas of all verbs that occur in sentences mentioning Alice.
vocab_verbs = Counter()
sentences = sent_tokenize(text)
stop_words = set(stopwords.words('english'))  # built once, not once per sentence

for sentence in sentences:
    # Tag the sentence as written: the tagger uses the surrounding words, so it
    # must see the real word order rather than a stop-word-stripped, lemmatized
    # list of tokens.
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))
    if not any(token.lower() == "alice" for token, _ in tagged_tokens):
        continue
    for token, tag in tagged_tokens:
        # Every Penn Treebank verb tag starts with "VB" (VB, VBD, VBG, VBN, VBP,
        # VBZ); matching only "VB" would skip past-tense forms such as "said".
        if not tag.startswith("VB"):
            continue
        token = token.lower()
        # Skip punctuation/numbers, auxiliaries in the stop-word list, and the
        # name itself in case the tagger mislabels it.
        if not token.isalpha() or token in stop_words or token == "alice":
            continue
        # Lemmatize the lower-cased form so that e.g. "Said" and "said" both
        # count as "say".
        vocab_verbs[wordnet_lemmatizer.lemmatize(token, pos='v')] += 1
pd.DataFrame(data=vocab_verbs.most_common(10), columns=["Verb", "Count"])