# Task

1. Download Alice in Wonderland by Lewis Carroll from Project Gutenberg's website http://www.gutenberg.org/files/11/11-0.txt
2. Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.
3. Find Top 10 most important (for example, in terms of TF-IDF metric) words from each chapter in the text (not "Alice"); how would you name each chapter according to the identified tokens?
4. Find the Top 10 most used verbs in sentences with Alice. What does Alice do most often?

# Alice example

In [33]:
import re
import string
import nltk
import numpy as np
from nltk.corpus import wordnet 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download("wordnet")
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import sent_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vando\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vando\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vando\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vando\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

## 1. Download Alice

In [2]:
filename = 'alice_text.txt'
with open(filename, encoding='utf-8') as f:
    alice_text = f.read()
alice_text[:1000]

'\ufeffThe Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll\n\nThis eBook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this eBook or online at\nwww.gutenberg.org. If you are not located in the United States, you\nwill have to check the laws of the country where you are located before\nusing this eBook.\n\nTitle: Alice’s Adventures in Wonderland\n\nAuthor: Lewis Carroll\n\nRelease Date: January, 1991 [eBook #11]\n[Most recently updated: October 12, 2020]\n\nLanguage: English\n\nCharacter set encoding: UTF-8\n\nProduced by: Arthur DiBianca and David Widger\n\n*** START OF THE PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***\n\n[Illustration]\n\n\n\n\nAlice’s Adventures in Wonderland\n\nby Lewis Carroll\n\nTHE MILLENNIUM FULCRUM EDITION 3.0\n\n

## 2. Perform any necessary preprocessing on the text

In [3]:
# remove irrelevant text
alice_text_main = alice_text.split('CHAPTER I.')[2]
alice_text_main = alice_text_main.split('THE END')[0]
alice_text_main[:500]

'\nDown the Rabbit-Hole\n\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into\nthe book her sister was reading, but it had no pictures or\nconversations in it, “and what is the use of a book,” thought Alice\n“without pictures or conversations?”\n\nSo she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure of\nmaking a daisy-chain would be w'

In [37]:
# CODE 1: 
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
# The above func was borrowed here: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

def preprocess_text(base_text):
    # Move to lower case
    new_text = base_text.lower()

    # Remove contractions
    new_text = decontracted(new_text)

    # Remove bad symbols
    new_text = re.sub(r"[^\w\s]", "", new_text) # This call saves _
    new_text = re.sub("_", " ", new_text)

    # Split to tokens
    text_tokens = new_text.split()
    # tokens = TreebankWordTokenizer().tokenize(text) 

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    sw_tokens = [token for token in text_tokens if not token in stop_words]

    #Lemmatization
    lemmatizer = WordNetLemmatizer()
    # Lemmatize nouns
    lemm_tokens = [lemmatizer.lemmatize(token) for token in sw_tokens]
    # Lemmatize verbs
    lemm_tokens = [lemmatizer.lemmatize(token, "v") for token in sw_tokens]

    # Collect text from tokens
    new_text = " ".join(lemm_tokens) # For what reason this line stands (соеденить текст обратно?)
    
    return new_text



## 3. Find Top 10 most important words

In [39]:
# Split text on chapters
chapters = alice_text_main.split('CHAPTER ')

tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit_transform(chapters)

chapter_num = 1
for chapter in chapters:
    clear_chapter = preprocess_text(base_text = chapter)
    clear_chapter = re.sub(r'alice', '', clear_chapter)
    response = tfidf.transform([clear_chapter])
    feature_array = np.array(tfidf.get_feature_names())
    tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]

    top_ten_words = feature_array[tfidf_sorting][:10]
    print("\n"+str(chapter_num)+":\t"+str(top_ten_words))
    print("------------------------------------------------------------------------")
    chapter_num += 1


1:	['think' 'eat' 'say' 'little' 'bat' 'fall' 'key' 'try' 'wonder' 'happen']
------------------------------------------------------------------------

2:	['mouse' 'say' 'ill' 'pool' 'little' 'swim' 'think' 'cat' 'dear' 'fan']
------------------------------------------------------------------------

3:	['say' 'mouse' 'dodo' 'prize' 'bird' 'lory' 'ill' 'dry' 'know' 'thimble']
------------------------------------------------------------------------

4:	['grow' 'little' 'window' 'rabbit' 'say' 'run' 'fan' 'puppy' 'hear' 'come']
------------------------------------------------------------------------

5:	['say' 'caterpillar' 'pigeon' 'serpent' 'egg' 'youth' 'bite' 'try' 'think'
 'size']
------------------------------------------------------------------------

6:	['say' 'cat' 'footman' 'baby' 'mad' 'duchess' 'sneeze' 'pig' 'grin'
 'think']
------------------------------------------------------------------------

7:	['say' 'hatter' 'dormouse' 'march' 'hare' 'twinkle' 'draw' 'remark' 'time'
 

## 4 Find the Top 10 most used verbs in sentences with Alice. 

In [40]:
# tokenization - sentences case
sents = sent_tokenize(alice_text_main)

alice_sents = []
verbs = ''
for sentence in sents:
    clearSentence = preprocess_text(base_text = sentence)
    if 'alice' in clearSentence:
        alice_sents.append(clearSentence)
    else:
        continue

for sentence in alice_sents:
    s = nltk.pos_tag(sentence.split())
    for w in s:
        if 'VB' in w[1]:
            verbs += w[0] + ' '
        else:
            continue

response = tfidf.transform([verbs])
feature_array = np.array(tfidf.get_feature_names())
tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]

print(feature_array[tfidf_sorting][:10])

['say' 'think' 'know' 'run' 'come' 'make' 'begin' 'happen' 'king' 'look']


The algorithm considers word "king" as a verb, but originaly it is used as noun :)