In [160]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
import re
import nltk
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from collections import defaultdict
from typing import List

In [161]:
def clean_text(text : str, alice=False) -> List:
    text = text.lower() # lower case everything
    text = re.sub("\n", " ", text) # delete character of new line
    text = re.sub('[^A-Za-z0-9 ]', ' ', text) # remove bad symbols
    text = re.sub(r"[^\w\s]", "", text) # remove all symbol except letters and digits
    if not alice:
        text = re.sub("alice", "", text)
        
    tokens = TreebankWordTokenizer().tokenize(text) # tokenizing
    stop_words = stopwords.words("english") 
    tokens = [token for token in tokens if token not in stop_words] # remove english stopwords 
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens] # lemmatize
    text = " ".join(tokens) # make a text of it
    return text

In [162]:
# I pre deleted all text before 1 chapter and everything after the end of book (after the end of 12 chapter).
with open('alice/alice.txt', encoding='utf8') as f:
    text = f.read()

# separate text by chapters
chapters = [
    "CHAPTER I.",
    "CHAPTER II.",
    "CHAPTER III.",
    "CHAPTER IV.",
    "CHAPTER V.",
    "CHAPTER VI.",
    "CHAPTER VII.",
    "CHAPTER VIII.",
    "CHAPTER IX.",
    "CHAPTER X.",
    "CHAPTER XI.",
    "CHAPTER XII.",
]

In [163]:
text_by_chapters = {}
# cutting text by names of chapters 
for curr, next in zip(chapters, chapters[1:]):
    text_by_chapters[curr] = text[text.index(curr):text.index(next)][10:]
text_by_chapters[chapters[-1]] = text[text.index(chapters[-1]):][10:]

In [164]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
clean_text_by_chap = {}
for k, v in text_by_chapters.items():
    clean_text_by_chap[k] = clean_text(v)
    
clean_text_by_chap

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Роман\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Роман\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Роман\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'CHAPTER I.': 'rabbit hole beginning get tired sitting sister bank nothing twice peeped book sister reading picture conversation use book thought without picture conversation considering mind well could hot day made feel sleepy stupid whether pleasure making daisy chain would worth trouble getting picking daisy suddenly white rabbit pink eye ran close nothing remarkable think much way hear rabbit say oh dear oh dear shall late thought afterwards occurred ought wondered time seemed quite natural rabbit actually took watch waistcoat pocket looked hurried started foot flashed across mind never seen rabbit either waistcoat pocket watch take burning curiosity ran across field fortunately time see pop large rabbit hole hedge another moment went never considering world get rabbit hole went straight like tunnel way dipped suddenly suddenly moment think stopping found falling deep well either well deep fell slowly plenty time went look wonder going happen next first tried look make coming dark

In [165]:
# find top words by tf-idf
tfidf = TfidfVectorizer()
transformed_text = tfidf.fit_transform(clean_text_by_chap.values())

In [166]:
text_matrix = transformed_text.toarray()
names = tfidf.get_feature_names_out()

In [177]:
def get_ranks(ranks, top):
    # getting top ranks in line from array
    idx = np.take(np.argpartition(ranks, range(-top, 0), axis=1), range(-1, -top-1, -1), axis=1)
    return (idx, )[0]

In [178]:
ranks = get_ranks(text_matrix, 10)
ranks

array([[1102,  129,  504, 1500, 1002, 2133,  555, 1089, 1939,  568],
       [1216, 1419, 1102, 1296, 1897,  262,  447, 1611,  731, 1139],
       [1611, 1216,  501, 1450, 1501, 1122,  541, 1936, 1023,  170],
       [ 169, 1500, 1102, 2167, 1474,  799, 1301,  649,  192,  298],
       [ 265, 1611, 1385, 1662, 2219,  566, 1745,  663, 1102, 2138],
       [1611,  732,  262,  116, 1141,  542, 2197, 1089, 1384,  375],
       [ 865,  506, 1611, 1156,  857, 1914, 2038, 1966, 2138, 1996],
       [1489, 1611,  876, 1009,  777, 1780,  262, 1590,  703, 1452],
       [2031, 1611, 1203,  835,  542, 1209, 1489, 2139, 1626, 1254],
       [2031, 1203,  835, 1611, 1108,  436, 1803,  985,  139, 2154],
       [1009,  865, 1611,  385,  506, 2177,  995, 1489,  994, 1294],
       [1611, 1009,  995, 1489, 1738,  945,  524, 1750, 2196, 1500]],
      dtype=int64)

In [179]:
for i in range(12):
    s = pd.DataFrame(
    zip(names[ranks[i]], text_matrix[i][ranks[i]]), 
    columns=['word', 'count']
    )
    print("Chapter N:", i+1)
    print(s)
    print('\n')

Chapter N: 1
     word     count
0  little  0.170733
1     bat  0.168434
2    door  0.152175
3  rabbit  0.152175
4     key  0.148787
5     way  0.147969
6     eat  0.141280
7    like  0.125204
8   think  0.125204
9  either  0.121097


Chapter N: 2
     word     count
0   mouse  0.367720
1    pool  0.188504
2  little  0.183861
3      oh  0.164087
4    swam  0.155298
5     cat  0.153422
6    dear  0.150182
7    said  0.129784
8    foot  0.126221
9   mabel  0.124238


Chapter N: 3
      word     count
0     said  0.364689
1    mouse  0.364687
2     dodo  0.317451
3    prize  0.184820
4     race  0.184820
5     lory  0.158726
6      dry  0.140211
7  thimble  0.123213
8     know  0.117987
9     bird  0.114117


Chapter N: 4
      word     count
0     bill  0.245107
1   rabbit  0.212795
2   little  0.205918
3   window  0.205689
4    puppy  0.179978
5    glove  0.154567
6      one  0.134295
7      fan  0.132486
8   bottle  0.132486
9  chimney  0.132486


Chapter N: 5
          word     count


- chapt 1: about some bat and rabbit who eating and some key
- chapt 2: about little mouse swimming in the pool
- chapt 3: about mouse names dodo winning some prize
- chapt 4: about little rabbit and window
- chapt 5: caterpillar, serpent and pigeon do something with egg
- chapt 6: a lot of talking between cat and footman
- chapt 7: hatter do something with hare and march
- chapt 8: queen talking with hedgehog and king
- chapt 9: turtle has a moral dilemma about mocking on duchess and gryphon  
- chapt 10: same as in chapt 9 but tutrle mocking on lobster. also, some dancing
- chapt 11: king and hatter have problems in court. dormhouse is a witness
- chapt 12: king is talking a lot about queen, sister, white rabbit and his dreams

In [180]:
# splitting nouns, verbs etc
def get_tag(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag

In [181]:
text_alice = {}
for k, v in text_by_chapters.items():
    text_alice[k] = clean_text(v, alice=False).split()


nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Роман\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [182]:
# counting verbs. if tag is V -> it is a verb
wc = defaultdict(lambda : 0)
for k, v in text_alice.items():
    for token in v:
        tag = get_tag(token)
        if tag == 'V':
            wc[token] += 1

In [183]:
print("TOP 10 verbs:")
for key, value in sorted(wc.items(), key=lambda item: item[1], reverse=True)[:10]:
    print("%s: %s" % (key, value))

TOP 10 verbs:
said: 462
know: 89
went: 83
see: 67
began: 58
go: 57
say: 55
come: 48
get: 46
looked: 45


Usually, Alice do a lot of talking. Also, a lot of going and coming somewhere.