In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
import re, math
from operator import methodcaller
from collections import Counter
import pandas as pd
from nltk.corpus import wordnet as wn

# Lemmatization and stop-word removal need NLTK data; uncomment the downloads below if it is not installed yet

# nltk.download('words')
# nltk.download('stopwords')

def compute_tf(text):
    """Return the term frequency of every token in one document.

    Parameters
    ----------
    text : sequence of hashable
        The tokens of a single document.

    Returns
    -------
    collections.Counter
        Maps each distinct token to its number of occurrences divided by
        ``len(text)``. An empty document yields an empty counter.
    """
    frequencies = Counter(text)
    if frequencies:
        total_tokens = float(len(text))
        # list(...) takes a snapshot of the keys before the values are rewritten.
        for token in list(frequencies):
            frequencies[token] = frequencies[token] / total_tokens
    return frequencies

def compute_idf(word, corpus):
    """Return the inverse document frequency of ``word`` within ``corpus``.

    Parameters
    ----------
    word : object
        The term to look up.
    corpus : sequence of containers
        One container of tokens per document; membership is tested with ``in``.

    Returns
    -------
    float
        ``log10(number_of_documents / number_of_documents_containing_word)``.

    Raises
    ------
    ZeroDivisionError
        If no document contains ``word``.
    """
    total_documents = len(corpus)
    containing = 0.0
    for document in corpus:
        if word in document:
            containing += 1.0
    return math.log10(total_documents / containing)

def compute_tfidf(corpus):
    """Compute tf-idf scores for every token of every document in ``corpus``.

    Parameters
    ----------
    corpus : sequence of sequences of hashable
        One token sequence per document.

    Returns
    -------
    list of dict
        One dictionary per document, in corpus order, mapping each distinct
        token of that document to ``tf * idf``.
    """
    # Per-document sets make the membership test inside compute_idf O(1)
    # instead of a linear scan of each token list; len() and the membership
    # results are unchanged, so the idf values are identical.
    document_vocabularies = [set(document) for document in corpus]
    # The idf of a word does not depend on the document being scored, so it is
    # computed once per distinct word instead of once per (word, document) pair.
    idf_by_word = {}

    documents_list = []

    for text in corpus:
        tf_idf_dictionary = {}
        computed_tf = compute_tf(text)
        for word in computed_tf:
            if word not in idf_by_word:
                idf_by_word[word] = compute_idf(word, document_vocabularies)
            tf_idf_dictionary[word] = computed_tf[word] * idf_by_word[word]
        documents_list.append(tf_idf_dictionary)

    return documents_list

# Set of entries from the NLTK `words` corpus (presumably needs nltk.download('words') — see the commented-out call above).
words = set(nltk.corpus.words.words())
# NOTE: this rebinds the name `stopwords` from the imported nltk.corpus reader to a plain
# set of English stop words; the import at the top of the cell must run again before this
# line can be re-executed. Later cells pass this set to Text.delete_stop_words.
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Владимир\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Владимир\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
class Text:
    r"""Mutable holder for the contents of a text file.

    ``self.text`` starts out as the ``repr()`` of the file contents, so escape
    sequences such as ``\n`` appear as literal characters. ``replace`` and
    ``split`` expect ``self.text`` to be a string; ``lower``, ``lemmatize`` and
    ``delete_stop_words`` expect it to be a list of token lists. Every
    transforming method updates ``self.text`` in place and returns ``None``.
    """

    def __init__(self, filepath):
        """Read ``filepath`` in text mode and store ``repr()`` of its contents."""
        with open(filepath, 'r') as handle:
            self.text = repr(handle.read())

    def __str__(self):
        """Return ``str()`` of the current content."""
        return str(self.text)

    def __repr__(self):
        """Return ``str()`` of the current content (same as ``__str__``)."""
        return str(self.text)

    def __len__(self):
        """Return ``len()`` of the current content."""
        return len(self.text)

    def lower(self):
        """Lower-case every word of every inner sequence."""
        lowered = []
        for corpus in self.text:
            lowered.append([word.lower() for word in corpus])
        self.text = lowered

    def replace(self, old, new):
        """Replace every occurrence of ``old`` with ``new`` in the string content."""
        updated = self.text.replace(old, new)
        self.text = updated

    def split(self, splitter):
        """Split the string content on ``splitter`` and keep the resulting list."""
        pieces = self.text.split(splitter)
        self.text = pieces

    def lemmatize(self, lemmatizer):
        """Apply the callable ``lemmatizer`` to every word of every inner sequence."""
        self.text = [list(map(lemmatizer, corpus)) for corpus in self.text]

    def delete_stop_words(self, stopwords):
        """Drop every word whose lower-cased form is contained in ``stopwords``."""
        kept = []
        for corpus in self.text:
            kept.append([w for w in corpus if not (w.lower() in stopwords)])
        self.text = kept

# Find most popular words in chapters

In [3]:
# Load the book. Text keeps repr() of the file, so escape sequences are literal characters.
text = Text('alice.txt')
text.replace(r'\n', ' ')  # literal "\n" sequences become spaces
text.replace(r'\u', '')   # strip the literal "\u" prefix of escaped characters
text.text = re.sub("[^a-zA-Z ]+", "", text.text)  # keep only ASCII letters and spaces

# Split on the word "CHAPTER" and skip the first 13 pieces
# (presumably front matter and table-of-contents entries — confirm against alice.txt).
# NOTE(review): the recorded output lists 'project' as the top term of the last chapter,
# which suggests trailing non-story text is still included — confirm.
text.text = text.text.split("CHAPTER")[13 : ]

# Tokenize every chapter on single spaces and drop the empty tokens left by repeated spaces.
text.text = [[token for token in chapter.split(" ") if token] for chapter in text.text]

text.lower()                                   # lower-case every token
text.delete_stop_words(stopwords)              # remove English stop words
text.lemmatize(WordNetLemmatizer().lemmatize)  # then lemmatize what is left

# One {word: tf-idf} dictionary per chapter.
tf_idf = compute_tfidf(text.text)

# Rows are chapters, columns are words; a word absent from a chapter gets 0.
frame = pd.DataFrame.from_records(tf_idf).fillna(0.)

# 'alice' has a zero tf-idf score because it occurs in every chapter (idf = log10(1) = 0)

In [4]:
# For every chapter (one row of `frame`), print its ten highest-scoring words,
# lowest of the ten first.
for index, row in frame.iterrows():
    print(f"CHAPTER {index + 1}")
    top_terms = row.sort_values().iloc[-10:]  # ascending sort, so the last ten are the largest
    for word, value in top_terms.items():
        print(f"\t'{word}' with {value:.4f} tf-idf")

CHAPTER 1
	'drink' with 0.0024 tf-idf
	'fell' with 0.0025 tf-idf
	'eat' with 0.0027 tf-idf
	'bottle' with 0.0032 tf-idf
	'poison' with 0.0033 tf-idf
	'rabbithole' with 0.0033 tf-idf
	'candle' with 0.0033 tf-idf
	'dark' with 0.0033 tf-idf
	'key' with 0.0037 tf-idf
	'bat' with 0.0048 tf-idf
CHAPTER 2
	'capital' with 0.0023 tf-idf
	'four' with 0.0024 tf-idf
	'cat' with 0.0030 tf-idf
	'dog' with 0.0031 tf-idf
	'fan' with 0.0031 tf-idf
	'glove' with 0.0031 tf-idf
	'mabel' with 0.0043 tf-idf
	'pool' with 0.0048 tf-idf
	'swam' with 0.0054 tf-idf
	'mouse' with 0.0061 tf-idf
CHAPTER 3
	'northumbria' with 0.0026 tf-idf
	'bird' with 0.0028 tf-idf
	'tale' with 0.0028 tf-idf
	'caucusrace' with 0.0039 tf-idf
	'dry' with 0.0044 tf-idf
	'thimble' with 0.0053 tf-idf
	'lory' with 0.0057 tf-idf
	'prize' with 0.0079 tf-idf
	'mouse' with 0.0097 tf-idf
	'dodo' with 0.0114 tf-idf
CHAPTER 4
	'honour' with 0.0035 tf-idf
	'yer' with 0.0035 tf-idf
	'ann' with 0.0035 tf-idf
	'mary' with 0.0035 tf-idf
	'glove' wit

In [5]:
# Suggest a title for every chapter: the word with the largest tf-idf score in its row.
for index, row in frame.iterrows():
    ranked = row.sort_values()  # ascending, so the best word is last; sort only once
    # .iloc is explicit positional access. `series[-1]` on a string-labelled index relies
    # on a positional fallback that pandas has deprecated.
    print(f"CHAPTER {index + 1} may have '{ranked.index[-1]}' title with {ranked.iloc[-1]:.4f} max tf-idf value")

CHAPTER 1 may have 'bat' title with 0.0048 max tf-idf value
CHAPTER 2 may have 'mouse' title with 0.0061 max tf-idf value
CHAPTER 3 may have 'dodo' title with 0.0114 max tf-idf value
CHAPTER 4 may have 'window' title with 0.0071 max tf-idf value
CHAPTER 5 may have 'caterpillar' title with 0.0151 max tf-idf value
CHAPTER 6 may have 'footman' title with 0.0106 max tf-idf value
CHAPTER 7 may have 'dormouse' title with 0.0188 max tf-idf value
CHAPTER 8 may have 'queen' title with 0.0095 max tf-idf value
CHAPTER 9 may have 'turtle' title with 0.0146 max tf-idf value
CHAPTER 10 may have 'turtle' title with 0.0184 max tf-idf value
CHAPTER 11 may have 'hatter' title with 0.0140 max tf-idf value
CHAPTER 12 may have 'project' title with 0.0350 max tf-idf value


# Find sentences with Alice

In [6]:
text = Text('alice.txt') # read data
text.replace(r'\n', ' ') # literal "\n" sequences become spaces
text.replace(r'\u', '') # strip the literal "\u" prefix of escaped characters
text.text = re.sub("[^a-zA-Z.' ]+", "", text.text) # keep letters, spaces, full stops and apostrophes

text.text = text.text.split('.') # split into sentences on full stops

text.text = list(map(methodcaller("split", " "), text.text)) # tokenize every sentence on single spaces
text.text = [list(filter(None, corpus)) for corpus in text.text] # drop the empty tokens left by repeated spaces
text.lower() # make each word lower case

text.text = [sentence for sentence in text.text if 'alice' in sentence] # keep only sentences that contain the token 'alice'

text.delete_stop_words(stopwords) # remove stop words
text.lemmatize(WordNetLemmatizer().lemmatize) # then lemmatize
text.delete_stop_words(stopwords) # remove stop words again, this time on the lemmatized tokens

# Keep only the words whose first WordNet synset is a verb.
corpus = []
for sentence in text.text:
    verbs = []
    for w in sentence:
        synsets = wn.synsets(w)
        # Words unknown to WordNet have no synsets and are skipped. Checking for that
        # explicitly (instead of a bare `except`) lets genuine errors surface.
        if synsets and synsets[0].pos() == 'v':
            verbs.append(w)
    corpus.append(verbs)

text.text = corpus

tf_idf = compute_tfidf(text.text) # tf-idf of every remaining verb, one dictionary per sentence

frame = pd.DataFrame.from_records(tf_idf).fillna(0.) # rows are sentences, columns are verbs; a verb absent from a sentence gets 0

# 'alice' occurs in every selected sentence, so its tf-idf score would be zero (idf = log10(1) = 0)

In [7]:
# Ten verbs with the largest tf-idf summed over all selected sentences, lowest of the ten first.
# Series.items() replaces iteritems(), which was removed in pandas 2.0. The loop variable is
# named `score` so the `tf_idf` list built in the previous cell is not overwritten.
for word, score in frame.sum().sort_values().iloc[-10:].items():
    print(f"'{word}' verb have {score:.2f} tf-idf")

'asked' verb have 5.56 tf-idf
'made' verb have 5.61 tf-idf
'remarked' verb have 5.85 tf-idf
'heard' verb have 7.13 tf-idf
'got' verb have 7.83 tf-idf
'began' verb have 8.31 tf-idf
'replied' verb have 9.76 tf-idf
'went' verb have 10.29 tf-idf
'looked' verb have 12.05 tf-idf
'said' verb have 27.81 tf-idf
