In [3]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.collocations import *
from nltk.stem import *
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('nps_chat')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
"""
return text as array of lines
"""
def read_book(title_path): 
	with open(title_path, "r", encoding ="utf8") as current_file: 
		text = current_file.readlines()
	return text

def divide_into_chapters(text):
  """
  return array of chapter presenting as array of lines
  """
  string_number = 0
  content_list = []
  chapters = []

  for string in text:
    string_number+=1
    if re.match('CHAPTER', string):
      content_list.append(string_number)
      continue
    if re.match('THE END', string):
      content_list.append(string_number)
  
  for i in range(0, len(content_list)-1):
    start = content_list[i] 
    finish = content_list[i+1] - 1 #w/o one line at the end of chapter, because not to cut the beginning of next chapter
    chapters.append(text[start:finish])  
  return chapters

def get_sentences(chapter):
  sentences = []
  sentence = ""
  for i in range(len(chapter)):
    if chapter[i] == "\n":
      sentences.append(sentence)
      sentence = ""
    else:
      sentence += chapter[i]
      if i == len(chapter)-1:
        sentences.append(sentence)
  return sentences 

def clean_text(sentence, alice_only = False):
  sentence = re.sub(r'\n', ' ', sentence)
  sentence = sentence.lower()
  sentence = re.sub("<.*?\>", " ", sentence)
  sentence = re.sub(r'_', " ", sentence)
  sentence = re.sub(r"[^\w\s]", "", sentence, re.UNICODE)
  sentence = re.sub(r'\s+', " ", sentence)
  if alice_only and "alice" in sentence:
    sentence = re.sub("alice", "", sentence)
    return sentence #return only cleaned sentences with Alice
  else:
    sentence = re.sub("alice", "", sentence) #otherwise deletion "Alice" from sentences is as step of cleaning procedure
  if alice_only == False:
    return sentence

def get_alice_action(chapter):
  tokens = [nltk.word_tokenize(sentence) for sentence in chapter]
  
  #define part-of-spitch 
  pos_list = [nltk.pos_tag(word) for word in tokens]
  
  #select only verbs from sentence 
  alice_action = [[word for (word, tags) in i if tags in ('VB', 'VBZ')] for i in pos_list]
  
  #transform alice_action array into string and return it 
  alice_action = [" ".join(i) for i in alice_action]
  alice_action = " ".join(alice_action)
  
  return alice_action

##Top 10 important words in each chapter

In [None]:
text = read_book('/content/drive/My Drive/Datasets/11-0.txt')
chapters = divide_into_chapters(text)

#print("Before normalization: ", chapters[8])
for i in range(len(chapters)):
  chapters[i] = "".join(chapters[i])
  chapters[i] = clean_text(chapters[i])
#print("After normalization: ", chapters[8],"\n\n")  

#run vectorixer which computes the word counts, idf and tf-idf values all at once and also excepts stop-words
vectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
X = vectorizer.fit_transform(chapters)

#create table [word][tf-idf value] for each chapters in book
#sort descent by [tf-idf value] and show the first 10 values
print("Top 10 important words each chapter")
for i in range(len(chapters)):
  df = pd.DataFrame(X[i].T.todense(), index=vectorizer.get_feature_names(), columns=["Chapter {0}".format(i+1)])
  print((df.sort_values(by=["Chapter {0}".format(i+1)],ascending=False))[:10])

Top 10 important words each chapter
        Chapter 1
little   0.183798
key      0.160173
way      0.159291
eat      0.152091
rabbit   0.145618
bats     0.140755
like     0.134785
think    0.134785
door     0.127415
bottle   0.120882
        Chapter 2
mouse    0.364099
pool     0.195979
little   0.191153
cats     0.171482
oh       0.170595
swam     0.161457
ll       0.145732
dear     0.141944
said     0.134931
mabel    0.129165
         Chapter 3
said      0.378755
mouse     0.360718
dodo      0.329696
lory      0.164848
race      0.159957
prizes    0.159957
dry       0.145620
thimble   0.127966
know      0.122539
dinah     0.108921
         Chapter 4
little    0.220710
window    0.220464
rabbit    0.213826
puppy     0.192906
gloves    0.165670
chimney   0.165348
bottle    0.142003
fan       0.142003
said      0.134345
room      0.119110
             Chapter 5
caterpillar   0.491890
said          0.451554
pigeon        0.299256
serpent       0.224442
youth         0.149628
eggs        

##Top 10 what does Alice like to do 

To calculate TF-IDF metric it supposes that the document (s) is one chapter of the book (D), the term (i) is word in this chapter. 


In [10]:
a = [["test test2"], ["test3 test4"]]
b = " ".join(a)
print(b)

TypeError: ignored

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped brown over the lazy dog."]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print("vector shape", vector.shape)
print(type(vector))
print(vector.toarray())

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
vector shape (1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[2 1 1 1 1 1 1 2]]


In [28]:
text = read_book('/content/drive/My Drive/Datasets/11-0.txt')
chapters = divide_into_chapters(text)
chapters = [get_sentences(chapter) for chapter in chapters]

for i in range(len(chapters)):
  #select only sentences with Alice
  chapters[i] = [clean_text(sentence, True) for sentence in chapters[i]]
  #sentences w/o Alice give None, so delete its
  chapters[i] = list(filter(None, chapters[i]))

alice_action_in_book = [get_alice_action(chapter) for chapter in chapters]
temp = []
temp = " ".join(alice_action_in_book)
alice_action_in_book.clear()
alice_action_in_book.append(temp)

#run vectorixer which computes the word counts, idf and tf-idf values all at once and also excepts stop-words
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(alice_action_in_book)

#create table [word][tf-idf value] for each chapters in book
#sort descent by [tf-idf value] and show the first 10 values
for i in range(len(alice_action_in_book)):
  df = pd.DataFrame(X[i].T.todense(), index=vectorizer.get_feature_names(), columns=["Alice action in book"])
  print((df.sort_values(by=["Alice action in book"],ascending=False))[:10])


          Alice action in book
say                         20
think                       18
make                        16
tell                        13
come                        12
know                        12
eat                         10
look                         9
remember                     9
hear                         9


##Draft section

In [None]:
#лемматизация
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
w = []
for i in range(len(tokens)):
  w.append([lemmatizer.lemmatize(q) for q in tokens[i]])

print("w", w)

NameError: ignored

In [None]:
#делит книгу на главы, главы на предложения, чистит их и возвращает главу как массив очищенных предложений
def get_sentences(chapter):
  sentences = []
  sentence = ""
  for i in range(len(chapter)):
    if chapter[i] == "\n":
      sentences.append(sentence)
      sentence = ""
    else:
      sentence += chapter[i]
      if i == len(chapter)-1:
        sentences.append(sentence)
  return sentences 

def clean_text(sentence):
  sentence = re.sub(r'\n', ' ', sentence)
  sentence = sentence.lower()
  sentence = re.sub("<.*?\>", " ", sentence)
  sentence = re.sub("alice", "", sentence)
  sentence = re.sub(r'_', " ", sentence)
  sentence = re.sub(r"[^\w\s]", "", sentence, re.UNICODE)
  sentence = re.sub(r'\s+', " ", sentence)
  
  return sentence
   
text = read_book('/content/drive/My Drive/Datasets/11-0.txt')
chapters = divide_into_chapters(text)
print("Before: ", chapters[8])
for i in range(len(chapters)):
  chapters[i] = [clean_text(sentence) for sentence in get_sentences(chapters[i])]
  chapters[i] = [w for w in chapters[i] if w != ""] 
print("After: ", chapters[8])  

полезная ссылка про векторизацию текста
https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.YC5hAWgzbBU

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
text = "I do not like green eggs and ham, I do not like them Sam I am!"
tokens = nltk.wordpunct_tokenize(text)
finder = BigramCollocationFinder.from_words(tokens)
scored = finder.score_ngrams(bigram_measures.raw_freq)
sorted(bigram for bigram, score in scored)