In [15]:
pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [21]:
import nltk
import json
from re import sub
from pymystem3 import Mystem
from pymorphy2 import MorphAnalyzer
from collections import Counter

# Download the NLTK data packages this notebook requests, in the same order.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

def txt2str(file_name, max_chars=2000):
    """Read a UTF-8 text file, strip punctuation and return its beginning.

    Args:
        file_name: Path of the text file to read.
        max_chars: Maximum number of characters to return, counted after the
            punctuation has been removed. Defaults to 2000, the cut-off that
            used to be hard-coded; pass ``None`` to get the whole text.

    Returns:
        The cleaned text, truncated to ``max_chars`` characters.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        # Delete the listed punctuation marks only; whitespace, newlines and
        # every other character are kept as they are.
        txt = sub(r'[\.,?\-!:;"*()]', '', f.read())
    return txt[:max_chars]

def str2txt(file_name, txt):
    """Write ``txt`` to ``file_name`` as UTF-8, overwriting an existing file."""
    with open(file_name, mode='w', encoding='utf-8') as out_file:
        out_file.write(txt)

def str2jsonl(file_name, analyzed_text):
    """Save the first parse of every word as JSON Lines.

    The output file holds one compact JSON object per line, e.g.
    ``{"lemma": "конь", "word": "коня", "pos": "NOUN"}``. Previously a single
    indented JSON array was written, which is not valid JSON Lines even
    though the file carries the ``.jsonl`` extension.

    Args:
        file_name: Output path without extension; ``.jsonl`` is appended.
        analyzed_text: Iterable of per-word parse lists. Only the first
            parse of each word is used, and it must expose ``normal_form``,
            ``word`` and ``tag.POS``.
    """
    # Build every line before opening the file, so a malformed entry raises
    # without leaving a truncated file behind.
    lines = []
    for word in analyzed_text:
        best_parse = word[0]
        record = {'lemma': best_parse.normal_form,
                  'word': best_parse.word,
                  'pos': best_parse.tag.POS}
        # ensure_ascii=False keeps Cyrillic readable instead of \uXXXX escapes.
        lines.append(json.dumps(record, ensure_ascii=False))
    with open(f'{file_name}.jsonl', 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')


def lemmit_2(text):
    """Lemmatize ``text`` with a newly created ``Mystem`` instance.

    Returns the result of ``Mystem.lemmatize(text)`` unchanged.
    """
    stemmer = Mystem()
    return stemmer.lemmatize(text)

def lemmit(text):  # in case Mystem does not work
    """Lemmatize whitespace-separated words with pymorphy2.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        One string holding the ``normal_form`` of the first parse of each
        word, joined by single spaces.
    """
    # Build the analyzer once and reuse it for every word; the previous
    # version constructed a new MorphAnalyzer per word.
    morph = MorphAnalyzer()
    lemmatized_text = ' '.join(morph.parse(token)[0].normal_form for token in text.split())
    return lemmatized_text

def tokenit(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the result."""
    return nltk.word_tokenize(text)

def analyzit(text):
    """Run pymorphy2 morphological analysis on every token.

    Args:
        text: Iterable of word tokens.

    Returns:
        A list with one entry per token, each entry being whatever
        ``MorphAnalyzer.parse`` returns for that token.
    """
    # Build the analyzer once and reuse it for every token; the previous
    # version constructed a new MorphAnalyzer per token.
    morph = MorphAnalyzer()
    analyzed_text = [morph.parse(word) for word in text]

    return analyzed_text

def pos_ratio(analyzed_text):
    """Compute the share of each part-of-speech tag among the analyzed words.

    Args:
        analyzed_text: Sequence of per-word parse lists; the ``tag.POS`` of
            the first parse of each word is counted.

    Returns:
        A ``Counter`` mapping each POS tag to its fraction of all words,
        rounded to two decimal places.
    """
    shares = Counter(parses[0].tag.POS for parses in analyzed_text)
    word_total = len(analyzed_text)
    # Snapshot the keys, then overwrite each raw count with its fraction.
    for tag in list(shares):
        shares[tag] = round(shares[tag] / word_total, 2)
    return shares

def print_pos_ratio(pos_ratio_dict):
    """Print every part-of-speech share as a whole-number percentage.

    Args:
        pos_ratio_dict: Mapping from POS tag to its fraction of all words.

    Each entry is printed on its own line as ``TAG: N%``.
    """
    for key, share in pos_ratio_dict.items():
        # round() instead of int(): 0.29 * 100 is 28.999999999999996 in
        # floating point, so truncating would print 28% instead of 29%.
        print(f'{key}: {round(share * 100)}%')

def top20verbs(analyzed_text):
    """Return the 20 most frequent verb lemmas with their counts.

    A word counts as a verb when the ``tag.POS`` of its first parse equals
    ``'VERB'``; its ``normal_form`` is what gets counted.
    """
    verb_counts = Counter()
    for parses in analyzed_text:
        best_parse = parses[0]
        if best_parse.tag.POS == 'VERB':
            verb_counts[best_parse.normal_form] += 1
    return verb_counts.most_common(20)

def top20adverbs(analyzed_text):
    """Return the 20 most frequent adverb lemmas with their counts.

    A word counts as an adverb when the ``tag.POS`` of its first parse
    equals ``'ADVB'``; its ``normal_form`` is what gets counted.
    """
    first_parses = (parses[0] for parses in analyzed_text)
    adverb_counts = Counter(
        parse.normal_form for parse in first_parses if parse.tag.POS == 'ADVB'
    )
    return adverb_counts.most_common(20)

def top25bigrams(lemmatized_text):
    """Return the 25 most frequent bigrams as ``('w1-w2', count)`` tuples.

    Bigrams come from ``nltk.bigrams`` applied to ``lemmatized_text``; the
    two members of each bigram are joined with a hyphen.
    """
    pair_counts = Counter(nltk.bigrams(lemmatized_text))
    ranked = []
    for pair, freq in pair_counts.most_common(25):
        ranked.append(('-'.join(pair), freq))
    return ranked

def top25trigrams(lemmatized_text):
    """Return the 25 most frequent trigrams as ``('w1-w2-w3', count)`` tuples.

    Trigrams come from ``nltk.trigrams`` applied to ``lemmatized_text``; the
    three members of each trigram are joined with hyphens.
    """
    triple_counts = Counter(nltk.trigrams(lemmatized_text))
    most_frequent = triple_counts.most_common(25)
    return [('-'.join(triple), freq) for triple, freq in most_frequent]

def print_top(top_lst):
    """Print one line per entry of a frequency ranking.

    Each entry of ``top_lst`` is indexed as ``(item, count)``.
    """
    for entry in top_lst:
        item, count = entry[0], entry[1]
        print(f'Total number of "{item}" in text: {count}')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


2. (2 points) Lemmatize the text using mystem (if you absolutely do not want to use mystem, you can use any other suitable tool we have discussed) and save the result into a .txt file

In [22]:
# Task 2: read the cleaned text, lemmatize it and save the lemmas to disk.
murakami = txt2str('text.txt')
# lemmit() already returns a single space-separated string, so wrapping it
# in ''.join(...) was a no-op and has been dropped.
lemmatized_murakami = lemmit(murakami)
str2txt('lemmatized_text.txt', lemmatized_murakami)

3. (3 points)
    
    - tokenize the text using nltk
    - analyze the words using pymorphy
    - save the results of the analysis in JSON Lines format (.jsonl): each line contains the morphological analysis of one word in the form of a dictionary, e.g. ```{"lemma": "конь", "word": "коня", "pos": "NOUN"}```

In [23]:
# Task 3: cleaned text -> NLTK tokens -> pymorphy2 parses -> murakami.jsonl
# (str2jsonl appends the '.jsonl' extension to the name given here).
murakami = txt2str('text.txt')
tokenized_murakami = tokenit(murakami)
analyzed_murakami = analyzit(tokenized_murakami)
str2jsonl('murakami', analyzed_murakami)

4. (2 points) Answer the following questions:
    
    - What percentage of the words does each part of speech (POS) account for? (E.g., for verbs: the number of verbs divided by the total number of words.)
    - Print out top-20 verbs and adverbs
    - you can keep the stop words or you can get rid of them

In [24]:
# Task 4: part-of-speech shares, then the most frequent verbs and adverbs.
murakami = txt2str('text.txt')
tokenized_murakami = tokenit(murakami)
analyzed_murakami = analyzit(tokenized_murakami)

pos = pos_ratio(analyzed_murakami)
print_pos_ratio(pos)
print()

# Each ranking is printed as a heading, its entries, then a blank line.
for heading, ranker in (('Top verbs:', top20verbs), ('Top adverbs:', top20adverbs)):
    print(heading)
    print_top(ranker(analyzed_murakami))
    print()

NOUN: 25%
NPRO: 7%
VERB: 13%
CONJ: 13%
PREP: 12%
ADJF: 8%
ADJS: 2%
PRCL: 7%
PRTF: 0%
ADVB: 10%
GRND: 0%
NUMR: 1%
INTJ: 0%
PRED: 1%
INFN: 2%
COMP: 0%

Top verbs:
Total number of "говорить" in text: 4
Total number of "быть" in text: 3
Total number of "кивать" in text: 2
Total number of "взять" in text: 2
Total number of "заметить" in text: 1
Total number of "стать" in text: 1
Total number of "наклонить" in text: 1
Total number of "понюхать" in text: 1
Total number of "оказаться" in text: 1
Total number of "решить" in text: 1
Total number of "интересоваться" in text: 1
Total number of "очнуться" in text: 1
Total number of "отяжелеть" in text: 1
Total number of "слушаться" in text: 1
Total number of "пересчитывать" in text: 1
Total number of "отвечать" in text: 1
Total number of "обойтись" in text: 1
Total number of "нибыть" in text: 1
Total number of "идти" in text: 1
Total number of "принести" in text: 1

Top adverbs:
Total number of "ещё" in text: 3
Total number of "темно" in text: 2
To

5. (1 point) Find the top-25 bigrams and trigrams for your text (use nltk.bigrams); use only lemmas and remove the punctuation. Comment briefly on the results.

In [25]:
# Task 5: most frequent bigrams and trigrams over the lemmatized text.
murakami = txt2str('text.txt')
# lemmit() already returns a single string, so the former ''.join(...)
# wrapper was a no-op and has been dropped.
lemmatized_murakami = lemmit(murakami)
# Split into lemmas once and reuse the list for both rankings.
lemmas = lemmatized_murakami.split()
print('Top bigrams:')
print_top(top25bigrams(lemmas))
print()
print('Top trigrams:')
print_top(top25trigrams(lemmas))

Top bigrams:
Total number of "белый-майка" in text: 2
Total number of "а-на" in text: 2
Total number of "харуки-мураками" in text: 2
Total number of "мураками-кафка" in text: 2
Total number of "кафка-на" in text: 2
Total number of "на-пляж" in text: 2
Total number of "парень-по" in text: 2
Total number of "по-прозвище" in text: 2
Total number of "прозвище-ворона" in text: 2
Total number of "у-ты" in text: 2
Total number of "у-он" in text: 2
Total number of "я-кивать" in text: 2
Total number of "говорить-ворона" in text: 2
Total number of "денежка-то" in text: 2
Total number of "тогда-и" in text: 2
Total number of "и-думать" in text: 2
Total number of "думать-быть" in text: 2
Total number of "аннотация-я" in text: 1
Total number of "я-заметить" in text: 1
Total number of "заметить-что" in text: 1
Total number of "что-на" in text: 1
Total number of "на-грудь" in text: 1
Total number of "грудь-белый" in text: 1
Total number of "майка-налиплый" in text: 1
Total number of "налиплый-что" in 