### Rhymetagger test

Basic readings

In [12]:
from rhymetagger import RhymeTagger
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import os
import re

In [2]:
# initialise tokenizer that would clean punct
tokenizer = RegexpTokenizer(r'\w+')

In [3]:
os.chdir("data")

In [4]:
for file in os.listdir():
    if file.endswith('.txt'):
        print(file)

P_44.txt
P_41.txt
P_40.txt
P_42.txt
P_43.txt


In [7]:
rt = RhymeTagger()
rt.load_model(model = 'ru')

Model loaded with following settings:
  frequency_min: 3
           lang: ru
       max_iter: 20
          ngram: 3
   ngram_length: 3
   prob_ipa_min: 0.9
 prob_ngram_min: 0.9
     same_words: False
   stanza_limit: True
         stress: True
       syll_max: 2
    t_score_min: 3.078
   vowel_length: True
         window: 5


In [8]:
poem = []
poem_id = []
verse = []
rhyme_words = []
rhyme_list = []

for file in os.listdir():
    if file.endswith('.txt'):
        with open(file, 'r') as text:
            for line in text:
                if line != '\n':
                    poem_id.append(file)
                    line = line.strip()
                    poem.append(line)
                    verse = tokenizer.tokenize(line)
                    rhyme_words.append(verse[len(verse)-1])
            rhymes = rt.tag(poem, output_format = 3)
            rhyme_list.append(rhymes)
            rhymes, poem, verse = [],[],[]
            print('just finished:', file)
            

just finished: P_44.txt
just finished: P_41.txt
just finished: P_40.txt
just finished: P_42.txt
just finished: P_43.txt


In [None]:
print(rhyme_list)

In [10]:
# flat the nested list from rhyme tagger
flat_rhymes = [item for sublist in rhyme_list for item in sublist]

##### add POS tags

In [67]:
from pymystem3 import Mystem
m = Mystem()

pos_list = []

for word in rhyme_words:
    analysis = m.analyze(word)
    pos_list.append(analysis[0]['analysis'][0]['gr'])

In [68]:
df = pd.DataFrame({'poem_id':poem_id,
                  'rhyme_words':rhyme_words,
                  'rhymes':flat_rhymes,
                  'pos':pos_list})
df

Unnamed: 0,poem_id,rhyme_words,rhymes,pos
0,P_44.txt,одеждой,1.0,"S,жен,неод=твор,ед"
1,P_44.txt,поля,2.0,"S,сред,неод=(вин,мн|род,ед|им,мн)"
2,P_44.txt,надежда,1.0,"S,жен,неод=им,ед"
3,P_44.txt,земля,2.0,"S,жен,неод=им,ед"
4,P_44.txt,безмятежно,3.0,ADV=
...,...,...,...,...
431,P_43.txt,молодая,25.0,"A=им,ед,полн,жен"
432,P_43.txt,зови,26.0,"V,несов,пе=ед,пов,2-л"
433,P_43.txt,прекрасной,27.0,"A=(пр,ед,полн,жен|дат,ед,полн,жен|род,ед,полн,..."
434,P_43.txt,любви,26.0,"S,жен,неод=(пр,ед|вин,мн|дат,ед|род,ед|им,мн)"


In [69]:
df.to_csv('test_data.csv')

### POS tagging tests

In [16]:
import spacy
nlp = spacy.load('ru_core_news_lg')

In [21]:
test = ['мой', 'душа', 'гоняю']

doc = nlp("мой")

for token in doc:
    print(token.pos_)

DET


In [23]:
pos_list = []

for i in test:
    doc = nlp(i)
    for token in doc:
        pos_list.append(token.pos_)

In [30]:
pos_list

['DET', 'NOUN', 'VERB']

In [28]:
rhyme_words[1:10]

['поля',
 'надежда',
 'земля',
 'безмятежно',
 'мечта',
 'безрубежной',
 'облита',
 'простора',
 'чудес']

In [32]:
pos_list = [token.pos_ for word in rhyme_words[1:10] for token in nlp(word)]

In [33]:
pos_list

['NOUN', 'NOUN', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'VERB', 'NOUN', 'NOUN']

#### nltk pos

In [36]:
from nltk import pos_tag

In [43]:
pos_tag(["безмятежно"])

[('безмятежно', 'NN')]

#### mystem

In [45]:
from pymystem3 import Mystem
m = Mystem()

In [47]:
m.analyze("слово безмятежно")

[{'analysis': [{'lex': 'слово', 'gr': 'S,сред,неод=(вин,ед|им,ед)'}],
  'text': 'слово'},
 {'text': ' '},
 {'analysis': [{'lex': 'безмятежно', 'gr': 'ADV='}], 'text': 'безмятежно'},
 {'text': '\n'}]

In [52]:
analysis = m.analyze("безмятежно")
analysis[0]

{'analysis': [{'lex': 'безмятежно', 'gr': 'ADV='}], 'text': 'безмятежно'}

In [57]:
d = analysis[0]
d['analysis'][0]['gr']

'ADV='

In [58]:
analysis[0]['analysis'][0]['gr']

'ADV='

In [59]:
rhyme_words[1:10]

['поля',
 'надежда',
 'земля',
 'безмятежно',
 'мечта',
 'безрубежной',
 'облита',
 'простора',
 'чудес']

In [65]:
pos_list = []

for word in rhyme_words[1:10]:
    analysis = m.analyze(word)
    pos_list.append(analysis[0]['analysis'][0]['gr'])

In [66]:
pos_list

['S,сред,неод=(вин,мн|род,ед|им,мн)',
 'S,жен,неод=им,ед',
 'S,жен,неод=им,ед',
 'ADV=',
 'S,жен,неод=им,ед',
 'A=(пр,ед,полн,жен|дат,ед,полн,жен|род,ед,полн,жен|твор,ед,полн,жен)',
 'V=прош,ед,прич,кр,жен,сов,страд',
 'S,муж,неод=род,ед',
 'S,сред,неод=род,мн']

In [62]:
pos_list = []