### Tag rhymes & POS

In [1]:
from rhymetagger import RhymeTagger
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import os
import re
from datetime import datetime

In [2]:
# Tokenizer that keeps only runs of word characters (\w+), so punctuation is dropped from every line
tokenizer = RegexpTokenizer(r'\w+')

In [3]:
# Load the 'ru' RhymeTagger model; the loader prints the model settings shown below
rt = RhymeTagger()
rt.load_model(model = 'ru')

Model loaded with following settings:
  frequency_min: 3
           lang: ru
       max_iter: 20
          ngram: 3
   ngram_length: 3
   prob_ipa_min: 0.9
 prob_ngram_min: 0.9
     same_words: False
   stanza_limit: True
         stress: True
       syll_max: 2
    t_score_min: 3.078
   vowel_length: True
         window: 5


### Extract rhymes

In [4]:
# Make the corpus folder the working directory: the cells below call os.listdir() and open()
# with bare file names and therefore depend on this.
# NOTE(review): the path is relative, so running this cell twice, or running the 'data/rnc'
# cell in the same session, will fail — run exactly one chdir cell per fresh kernel.
os.chdir('data/texts') # path for the corpus with poems as separate .txt-s

In [8]:
os.listdir()[1:20] # check the directory

['C_156__20.txt',
 'P_518.txt',
 'C_101__42.txt',
 'C_70__25.txt',
 'C_84__24.txt',
 'P_1086.txt',
 'C_111__56.txt',
 'P_1092.txt',
 'C_70__31.txt',
 'C_84__30.txt',
 'C_111__42.txt',
 'C_257__11.txt',
 'C_156__34.txt',
 'C_197__1.txt',
 'C_319__13.txt',
 'P_524.txt',
 'C_327__8.txt',
 'C_160__36.txt',
 'C_194__37.txt']

In [9]:
len(os.listdir()) # number of files in the folder

4798

In [None]:
# Tag rhymes poem by poem: every .txt file in the working directory is treated as one poem.
# Results are collected in three parallel structures:
#   poem_id     - file name, one entry per kept verse line
#   rhyme_words - last word token of each kept verse line
#   rhyme_list  - one RhymeTagger result per poem (flattened later)
# NOTE(review): assumes rt.tag(..., output_format = 3) returns one label per input line,
# since the flattened result is later combined with rhyme_words in a DataFrame — confirm.
time_start = datetime.now()
print('start: ', time_start)

poem = []
poem_id = []
verse = []
rhyme_words = []
rhyme_list = []
errors = []

# lines consisting only of digits / punctuation / whitespace (blank lines, "....", page numbers)
non_verse_line = re.compile(r"^\d*\W*$")
# tokens that start with a digit are treated as non-words (raw string avoids invalid-escape warnings)
regex = re.compile(r'\d+\W*')

for file in os.listdir():
    if not file.endswith('.txt'):
        continue

    # Buffer everything for this file locally and merge it into the global lists only
    # after tagging succeeded; otherwise a failing file would leave orphan rows in
    # poem_id / rhyme_words and leak its lines into the next poem.
    poem = []
    poem_words = []

    with open(file, 'r') as text:
        try:
            for line in text:
                if non_verse_line.match(line):
                    continue
                line = line.strip()
                verse = tokenizer.tokenize(line) # tokenization
                verse = [i for i in verse if not regex.match(i)] # remove non-word tokens
                if not verse:
                    # nothing but digit tokens left: no rhyme word, so skip the line entirely
                    continue
                poem.append(line)
                poem_words.append(verse[-1]) # line-final word
            rhymes = rt.tag(poem, output_format = 3) # rhyme analysis for the whole poem
        except Exception as exc:
            # keep going with the next file, but record which file failed and why
            print('error in:', file, '-', repr(exc))
            errors.append(file)
            continue

    poem_id.extend([file] * len(poem))
    rhyme_words.extend(poem_words)
    rhyme_list.append(rhymes)
    print('just finished:', file)

time_end = datetime.now()
print('end: ', time_end)
total_time = time_end - time_start
print('total time:', total_time)

In [12]:
# total time for full corpus-1835: 0:26:41

# check errors
len(errors)

0

### RNC corpus

In [4]:
# Make the RNC corpus folder the working directory for the os.listdir()/open() calls below.
# NOTE(review): the path is relative, so this only works from the notebook's start directory —
# it will fail if the 'data/texts' chdir cell was already run in the same session.
os.chdir('data/rnc') # path for rnc corpus

In [6]:
os.listdir()[1:20]

['7600_1832.txt',
 '6700_1830.txt',
 '12637_1830.txt',
 '7246_1846.txt',
 '16012_1831.txt',
 '6701_1830.txt',
 '12636_1830.txt',
 '14685_1839.txt',
 '7601_1832.txt',
 '3808_1810.txt',
 '17666_1806.txt',
 '20056_1791.txt',
 '17416_1784.txt',
 '6448_1792.txt',
 '1581_1846.txt',
 '9502_1848.txt',
 '10470_1846.txt',
 '5892_1836.txt',
 '18488_1783.txt']

In [8]:
len(os.listdir()) # number of files in the folder

10367

In [None]:
# Tag rhymes poem by poem for the RNC corpus: every .txt file in the working directory is one poem.
# Results are collected in three parallel structures:
#   poem_id     - file name, one entry per kept verse line
#   rhyme_words - last word token of each kept verse line
#   rhyme_list  - one RhymeTagger result per poem (flattened later)
# NOTE(review): assumes rt.tag(..., output_format = 3) returns one label per input line,
# since the flattened result is later combined with rhyme_words in a DataFrame — confirm.
time_start = datetime.now()
print('start: ', time_start)

poem = []
poem_id = []
verse = []
rhyme_words = []
rhyme_list = []
errors = []

# lines consisting only of digits / punctuation / whitespace (blank lines, "....", page numbers)
non_verse_line = re.compile(r"^\d*\W*$")
# tokens that start with a digit are treated as non-words (raw string avoids invalid-escape warnings)
regex = re.compile(r'\d+\W*')

for file in os.listdir():
    if not file.endswith('.txt'):
        continue

    # Buffer everything for this file locally and merge it into the global lists only
    # after tagging succeeded; otherwise a failing file would leave orphan rows in
    # poem_id / rhyme_words and leak its lines into the next poem.
    poem = []
    poem_words = []

    with open(file, 'r') as text:
        try:
            for line in text:
                if non_verse_line.match(line):
                    continue
                line = line.strip()
                verse = tokenizer.tokenize(line) # tokenization
                verse = [i for i in verse if not regex.match(i)] # remove non-word tokens
                if not verse:
                    # nothing but digit tokens left: no rhyme word, so skip the line entirely
                    continue
                poem.append(line)
                poem_words.append(verse[-1]) # line-final word
            rhymes = rt.tag(poem, output_format = 3) # rhyme analysis for the whole poem
        except Exception as exc:
            # keep going with the next file, but record which file failed and why
            print('error in:', file, '-', repr(exc))
            errors.append(file)
            continue

    poem_id.extend([file] * len(poem))
    rhyme_words.extend(poem_words)
    rhyme_list.append(rhymes)
    print('just finished:', file)

time_end = datetime.now()
print('end: ', time_end)
total_time = time_end - time_start
print('total time:', total_time)

In [15]:
# Recompute the time elapsed since time_start (set at the top of the tagging cell) and print it
total_time = datetime.now() - time_start
print('total time:', total_time)

total time: 1:04:25.333390


### Create rhyme lists & write rhyme sequences

In [13]:
# Collapse the per-poem results from the rhyme tagger into one flat list of labels
flat_rhymes = []
for poem_labels in rhyme_list:
    flat_rhymes.extend(poem_labels)

In [14]:
flat_rhymes[1:10]

[2, 1, 1, 2, 3, None, 4, 3, 4]

In [15]:
rhyme_words[1:10]

['краса',
 'мечтала',
 'вперяла',
 'небеса',
 'собою',
 'своим',
 'огневым',
 'порою',
 'земным']

##### add POS tags

In [16]:
from pymystem3 import Mystem
m = Mystem()

# POS / grammar annotation of every rhyme word with mystem.
# pos_list stays index-aligned with rhyme_words: words without an analysis get " ".
pos_list = []
errors_pos = []

time_start = datetime.now()
print('start: ', time_start)

for word in rhyme_words:
    try:
        analysis = m.analyze(word)
        # grammar string of the first analysis of the first returned token
        pos_list.append(analysis[0]['analysis'][0]['gr'])
    except (IndexError, KeyError):
        # IndexError: empty result or empty 'analysis' list; KeyError: token without an
        # 'analysis' entry. Any other exception is a real failure and should surface
        # instead of silently blanking the remaining words.
        errors_pos.append(word)
        pos_list.append(" ")

time_end = datetime.now()
print('end: ', time_end)
total_time = time_end - time_start
print('total time:', total_time)

start:  2024-02-02 19:42:54.046816
end:  2024-02-02 19:43:10.239727
total time: 0:00:16.193100


In [17]:
len(errors_pos)

117

In [18]:
errors_pos

['несомй',
 'речахь',
 '__',
 'мертвець',
 'ovo',
 'état',
 'перигю',
 'покаянъе',
 'молвя',
 'Харишы',
 'ва',
 'сновидеийй',
 '_',
 'ненаменносгь',
 'речъю',
 'щемтггь',
 'далыюй',
 'turbot',
 'Ру6о',
 'Doré',
 'любезный1',
 'ви',
 'изнеможеныи',
 'тмы',
 'цветй',
 'подвебесыо',
 'I',
 'Халкидскихь',
 'мпй',
 'ла',
 'устрояя',
 'минувшихь',
 '_',
 'Сый',
 'Dames',
 'paré',
 'дстойИы',
 'векй',
 'нх',
 'своевольныхь',
 'Sospiri',
 'd',
 'monde',
 'vis',
 'кпппт',
 'нм',
 'цепяхь',
 'менл',
 'Salute',
 'урокя',
 '__',
 'secretaire',
 'tête',
 'полцеиы',
 'покорныи',
 'черныи',
 'темнокрылыи',
 'смелыи',
 'donc',
 'снегь',
 'небесньй',
 'пурпурныи',
 'ceatera',
 'прекрасныхь',
 'свът',
 'шагь',
 '_',
 '_',
 'вссь',
 'юмы',
 'рск',
 'шумигь',
 'ои',
 'тообю',
 'ьхп',
 'Werck',
 'растерзаьь',
 'посвя',
 'complaisance',
 'nom',
 'parole',
 '__',
 'XXX',
 'свящепаый',
 'убийцз',
 'рукь',
 'гроаы',
 'мигь',
 'Сый',
 'необъятнойd',
 'delicieuse',
 'delicieuse',
 'птиецй',
 'etceteru',
 'очн',


In [19]:
# One row per verse line: source file, line-final word, rhyme label and mystem grammar string
df = pd.DataFrame(
    dict(
        poem_id=poem_id,
        rhyme_words=rhyme_words,
        rhymes=flat_rhymes,
        pos=pos_list,
    )
)
df

Unnamed: 0,poem_id,rhyme_words,rhymes,pos
0,P_1938.txt,стояла,1.0,"V,несов,нп=прош,ед,изъяв,жен"
1,P_1938.txt,краса,2.0,"S,жен,неод=им,ед"
2,P_1938.txt,мечтала,1.0,"V,несов,нп=прош,ед,изъяв,жен"
3,P_1938.txt,вперяла,1.0,"V,пе=прош,ед,изъяв,жен,несов"
4,P_1938.txt,небеса,2.0,"S,сред,неод=(вин,мн|им,мн)"
...,...,...,...,...
190701,C_194__12.txt,дня,12.0,"S,муж,неод=род,ед"
190702,C_194__12.txt,счастии,11.0,"S,сред,неод=пр,ед"
190703,C_194__12.txt,грозит,13.0,"V,несов,нп=непрош,ед,изъяв,3-л"
190704,C_194__12.txt,ненастия,,"S,жен,неод=им,ед"


In [53]:
#df.loc[df['rhyme_words'] == '8']

Unnamed: 0,poem_id,rhyme_words,rhymes
110,P_18.txt,8,


In [20]:
# Write the rhyme table to CSV. The commented-out calls are alternative output names
# (presumably used when the cells above were run on other corpora).
#df.to_csv('../per_rhymes_pos.csv')

# df.to_csv('../rnc_rhymes_only.csv')
# df.to_csv('../rnc_rhymes_pos.csv')

# NOTE(review): unlike the '../' paths above, this writes into the current working
# directory (changed by an os.chdir cell earlier) — confirm this is intended.
df.to_csv('corpus-35_rhymes.csv')

### Add POS & features annotation to all words

In [4]:
from pymystem3 import Mystem
m = Mystem()

In [5]:
# Annotate every word form listed in k_endwords.txt (one form per line) with its mystem
# lemma and grammar string. word_form, lemma and pos_list are built in lockstep so they
# can be used as DataFrame columns; forms without an analysis get " " in both.
# (Original author's note: the file holds the unique forms occurring in Corpus-1835.)
word_form = []
lemma = []
pos_list = []
errors_pos = []

time_start = datetime.now()
print('start: ', time_start)

with open('k_endwords.txt', 'r') as words:
    for word in words:
        word = word.strip()
        word_form.append(word)
        try:
            first_analysis = m.analyze(word)[0]['analysis'][0]
            # read both fields before appending either, so a failed lookup can never
            # leave lemma and pos_list with different lengths
            word_lemma, word_pos = first_analysis['lex'], first_analysis['gr']
        except (IndexError, KeyError):
            # IndexError: empty result or empty 'analysis' list; KeyError: missing field.
            # Other exceptions are real failures and are allowed to surface.
            errors_pos.append(word)
            word_lemma, word_pos = " ", " "
        lemma.append(word_lemma)
        pos_list.append(word_pos)

time_end = datetime.now()
print('end: ', time_end)
total_time = time_end - time_start
print('total time:', total_time)

start:  2024-07-31 17:08:23.214667
end:  2024-07-31 17:08:25.443504
total time: 0:00:02.229110


In [6]:
word_form[10:20]

['брызги',
 'нагрянул',
 'отцам',
 'великий',
 'и',
 'не',
 'брачном',
 'новым',
 'встрепенулся',
 'зла']

In [7]:
print(len(word_form), len(lemma), len(pos_list))

1010 1010 1010


In [8]:
errors_pos

['du', 'пею', 'ня']

In [9]:
# One row per word form with its mystem lemma and grammar string.
# NOTE(review): the column name 'word_from' looks like a typo for 'word_form'; it is kept
# as-is here because the exported CSV uses this header — confirm before renaming.
df = pd.DataFrame(
    dict(
        word_from=word_form,
        lemma=lemma,
        pos=pos_list,
    )
)
df

Unnamed: 0,word_from,lemma,pos
0,какое,какой,"APRO=(вин,ед,сред|им,ед,сред)"
1,певец,певец,"S,муж,од=им,ед"
2,себя,себя,SPRO=(вин|род)
3,ль,ль,PART=
4,в,в,"S,сокр=(пр,мн|пр,ед|вин,мн|вин,ед|дат,мн|дат,е..."
...,...,...,...
1005,счастливыми,счастливый,"A=твор,мн,полн"
1006,поп,поп,"S,муж,од=им,ед"
1007,жалобный,жалобный,"A=(вин,ед,полн,муж,неод|им,ед,полн,муж)"
1008,ня,,


In [10]:
df.to_csv('k_endwords.csv')

### POS tagging & other tests

In [None]:
# Scratch check of the tokenisation step: print the cleaned token list of every verse line
# of every .txt file in the working directory.
poem = []
poem_id = []
verse = []
verse_cln = []
rhyme_words = []
rhyme_list = []

# regex for dropping tokens that start with a digit (raw string avoids invalid-escape warnings)
regex = re.compile(r'\d+')

for file in os.listdir():
    if file.endswith('.txt'):
        with open(file, 'r') as text:
            for line in text:
                # Skip only lines made up entirely of digits / punctuation / whitespace.
                # The earlier pattern r"\W+" matched any line that merely *starts* with a
                # non-word character, which also threw away indented verses and lines
                # opening with a dash or quotation mark.
                if not re.match(r"^\d*\W*$", line):
                    poem_id.append(file)
                    line = line.strip()
                    poem.append(line)
                    verse = tokenizer.tokenize(line)
                    verse = [i for i in verse if not regex.match(i)]
                    print(verse)
                        

In [19]:
# Sanity check of the line filter: a line that contains letters must not be matched
line = '\n\nohhh'
print('true' if re.match(r"^\d*\W*$", line) else 'false')

false


In [16]:
import spacy
nlp = spacy.load('ru_core_news_lg')

In [21]:
test = ['мой', 'душа', 'гоняю']

doc = nlp("мой")

for token in doc:
    print(token.pos_)

DET


In [23]:
# Run each test word through spaCy on its own and collect the coarse POS tag of every token
pos_list = []
for word in test:
    pos_list.extend(token.pos_ for token in nlp(word))

In [30]:
pos_list

['DET', 'NOUN', 'VERB']

In [28]:
rhyme_words[1:10]

['поля',
 'надежда',
 'земля',
 'безмятежно',
 'мечта',
 'безрубежной',
 'облита',
 'простора',
 'чудес']

In [32]:
pos_list = [token.pos_ for word in rhyme_words[1:10] for token in nlp(word)]

In [33]:
pos_list

['NOUN', 'NOUN', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'VERB', 'NOUN', 'NOUN']

#### mystem
A few checks of how the tagger works

In [45]:
from pymystem3 import Mystem
m = Mystem()

In [47]:
m.analyze("слово безмятежно")

[{'analysis': [{'lex': 'слово', 'gr': 'S,сред,неод=(вин,ед|им,ед)'}],
  'text': 'слово'},
 {'text': ' '},
 {'analysis': [{'lex': 'безмятежно', 'gr': 'ADV='}], 'text': 'безмятежно'},
 {'text': '\n'}]

In [52]:
analysis = m.analyze("безмятежно")
analysis[0]

{'analysis': [{'lex': 'безмятежно', 'gr': 'ADV='}], 'text': 'безмятежно'}

In [57]:
d = analysis[0]
d['analysis'][0]['gr']

'ADV='

In [58]:
analysis[0]['analysis'][0]['gr']

'ADV='

In [59]:
rhyme_words[1:10]

['поля',
 'надежда',
 'земля',
 'безмятежно',
 'мечта',
 'безрубежной',
 'облита',
 'простора',
 'чудес']

In [65]:
# Grammar string of the first mystem analysis for a small sample of rhyme words
pos_list = [
    m.analyze(sample_word)[0]['analysis'][0]['gr']
    for sample_word in rhyme_words[1:10]
]

In [66]:
pos_list

['S,сред,неод=(вин,мн|род,ед|им,мн)',
 'S,жен,неод=им,ед',
 'S,жен,неод=им,ед',
 'ADV=',
 'S,жен,неод=им,ед',
 'A=(пр,ед,полн,жен|дат,ед,полн,жен|род,ед,полн,жен|твор,ед,полн,жен)',
 'V=прош,ед,прич,кр,жен,сов,страд',
 'S,муж,неод=род,ед',
 'S,сред,неод=род,мн']

In [62]:
pos_list = []