There was an environment preparation part. Main repository should be cloned and all the requirements from it should be installed

In [2]:
!mkdir poetry_data

In [5]:
!pip install url-normalize jellyfish

Collecting url-normalize
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Collecting jellyfish
  Downloading jellyfish-0.11.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: url-normalize, jellyfish
Successfully installed jellyfish-0.11.2 url-normalize-1.4.3
[0m

In [6]:
import numpy as np
import pandas as pd
import urllib
from urllib.parse import urlparse, urldefrag, urljoin
from urllib.request import urlopen
from url_normalize import url_normalize
from bs4 import BeautifulSoup
from queue import Queue
import time
import jellyfish
import re
import os
import requests
import pickle

In [7]:
from src.metre_classifier.stress.dict import StressDict
from src.metre_classifier.stress.predictor import CombinedStressPredictor
from src.metre_classifier.markup.markup import Markup
from src.metre_classifier.metre_classifier import MetreClassifier



In [8]:
stress_predictor = CombinedStressPredictor()

Downloading (…)okenizer_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading (…)in/char_tokenizer.py:   0%|          | 0.00/3.91k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/129 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/12.8M [00:00<?, ?B/s]

In [9]:
def len_check(text_str, avg_len, percent):
    if np.abs(len(text_str) - avg_len) > percent * avg_len:
        return False
    return True

In [10]:
def close_to_strlist(text_str, str_list, diff_percentage):
    for a in str_list: 
        levenshtein_distance = jellyfish.levenshtein_distance(text_str.lower(), a.lower())
        if (levenshtein_distance < len(a) * diff_percentage):
            return True
    return False

In [11]:
def classify_metre(text: str, stress_predictor) -> str:
    return MetreClassifier.classify_metre(Markup.process_text(text, stress_predictor)).metre

In [12]:
def get_meta(soup):
    meta = soup.find_all('meta')
    for tag in meta:
        if 'name' in tag.attrs.keys():
            if tag.attrs['name'] == 'keywords':
                keywords = tag.attrs['content']
                splitter = keywords.find(',')
                author = keywords[:splitter].strip()
                name = keywords[(splitter + 1):].strip()
    return {'name': name, 'author': author}

In [13]:
def get_raw_text(div_text, add_meta_list):
    raw_text = ''
    t = div_text[0]
    substrings = []
    lens = []
    bad_starts = ['посвещается', '*', 'примечание'] + add_meta_list
    for c in t.children:
        sub = c.text.split('\n')
        for s in sub:
            if len(s) > 0:
                s = re.sub(r'[^-\w\s]', '', s.strip().lower())
                s = re.sub('\s+', ' ', s.strip())
                if not s.startswith(tuple(bad_starts)):
                    substrings.append(s)
                    lens.append(len(s))
    
    avg_len = np.mean(lens)
    for sub in substrings:
        if not close_to_strlist(sub, add_meta_list, 0.05) and len_check(sub, avg_len, 0.7):
            raw_text += sub + '\n'
            
    return raw_text

In [14]:
start_page = 'https://rupoem.ru/'
base_page = 'https://rupoem.ru/'

In [16]:
t = 0
q = []
q.append(start_page)
already_visited = set([])
n_downloads = 0

names_list, authors_list, links_list, texts_list, metre_list = [], [], [], [], []
batch_df = pd.DataFrame(columns = ['source_url', 'author', 'name', 'poetry', 'metre'])
batch_df.to_csv('../poetry_data/poetry_part_' + str(t) + '.csv', mode = 'a', header = True, index = False)

saveness = 100
partition_change = 5000
time_start = time.time()

while len(q) > 0:
    url = url_normalize(q.pop(0))
    if url in already_visited or url.endswith('/all.aspx'):
        continue
    already_visited.add(url)
    
    try:
        html = requests.get(url_normalize(url)).text
    except Exception as e:
        print('on url ', url, ' get')
        print(e)
        html = None
        
    if html is not None:
        soup = BeautifulSoup(html)
        if url.endswith('.aspx'):
            div_text = soup.find_all('div', class_='poem-text font-size-larger')
            if div_text:
                add_meta = get_meta(soup)
                raw_text = get_raw_text(div_text, [add_meta['author']])
                
                if raw_text is not None:
                    metre = classify_metre(raw_text, stress_predictor) 
                    if metre in ['iambos', 'choreios']:            
                        names_list.append(add_meta['name'])
                        authors_list.append(add_meta['author'])
                        links_list.append(url)
                        texts_list.append(raw_text)
                        metre_list.append(metre)
                        n_downloads += 1
                        
                        # saving
                        if n_downloads % saveness == 0:    
                            batch_df['source_url'] = links_list
                            batch_df['author'] = authors_list
                            batch_df['name'] = names_list
                            batch_df['poetry'] = texts_list
                            batch_df['metre'] = metre_list
                            batch_df.to_csv(
                                '../poetry_data/poetry_part_' + str(t) + '.csv', 
                                mode = 'a', header = False, index = False
                            )
                            links_list, authors_list, names_list, texts_list, metre_list = [], [], [], [], []
                            
                            with open('query_list.txt','wb') as f:
                                pickle.dump(q, f)
                            with open('visited_set.txt', 'wb') as f:
                                pickle.dump(already_visited, f)
                                
                            # logs    
                            print('saved')
                            time_running = time.time() - time_start
                            print('time_running = ', time_running)
                            print('n_downloads = ', n_downloads)
                            print('queue size = ', len(q))
                            print('visited size = ', len(already_visited))
                            
                            if time_running > 42000: 
                                print('stopped due to long running time')
                                break
                                
                            if n_downloads % partition_change == 0:
                                t += 1
                                batch_df = pd.DataFrame(columns = ['source_url', 'author', 'name', 'poetry', 'metre'])
                                batch_df.to_csv(
                                    '../poetry_data/poetry_part_' + str(t) + '.csv',
                                    mode = 'a', header = True, index = False
                                )
                                print('started a new partition')
                                
                            if n_downloads > 200000:
                                print('stopped due to a lot of poetry')
                                break
        
        children_links = [urljoin(url, link.get('href')) for link in soup.findAll('a')]
        for child in children_links:
            if url_normalize(child) not in already_visited and url_normalize(child).startswith(base_page) and \
                    child != url + '/all.aspx':
                q.append(child)
            
        # time.sleep(sleeping_time)



saved
time_running =  230.81140756607056
n_downloads =  100
queue size =  58234
visited size =  705
saved
time_running =  416.4208393096924
n_downloads =  200
queue size =  86348
visited size =  907
saved
time_running =  653.271630525589
n_downloads =  300
queue size =  89449
visited size =  1093
saved
time_running =  857.2503685951233
n_downloads =  400
queue size =  91927
visited size =  1241
saved
time_running =  1095.2665977478027
n_downloads =  500
queue size =  94651
visited size =  1392
saved
time_running =  1299.6306691169739
n_downloads =  600
queue size =  97006
visited size =  1537
saved
time_running =  1545.1770269870758
n_downloads =  700
queue size =  99920
visited size =  1694
saved
time_running =  1798.6908738613129
n_downloads =  800
queue size =  102375
visited size =  1870
saved
time_running =  1993.016880273819
n_downloads =  900
queue size =  104966
visited size =  2016
saved
time_running =  2218.3490500450134
n_downloads =  1000
queue size =  107447
visited size =

In [None]:
all_df = []
starting_dir = '../poetry_data/'
for f in os.listdir(starting_dir):
    if f.endswith('.csv'):
        df = pd.read_csv(starting_dir + f)
        all_df.append(df)

In [None]:
main_df = pd.concat(all_df, ignore_index = True).drop_duplicates(subset = 'poetry', ignore_index = True)
main_df = main_df.dropna(subset = 'poetry')

In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(main_df, test_size=0.1)

In [None]:
train = train.reset_index()
train = train.drop(columns = ['index'])
test = test.reset_index()
test = test.drop(columns = ['index'])

In [None]:
display(test)

In [None]:
train.to_csv('../poetry_data/train.csv', header = True, index = False)
test.to_csv('../poetry_data/test.csv', header = True, index = False)