In [1]:
from bs4 import BeautifulSoup
import requests
from IPython.display import clear_output
from tqdm.notebook import tqdm

In [2]:
def get_html_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a single
            stalled connection cannot hang a long crawl.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        server answers with an error status (``response.ok`` is false).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort fetch: network-level failures are reported as "no page".
        # Only requests' own errors are caught so KeyboardInterrupt still
        # stops the crawl.
        return None
    return response.text if response.ok else None

In [3]:
def get_all_links(cat_links_file='./cat_tyv_links_wiki.txt',
                  domain='https://tyv.wikipedia.org'):
    """Collect the links listed on every category page named in a file.

    Args:
        cat_links_file: Text file holding whitespace-separated category page
            URLs.
        domain: Prefix prepended to every ``href`` found on a category page.

    Returns:
        A list of ``domain + href`` strings, in the order they were found.
        Category pages that cannot be downloaded, or that have no
        ``mw-category-generated`` container, are skipped.
    """
    with open(cat_links_file, 'r', encoding='utf-8') as f:
        cat_links = f.read().split()

    all_links = []
    for link in tqdm(cat_links):
        html = get_html_page(link)
        if not html:
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')
        listing = soup.find('div', {'class': 'mw-category-generated'})
        if listing is None:
            # No generated listing on this page; skip it instead of crashing
            # the whole crawl with an AttributeError.
            continue

        # href=True ignores anchors that carry no href attribute.
        for tag_a in listing.find_all('a', href=True):
            all_links.append(domain + tag_a['href'])

    return all_links

In [4]:
# links = get_all_links()

In [9]:
class ArticlesCollection():
    """Downloads article pages and accumulates their raw text.

    Attributes are documented inline in ``__init__``. Each entry appended to
    ``dirty_corpus`` has the form ``"<link><><title><><paragraph text>"``,
    i.e. three fields joined by the literal separator ``'<>'``.
    """

    def __init__(self, page_links=None, filename='./all_links.txt'):
        """Create a collection.

        Args:
            page_links: Optional iterable of article URLs to start with.
            filename: Path of the text file read by ``make_list_from_file``.
        """
        # Copy the caller's links into a fresh list: a shared mutable default
        # is avoided and the argument is actually honoured.
        self.page_links = list(page_links) if page_links else []
        self.filename = filename
        # Raw "link<>title<>text" records filled in by make_raw_corpus.
        self.dirty_corpus = []

    def get_html_page(self, url, timeout=10):
        """Return the body of ``url`` as text, or ``None`` on any failure.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Best-effort fetch; KeyboardInterrupt is deliberately not caught.
            return None
        return response.text if response.ok else None

    def make_list_from_file(self):
        """Replace ``page_links`` with the whitespace-separated URLs in ``filename``."""
        with open(self.filename, mode='r', encoding='utf-8') as f:
            self.page_links = f.read().split()

    def make_raw_corpus(self, num_links=None):
        """Download pages and append one raw record per page to ``dirty_corpus``.

        Args:
            num_links: Process only the first ``num_links`` links; ``None``
                processes all of them and ``0`` processes none.

        Pages that cannot be downloaded, or that lack the ``firstHeading``
        title or the ``bodyContent`` container, are skipped.
        """
        # Slicing with None yields the whole list, so no special case is needed.
        for link in tqdm(self.page_links[:num_links]):
            html = self.get_html_page(link)
            if not html:
                continue

            soup = BeautifulSoup(html, 'html.parser')
            title_tag = soup.find('h1', {'id': 'firstHeading'})
            body_tag = soup.find('div', {'id': 'bodyContent', 'class': 'vector-body'})
            if title_tag is None or body_tag is None:
                # Unexpected page layout: skip it rather than abort the run.
                continue

            paragraphs = ' '.join(tag.text for tag in body_tag.find_all('p'))
            self.dirty_corpus.append('<>'.join((link, title_tag.text, paragraphs)))

In [10]:
PagesCollector = ArticlesCollection()

PagesCollector.make_list_from_file()
PagesCollector.make_raw_corpus()

  0%|          | 0/5337 [00:00<?, ?it/s]

In [None]:
# Default character set for is_tyv_token; mirrors the `tyv_alphabet` string
# used by clean_text (upper- and lower-case letters).
_TYV_ALPHABET = frozenset(
    'АБВГДЕЁЖЗИЙКЛМНҢОӨПРСТУҮФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнңоөпрстуүфхцчшщъыьэюя'
)


def is_tyv_token(token, alphabet=_TYV_ALPHABET):
    """Tell whether every character of ``token`` belongs to ``alphabet``.

    Args:
        token: String to check.
        alphabet: Container of allowed characters; defaults to the built-in
            alphabet defined next to this function.

    Returns:
        ``True`` when all characters are allowed (an empty token is therefore
        ``True``), ``False`` as soon as one character is not.
    """
    return all(ch in alphabet for ch in token)

In [53]:
import re
def clean_text(text):
    """Reduce ``text`` to lower-case alphabet words separated by single spaces.

    Every character that is not in ``tyv_alphabet`` (digits, punctuation,
    Latin letters, whitespace, ...) is treated as a word separator. The
    remaining words are lower-cased and joined with exactly one space, with
    no leading or trailing space.

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned string; ``''`` when ``text`` is empty or contains no
        alphabet characters.
    """
    tyv_alphabet = 'АБВГДЕЁЖЗИЙКЛМНҢОӨПРСТУҮФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнңоөпрстуүфхцчшщъыьэюя'
    allowed = frozenset(tyv_alphabet)

    # Build the filtered string in one pass instead of repeated concatenation.
    filtered = ''.join(ch if ch in allowed else ' ' for ch in text)

    # split() with no arguments drops leading/trailing blanks and collapses
    # runs of spaces, which also covers the empty-string case without needing
    # to index into the string.
    return ' '.join(filtered.lower().split())

In [62]:
# Write one "title,text" line per downloaded article. The encoding is fixed to
# UTF-8 because the cleaned text contains letters (e.g. ң, ө, ү) that many
# platform-default encodings cannot represent.
with open('./clean_texts.txt', 'w', encoding='utf-8') as f:
    for raw_text in PagesCollector.dirty_corpus:
        # Records are "link<>title<>text"; maxsplit=2 keeps any further '<>'
        # inside the article text from breaking the three-way unpacking.
        _, title, text = raw_text.split('<>', 2)

        f.write(','.join((clean_text(title), clean_text(text))))
        f.write('\n')