This notebook is dedicated to parse RBC website and extract various articles, from the main 6 categories:
* Politics
* Social
* Tech
* Business
* Finance
* Economy

Then the results are saved into 2 formats: `JSON` and `txt`
* `JSON` files, are either indexed by the hash of the article text itsself, for a more convient checkup and later operational matchings. Or they are indexed by the enumeration of the articles, starting form `0` ending with `Total count of articles-1`. Both also include for each article its `category`, `article_overview` and auxiliary `tags` for extra more info.
* `Txt` files on the other hand, made for faster dynamic separtion and uploads. They are as follows: One file for all headlines, separated by a new line and a special charachter(to avoid mixing them up while traversing them). The second file dedicated for all body text of the articles, and the last one is for all overview(annotation at the top of the articles).

In [1]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from io import BytesIO
from urllib.request import urlopen

import json
import hashlib
import os
import glob
import re

In [120]:
options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)




[WDM] - Current google-chrome version is 100.0.4896
[WDM] - Get LATEST chromedriver version for 100.0.4896 google-chrome
[WDM] - Driver [/home/aliak/.wdm/drivers/chromedriver/linux64/100.0.4896.60/chromedriver] found in cache


In [121]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


def parse(link):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    session.get(link, headers=headers)
    response = session.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [122]:
def write_To_json(file_name, entry):
    try :
        with open(file_name) as json_file:
            data = json.load(json_file)
    except :
        data = {}

    data.update(entry)

    with open(file_name, "w+") as write_file:
        json.dump(data, write_file)

In [123]:
def index_with_hash(dict_entry):
    try:
        return {hashlib.md5(dict_entry['article_text'].encode()).hexdigest(): dict_entry}
    except Exception as e:
        print(e, dict_entry)

In [126]:
def parse_page(category, url, file_name):
    driver.get(url)
    time.sleep(2)  # Allow 2 seconds for the web page to open
    scroll_pause_time = 1 # You can set your own pause time. My laptop is a bit slow so I use 1 sec
    screen_height = driver.execute_script("return window.screen.height;")   # get the screen height of the web
    i = 1

    while True:
        # scroll one screen height each time
        driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))  
        i += 1
        time.sleep(scroll_pause_time)
        # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
        scroll_height = driver.execute_script("return document.body.scrollHeight;")  
        # Break the loop when the height we need to scroll to is larger than the total scroll height
        if (screen_height) * i > scroll_height:
            break 

        
    soup = BeautifulSoup(driver.page_source, "html.parser")
    articles = {}

    for element in soup.body.findAll('div', attrs={'class': 'item item_image-mob js-category-item'}):
        articles['url'] = element.find('a').get('href')
        articles['date'] = element.select_one('span.item__category').text.strip()
        articles['title'] = element.select_one('span.rm-cm-item-text').text.strip()
        articles['category'] = category

        soup = parse(articles['url'])
        corpus = {}
        trash = ['article__main-image', 'article__header__info-block', 'pro-anons', 'article__authors', 'social-networks__content', 'article__inline-item']

        for text_div in soup.findAll('div', attrs={'class':'l-col-main'}):
            extracted_trash = [t.extract() for trash_class in trash for t in text_div.findAll('div', attrs={'class':trash_class})]
            del extracted_trash

            for tag_bar in text_div.findAll('div', attrs={'class':'article__tags__container'}):
                tags = [tag.string for tag in tag_bar.findAll('a') if tag_bar is not None ]

            headline = text_div.find('h1')
            overview = text_div.find('div', attrs = {'article__text__overview'})
            if text_div is not None and overview is not None and headline is not None and tags is not None:
                corpus['headline'] = str(headline.text).strip()
                corpus['article_overview'] = overview.span.text
                corpus['category'] = category
                corpus['tags'] = tags
                tag_bar.extract()
                corpus['article_text'] = (re.sub(r'\s+', ' ', text_div.text)).replace('Авторы Теги', '')
                corpus['article_text'] = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', corpus['article_text'])
            else:
                continue
        if bool(corpus):        
            entry = index_with_hash(corpus)
        try :
            with open(file_name) as json_file:
                data = json.load(json_file)
        except :
            data = {}
            
        data.update(entry)

        with open(file_name, "w+") as write_file:
            json.dump(data, write_file)

In [None]:
%%time
link_to_parse = {}
link_to_parse['politics'] = 'https://www.rbc.ru/politics/?utm_source=topline'
link_to_parse['economics'] = 'https://www.rbc.ru/economics/?utm_source=topline'
link_to_parse['society'] = 'https://www.rbc.ru/society/?utm_source=topline'
link_to_parse['business'] = 'https://www.rbc.ru/business/?utm_source=topline'
link_to_parse['tech'] = 'https://www.rbc.ru/technology_and_media/?utm_source=topline'
link_to_parse['finance'] = 'https://www.rbc.ru/finances/?utm_source=topline'


file_name = '../dataset/hashed_shuffled_rbc.json'
for category, link in link_to_parse.items():
    parse_page(category, link, file_name)

In [1]:
import pandas as pd
from sklearn.utils import shuffle
df = pd.read_json('../dataset/hashed_shuffled_rbc.json')
df = df.transpose()
df = shuffle(df)
df.to_json('../dataset/hashed_shuffled_rbc.json', orient='index')

In [2]:
df_enumerated = df.reset_index(drop=True)

In [4]:
df_enumerated.to_json('../dataset/enumerated_shuffled_rbc.json', orient='index')

In [None]:
def split_to_textfiles(dataset='../dataset/enumerated_shuffled_rbc.json', headlines_txt_path='../dataset/rbc_only_headlines.txt', annotation_txt_path='../dataset/rbc_only_annotation.txt', article_body_txt_path='../dataset/rbc_only_body.txt'):
    df = pd.read_json(dataset)
    df = df.transpose()
    special_separator = '<@$>\n\n'
    with open(headlines_txt_path, 'w', encoding = 'utf-8') as h, open(annotation_txt_path, 'w', encoding = 'utf-8') as a, open(article_body_txt_path, 'w', encoding = 'utf-8') as b:
        for _, rec in df.iterrows():
            h.write(rec['headline'] + special_separator)
            a.write(rec['article_overview'] + special_separator)
            b.write(rec['article_text'] + special_separator)

In [95]:
import pandas as pd

In [114]:
df = pd.read_json('../dataset/rbc_2k.json')

In [115]:
df = df.transpose()

In [123]:
import nltk
from nltk.translate import bleu
from nltk.translate.bleu_score import SmoothingFunction
smoothie = SmoothingFunction().method4

for i in range(len(df)):
    headline = df['headline'][0].lower()
    headline = headline.split('.')[0]
    text = df['article_text'][i]
    
    C1= headline
    C2= text.split('.')[0].lower()
    score = bleu([C1], C2, smoothing_function=smoothie)
    if score > 0.4:
        print('BLEUscore:', C1, C2, score)

BLEUscore: пресс-конференция владимира путина   в столичном манеже прошла пресс-конференция президента россии владимира путина 0.4037500002676021
BLEUscore: пресс-конференция владимира путина   в московском манеже прошла ежегодная пресс-конференция владимира путина 0.45433186333194825


### Ria Pre-proccessing

In [143]:
def read_ria_records(file_name, dataset_size, shuffle=False):
    records = []
    count = 0
    with open(file_name, "r", buffering=100000) as r:
        for line in r:
            records.append(json.loads(line))
            count+=1
            if count == dataset_size:
                break
    if shuffle:
        random.shuffle(records)
    return records

In [144]:
ria_records = read_ria_records('../dataset/full_ria.json', 2000, shuffle=False)

In [145]:
ria_df = pd.DataFrame.from_records(ria_records)

In [146]:
ria_df = ria_df.iloc[:2000]

In [147]:
ria_df['text'][1]

'<p><strong></strong></p>\n<p><strong>киев, 31 янв - риа новости, марина шмаюн.</strong> премьер-министр украины, кандидат в президенты юлия тимошенко в воскресенье в прямом эфире <a href="http://www.1plus1.ua/" target="_blank">украинского телеканала 1+1</a> заявила, что в случае ее победы на выборах президента юрий луценко будет работать в ее команде.</p>\n<p>17 января в украине состоялся первый тур выборов президента, по итогам которого <a href="http://rian.ru/politics/20100125/206201353.html" target="_blank">виктор янукович набрал 35,32% голосов, а премьер-министр страны юлия тимошенко оказалась на втором месте с результатом 25,05%</a>. второй тур выборов президента украины <a href="http://www.rian.ru/trend/ukraine_election_18012010/" target="_blank">состоится 7 февраля</a>.</p>\n<p>парламент украины по инициативе партии регионов 28 января <a href="http://rian.ru/trend/ukr_lutsenko_retired_28012010/" target="_blank">отправил в отставку главу мвд юрия луценко</a>, однако премьер-мини

In [148]:
from bs4 import BeautifulSoup
import unicodedata
def remove_html_tags(text):
    text_string = BeautifulSoup(text, "lxml").text
    clean_text = unicodedata.normalize("NFKD",text_string).replace('\n','')
    return clean_text

In [149]:
columns_titles = ["title","text"]
ria_df=ria_df.reindex(columns=columns_titles)

In [150]:
def clean_date(text):
    clean_text = '.'.join(text.split('.')[1:])
    return clean_text

In [151]:
ria_df['text'] = ria_df['text'].apply(func = remove_html_tags)
ria_df['text'] = ria_df['text'].apply(func = clean_date)
ria_df['title'] = ria_df['title'].apply(func = remove_html_tags)

In [152]:
ria_df.to_json('../dataset/ria_2k.json',orient='index')

### Gazeta Preparation

In [6]:
def read_gazeta_records(file_name, shuffle=False, sort_by_date=True):
    assert shuffle != sort_by_date
    records = []
    with open(file_name, "r") as r:
        for line in r:
            records.append(json.loads(line))
    if sort_by_date:
        records.sort(key=lambda x: x["date"])
    if shuffle:
        random.shuffle(records)
    return records

In [7]:
test_records = read_gazeta_records("../dataset/gazeta_test.jsonl")

In [78]:
gazeta_df = pd.DataFrame.from_records(test_records)

In [14]:
gazeta_df = gazeta_df.iloc[:2000]

In [None]:
gazeta_df.to_json('../dataset/gazeta_2k.json',orient='index')