In [20]:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from io import BytesIO
from urllib.request import urlopen
import csv

import json
import hashlib
import os
import glob
import pandas as pd
import re

In [21]:
%%bash
# Mark the chromedriver file in the working directory as executable; the
# notebook later hands './chromedriver' to Selenium's Chrome driver.
chmod +x chromedriver

In [4]:
# Path to the ChromeDriver executable that Selenium uses to control Chrome.
# (The variable name is kept unchanged because other cells may refer to it.)
binary_yandex_driver_file = './chromedriver'
options = webdriver.ChromeOptions()

try:
    # Selenium 4 expects the driver path wrapped in a Service object; passing
    # the path positionally is deprecated there and rejected by newer releases.
    from selenium.webdriver.chrome.service import Service
    driver = webdriver.Chrome(service=Service(binary_yandex_driver_file), options=options)
except TypeError:
    # Selenium 3 does not accept the `service` keyword: fall back to the
    # legacy call that takes the executable path as the first argument.
    driver = webdriver.Chrome(binary_yandex_driver_file, options=options)

  This is separate from the ipykernel package so we can avoid doing imports until


In [22]:
def parse_to_csv(category, url):
    """Scrape one category page and save its article links to a CSV file.

    The page is opened in the module-level Selenium ``driver`` and scrolled one
    screen height at a time until the scroll position passes the end of the
    document (presumably so that items added while scrolling get rendered).
    The resulting page source is parsed and one row per article item is
    written to ``./rbc_links_parser/rbc_<category>.csv``.

    The CSV has no header row; columns are written in the order
    ``date, title, url``. Items missing any of those parts are skipped.

    Args:
        category: Label used to build the output file name.
        url: Address of the category page to load.
    """
    driver.get(url)
    time.sleep(2)  # Allow 2 seconds for the web page to open
    scroll_pause_time = 1  # Pause after each scroll before measuring the page again
    screen_height = driver.execute_script("return window.screen.height;")
    i = 1

    while True:
        # Scroll one screen height each time
        driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
        i += 1
        time.sleep(scroll_pause_time)
        # Re-read the scroll height after every scroll, as it can change once the page has moved
        scroll_height = driver.execute_script("return document.body.scrollHeight;")
        # Stop when the next scroll target lies beyond the end of the document
        if screen_height * i > scroll_height:
            break

    directory_to_save = './rbc_links_parser/'
    try:
        # Create target Directory
        os.mkdir(directory_to_save)
    except FileExistsError:
        print("Directory " , directory_to_save ,  " already exists")

    # Parse before opening the output file, so that a parsing failure does not
    # leave a truncated CSV in place of one written by an earlier run.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    fieldnames = ['date', 'title', 'url']

    # newline='' is what the csv module asks for on files it writes to; the
    # explicit UTF-8 encoding keeps non-ASCII titles independent of the
    # platform's default encoding.
    with open(directory_to_save+'rbc_'+category+'.csv', 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, fieldnames)
        for element in soup.body.findAll('div', attrs={'class': 'item item_image-mob js-category-item'}):
            date_tag = element.select_one('span.item__category')
            title_tag = element.select_one('span.rm-cm-item-text')
            link_tag = element.find('a')
            if date_tag is None or title_tag is None or link_tag is None:
                # Incomplete item: skip it instead of failing on `.text` of None
                continue
            w.writerow({
                'date': date_tag.text.strip(),
                'title': title_tag.text.strip(),
                'url': link_tag.get('href'),
            })

In [20]:
# Category label -> category page to scrape.
link_to_parse = {
    'politics': 'https://www.rbc.ru/politics/?utm_source=topline',
    'economics': 'https://www.rbc.ru/economics/?utm_source=topline',
    'society': 'https://www.rbc.ru/society/?utm_source=topline',
    'business': 'https://www.rbc.ru/business/?utm_source=topline',
    'tech': 'https://www.rbc.ru/technology_and_media/?utm_source=topline',
    'finance': 'https://www.rbc.ru/finances/?utm_source=topline',
}

# Write one CSV of article links per category.
for category, link in link_to_parse.items():
    parse_to_csv(category, link)

Directory  ./rbc_links_parser/  already exists
Directory  ./rbc_links_parser/  already exists
Directory  ./rbc_links_parser/  already exists
Directory  ./rbc_links_parser/  already exists
Directory  ./rbc_links_parser/  already exists


In [23]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


def parse(link, timeout=30):
    """Download a page and return it parsed with BeautifulSoup.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned (requests raises a timeout exception in that case).

    Returns:
        A BeautifulSoup tree built from the response text with 'html.parser'.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Retry failed connection attempts up to three times with a growing delay.
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        # Fetch the page exactly once; a second identical request would only
        # double the load on the server.
        response = session.get(link, headers=headers, timeout=timeout)

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [25]:
def add_entry(dict_entry):
    """Wrap an article dict in a one-item mapping keyed by the MD5 of its body.

    Args:
        dict_entry: Article data; its 'body' value must be a string.

    Returns:
        ``{md5_hex_of_body: dict_entry}``. When the key cannot be computed
        (for example 'body' is missing), the error and the entry are printed
        and an empty dict is returned. An empty dict is falsy and is a no-op
        when passed to ``dict.update``, so callers need no special handling.
    """
    try:
        digest = hashlib.md5(dict_entry['body'].encode()).hexdigest()
    except Exception as e:
        print(e, dict_entry)
        return {}
    return {digest: dict_entry}
        
# CSS classes of page fragments that are stripped from an article before its
# text is stored.
trash = ['article__main-image', 'article__header__info-block', 'pro-anons', 'article__authors', 'social-networks__content', 'article__inline-item']

# Load the existing corpus once, instead of re-reading it for every article.
try:
    with open('data_file.json') as json_file:
        data = json.load(json_file)
except FileNotFoundError:
    data = {}
except json.JSONDecodeError as e:
    print('data_file.json is not valid JSON, starting from an empty corpus:', e)
    data = {}

try:
    for file_name in glob.glob('./rbc_links_parser/'+'*.csv'):
        # The link files have no header row; parse_to_csv writes their columns
        # in the order date, title, url.
        df = pd.read_csv(file_name, names=['date', 'title', 'url'])
        # 'rbc_politics.csv' -> 'politics'. Only the base name is used, so
        # underscores in directory names cannot shift the result.
        category = os.path.splitext(os.path.basename(file_name))[0].split('_', 1)[-1]

        for link in df.url.dropna():
            soup = parse(link)
            corpus = {}

            for text_div in soup.findAll('div', attrs={'class': 'l-col-main'}):
                # Detach the unwanted fragments from the article container.
                for trash_class in trash:
                    for t in text_div.findAll('div', attrs={'class': trash_class}):
                        t.extract()

                tag_bars = text_div.findAll('div', attrs={'class': 'article__tags__container'})
                headline = text_div.find('h1')
                overview = text_div.find('div', attrs={'class': 'article__text__overview'})
                if not tag_bars or headline is None or overview is None or overview.span is None:
                    # Not a complete article container: skip it rather than
                    # reuse values left over from a previous article.
                    continue

                # When several tag bars exist, the last one is used.
                tag_bar = tag_bars[-1]
                corpus['headline'] = str(headline.text).strip()
                corpus['overview'] = overview.span.text
                corpus['category'] = category
                corpus['tags'] = [tag.string for tag in tag_bar.findAll('a')]
                # Remove the tag bar so its text does not end up in the body.
                tag_bar.extract()
                corpus['body'] = re.sub(r'\s+', ' ', text_div.text)

            if corpus:
                entry = add_entry(corpus)
                # add_entry yields nothing usable when the entry could not be
                # hashed; only merge real results.
                if entry:
                    data.update(entry)
finally:
    # Persist whatever was collected, even if a download failed part-way.
    with open("data_file.json", "w") as write_file:
        json.dump(data, write_file)

In [32]:
import bigjson
def query_big_json(article_id):
    """Look up the category of one article in ./data_file.json via bigjson.

    Args:
        article_id: Key of the article in the top-level JSON object; it is
            converted with ``str()`` before the lookup.

    Returns:
        The tuple ``(article_id, category)``.
    """
    with open('./data_file.json', 'rb') as data_set:
        data = bigjson.load(data_set)
        # Resolve the value while the file is still open.
        # NOTE(review): bigjson presumably reads from the file on access —
        # confirm before moving this lookup outside the `with` block.
        category = data[str(article_id)]['category']

    return (article_id, category)

In [37]:
# Randomly pick an article hash from the corpus and look it up as a smoke test.
import random
df = pd.read_json('data_file.json')
df = df.transpose()  # one row per article, indexed by its hash
h = random.choice(list(df.index))

# Query the hash that was actually drawn, rather than a hard-coded one that
# exists only in one particular copy of data_file.json.
query_big_json(h)

('30353924877bc034ba054e46311f301b', 'business')

In [40]:
# Preview the first three rows of the transposed corpus table.
df.head(3)

Unnamed: 0,tags,overview,headline,body,category
30353924877bc034ba054e46311f301b,"[вакцинация от COVID-19, ВТО, экспорт, базы да...",Россия занимает пятое место в мире по объему п...,Россия заняла пятое место в мире по экспорту в...,Россия заняла пятое место в мире по экспорту ...,business
fc0b7fb1406eb3bb4f29a9335e25d65e,"[вакцинация от COVID-19, ВТО, экспорт, базы да...",Россия занимает пятое место в мире по объему п...,Россия заняла пятое место в мире по экспорту в...,Россия заняла пятое место в мире по экспорту ...,business
eb08734fd3d421e15ebbcc81f53398b9,"[черная металлургия, водородная энергетика]",Владелец «Северстали» Алексей Мордашов оценил ...,Мордашов оценил переход российской металлургии...,Мордашов оценил переход российской металлурги...,business


In [1]:
import json

def remove_article(article_id):
    """Delete one article from data_file.json, if it is present.

    The whole file is loaded, the entry stored under ``article_id`` is dropped
    (a missing key is ignored), and the remaining mapping is written back.

    Args:
        article_id: Key of the article to remove.
    """
    path = 'data_file.json'

    with open(path, 'r') as source:
        articles = json.load(source)

    # pop() with a default never raises for an unknown key
    articles.pop(article_id, None)

    with open(path, 'w') as target:
        json.dump(articles, target)

In [41]:
import hashlib
def hash_it(file):
    """Return ``{sha256_hex_of_file_contents: category}`` for one file.

    The category is read from the file's base name: the part after the first
    underscore, cut at the next underscore or dot, e.g. ``'oo_politics.txt'``
    gives ``'politics'``. Directory components are ignored, so underscores in
    folder names do not affect the result.

    Args:
        file: Path of the file to hash.

    Returns:
        A one-item dict mapping the hexadecimal SHA-256 digest of the file's
        bytes to the category string.

    Raises:
        IndexError: If the base name contains no underscore.
    """
    base_name = os.path.basename(file)
    category = str(base_name.split('_')[1].split('.')[0])
    BLOCK_SIZE = 65536  # The size of each read from the file

    file_hash = hashlib.sha256()
    with open(file, 'rb') as f:
        # Feed the file to the hash block by block until read() returns b''
        for fb in iter(lambda: f.read(BLOCK_SIZE), b''):
            file_hash.update(fb)

    return {file_hash.hexdigest(): category}

It would be useful to have two dictionaries, both keyed by the hash of an article's text body:\
the first maps each hash to the article's labeled (original) category,\
and the second maps each hash to the category assigned after topic modelling.\
To gain insight into our articles, we then simply compare the two dictionaries hash by hash and try to understand the pattern behind the clustering.

In [42]:
# Hash both sample files and collect the resulting {digest: category} pairs
# in a single mapping.
as_original = {}
for sample_file in ("oo_politics.txt", "rbc_politics.csv"):
    as_original.update(hash_it(sample_file))

In [44]:
# Hand-written {digest: category} mapping that is compared against
# `as_original` further down (presumably a stand-in for topic-model output).
as_classified = dict(
    [
        ('e6ac6518a2afbe64f54b5b590a79c25c7acd6978bdb1b4fcd8782bb928e2ba96', 'economy'),
        ('e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', 'politics'),
    ]
)

In [45]:
# Display the as_classified mapping.
as_classified

{'e6ac6518a2afbe64f54b5b590a79c25c7acd6978bdb1b4fcd8782bb928e2ba96': 'economy',
 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855': 'politics'}

In [46]:
# Collect every hash present in both mappings whose assigned category differs
# from the original one; the value kept is the assigned category.
misclassified = {}
for article_hash, original_label in as_original.items():
    if article_hash not in as_classified:
        continue
    assigned_label = as_classified[article_hash]
    if original_label != assigned_label:
        misclassified[article_hash] = assigned_label
print(misclassified)

{'e6ac6518a2afbe64f54b5b590a79c25c7acd6978bdb1b4fcd8782bb928e2ba96': 'economy'}
