In [None]:
from bs4 import BeautifulSoup
from pymystem3 import Mystem
import os
import glob
import json
import collections
import re
import chardet


def get_encoding_type(file_path):
    """Guess the character encoding of a file using chardet.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of chardet's detection result for the
        file's raw bytes.
        NOTE(review): chardet's documentation says this entry can be None
        when no encoding is detected -- callers should be ready for that.
    """
    # Read inside a context manager so the handle is closed right away
    # instead of being left to the garbage collector.
    with open(file_path, 'rb') as raw_file:
        rawdata = raw_file.read()
    result = chardet.detect(rawdata)
    return result['encoding']


def parse_html(file_path):
    """Extract the readable text of an HTML file as a single string.

    The file is decoded with the encoding reported by
    ``get_encoding_type``, ``<script>`` and ``<style>`` elements are
    removed, and commas, slashes and line breaks are replaced by spaces.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The extracted text, with its non-empty chunks joined by single
        spaces.
    """
    encoding = get_encoding_type(file_path)
    with open(file_path, 'r', encoding=encoding) as f:
        contents = f.read()

    soup = BeautifulSoup(contents, 'html.parser')

    # Script and style bodies are code, not text to be counted.
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text()

    # Replace unwanted symbols with a space instead of deleting them:
    # deleting a newline, comma or slash glues the neighbouring words
    # together ("foo\nbar" -> "foobar") and corrupts later word counts.
    text = re.sub(r'[,\n/]+', ' ', text)

    lines = (line.strip() for line in text.splitlines())

    # Break each line on double spaces and trim every piece.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    # Drop empty pieces and join the rest with single spaces.
    text = ' '.join(chunk for chunk in chunks if chunk)

    return text


def find_html_files(directory):
    """Return the paths of all ``.html`` files under *directory*, recursively.

    Args:
        directory: Root directory to search, with or without a trailing
            path separator.

    Returns:
        A list of matching file paths (empty when nothing matches).
    """
    # Build the pattern with os.path.join so that '**' is its own path
    # component. Plain concatenation turned "data/b1" into
    # "data/b1**/*.html", which does not recurse and also matches sibling
    # directories such as "data/b10".
    pattern = os.path.join(directory, '**', '*.html')
    return glob.glob(pattern, recursive=True)


def lemmanize_text(text):
    """Lemmatize *text* with a newly created Mystem analyzer.

    Returns whatever ``Mystem.lemmatize`` produces for the given text.
    """
    return Mystem().lemmatize(text)


def count_words(words_list):
    """Tally how many times each item occurs in *words_list*.

    Returns a ``collections.Counter`` mapping each item to its count.
    """
    tally = collections.Counter()
    tally.update(words_list)
    return tally


def export_as_json(data, filename):
    """Write *data* to *filename* as UTF-8 encoded JSON.

    The output keeps non-ASCII characters unescaped, is indented by four
    spaces and has its keys sorted.
    """
    dump_options = {"ensure_ascii": False, "indent": 4, "sort_keys": True}
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, **dump_options)


# Enter directories with .html files here:
dirs = ["data/b1", "data/b2", "data/b3"]

# Collect every HTML file found under the configured directories.
html_files = []
for curr_dir in dirs:
    html_files += find_html_files(curr_dir)

# Join the per-file texts with a space. Plain concatenation would fuse the
# last word of one file with the first word of the next, and repeated
# `+=` on a growing string is needlessly quadratic.
all_text = ' '.join(parse_html(file) for file in html_files)

# Lemmatize the combined text, count the lemmas and save the result.
lemmas = lemmanize_text(all_text)
word_counts = count_words(lemmas)
export_as_json(word_counts, 'word_counts.json')
