In [None]:
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from random import randint
from bs4 import BeautifulSoup
from langdetect import detect
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

In [None]:
# Root directory of the project data; the two raw-data directories below are
# resolved relative to it.
WORK_PATH = '../../__STS'

# os.listdir returns entries in arbitrary order, so the listings are sorted to
# keep the processing order (and everything derived from it) stable between runs.
RAW_DATA_PATH_BLACK = 'raw_data_black'
all_black_htmls = sorted(os.listdir(f'{WORK_PATH}/{RAW_DATA_PATH_BLACK}'))
print(len(all_black_htmls))
print(all_black_htmls[:10])

RAW_DATA_PATH_WHITE = 'raw_data_white'
all_white_htmls = sorted(os.listdir(f'{WORK_PATH}/{RAW_DATA_PATH_WHITE}'))
print(len(all_white_htmls))
print(all_white_htmls[:10])

In [None]:
def get_text(html):
    """
    Extract the visible text of an HTML page as a single line.

    <script> and <style> elements are removed before the text is read, and
    every run of whitespace is collapsed to one space.

    Args:
        html: HTML markup of the page (anything BeautifulSoup accepts).

    Returns:
        str: the page text with single spaces between words and no leading or
        trailing whitespace; an empty string if the page has no text.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(['script', 'style']):
        # The removed tags are never used again, so they can be destroyed.
        tag.decompose()
    # Join text nodes with a space: with the default empty separator the text
    # of adjacent elements is glued together ("<p>a</p><p>b</p>" -> "ab").
    page_text = soup.get_text(separator=' ')
    # str.split() without arguments splits on any whitespace (newlines, tabs
    # and carriage returns included), so this alone normalises the spacing.
    return ' '.join(page_text.split())

In [None]:
def collect_texts(all_htmls, path):
    """
    Read saved HTML files and build one record per file.

    Args:
        all_htmls: iterable of file names located in ``path``.
        path: directory that contains the files.

    Returns:
        list[dict]: one dict per successfully read file with the keys
        'site_name' (file name without a trailing '.html'), 'text' (output of
        get_text) and 'language' (result of detect, or 'unknown' when the
        text is empty or detection fails).

    Files that cannot be read or parsed are reported on stdout and skipped.
    """
    suffix = '.html'
    data_list = []
    for html_file in tqdm(all_htmls):
        try:
            with open(f'{path}/{html_file}', 'r', encoding='utf-8') as file:
                html = file.read()
            text = get_text(html)
            # Strip only a trailing '.html': str.replace would also remove the
            # substring from the middle of names such as 'a.html5.com.html'.
            if html_file.endswith(suffix):
                site_name = html_file[:-len(suffix)]
            else:
                site_name = html_file
            language = 'unknown'
            if text:
                try:
                    language = detect(text)
                except Exception as e:
                    # A detection failure is not a reason to lose the page:
                    # keep the record and leave its language as 'unknown'.
                    print(html_file, '| language detection error |', e)
            data_list.append({
                'site_name': site_name,
                'text': text,
                'language': language,
            })
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(html_file, '| error |', e)
    return data_list

In [None]:
# Collect records for the files in the 'white' raw-data directory; keyword
# arguments make the role of each argument explicit.
data_list = collect_texts(
    all_htmls=all_white_htmls,
    path=f'{WORK_PATH}/{RAW_DATA_PATH_WHITE}',
)

In [None]:
# Inspect the first collected record (raises IndexError if nothing was collected).
data_list[0]

In [None]:
# Build a frame from the collected records (one row per record), then show
# its shape and the first 20 rows.
df_data = pd.DataFrame(data_list)
print(df_data.shape)
display(df_data.head(20))

In [None]:
# Number of rows per value of the 'language' column.
df_data.groupby(['language'])['language'].count()

In [None]:
import pymorphy2 as pm
import nltk
import multiprocessing
from multiprocessing import Pool
# Number of worker processes: the machine's CPU count, capped by the CPU_LIMIT
# environment variable when it is set. The limit is parsed through float so
# values such as '1.5' are accepted. An unset or empty CPU_LIMIT falls back to
# the CPU count instead of raising KeyError, and the result is clamped to at
# least 1 because a pool cannot be created with 0 workers.
N_CORES = max(1, min(
    multiprocessing.cpu_count(),
    int(float(os.environ.get('CPU_LIMIT') or multiprocessing.cpu_count()))
))
print('cores:', N_CORES)

In [None]:
# Start with the English pages only. Filtering and re-indexing in one chained
# expression returns a fresh frame with a 0..n-1 index, instead of mutating a
# filtered slice in place; in-place edits of such a slice can trigger pandas'
# SettingWithCopy warning when columns are added to it later.
df_data_en = df_data[df_data.language == 'en'].reset_index(drop=True)

In [None]:
# Language whose NLTK stopword list is loaded below.
LANG = 'english' # 'russian' or 'english'
# Shared pymorphy2 analyzer; preprocessing() reads it as a global.
MORPH = pm.MorphAnalyzer()
# Fetch the NLTK 'stopwords' corpus, then load the word list for LANG.
nltk.download('stopwords')
STOPWORDS = nltk.corpus.stopwords.words(LANG)

In [None]:
def preprocessing(sentence, as_list=False):
    """
    Normalise a text: keep letters only, lowercase, lemmatise, drop noise words.

    Steps:
      1. replace every run of characters other than Cyrillic/Latin letters
         with a single space, strip and lowercase the result;
      2. fold 'ё' into 'е';
      3. take the first MORPH.parse() result of each word;
      4. drop words whose POS tag is INTJ, PRCL, CONJ or PREP and words whose
         normal form is listed in STOPWORDS.

    Args:
        sentence (str): raw text.
        as_list (bool): return the tokens as a list instead of a string.

    Returns:
        list[str] if ``as_list`` is true, otherwise the tokens joined by
        single spaces (an empty string when nothing is left).
    """
    # 'ё' and 'Ё' lie outside the а-я / А-Я ranges, so they must be listed
    # explicitly; otherwise they are replaced by spaces, words get cut
    # ('всё' -> 'вс') and the 'ё' -> 'е' folding below never matches.
    s = re.sub('[^а-яА-ЯёЁa-zA-Z]+', ' ', sentence).strip().lower()
    s = s.replace('ё', 'е')
    function_words = {'INTJ', 'PRCL', 'CONJ', 'PREP'}
    # Set lookup instead of scanning the stopword list for every token.
    stopwords = set(STOPWORDS)
    result = []
    for word in s.split():
        parsed = MORPH.parse(word)[0]
        if parsed.tag.POS in function_words:
            continue
        lemma = parsed.normal_form
        if lemma not in stopwords:
            result.append(lemma)
    return result if as_list else ' '.join(result)
    
def apply_parallel(texts, func, n_cores=2):
    """
    Apply ``func`` to chunks of ``texts`` in a pool of worker processes.

    ``texts`` is split into ``n_cores`` consecutive chunks with
    np.array_split; ``func`` receives one chunk and must return an iterable
    of results. The per-chunk results are concatenated in chunk order.

    Args:
        texts: array-like of inputs (e.g. a list or a pandas Series).
        func: callable taking a chunk and returning an iterable; it must be
            picklable so it can be sent to the worker processes.
        n_cores (int): number of worker processes and of chunks.

    Returns:
        list: the flattened results.
    """
    # Split before creating the pool so a bad input cannot leave workers behind.
    chunks = np.array_split(texts, n_cores)
    # The context manager releases the worker processes even when func
    # raises; previously the pool was left open in that case.
    with Pool(n_cores) as pool:
        mapped = pool.map(func, chunks)
    return [item for sub in mapped for item in sub]

def preprocessing_list(sentences):
    """
    Run preprocessing() on every sentence and return the results as a list,
    in input order.
    """
    return list(map(preprocessing, sentences))

In [None]:
%%time
# Preprocess the 'text' column of the English subset in N_CORES worker processes.
proc = apply_parallel(
    texts=df_data_en['text'],
    func=preprocessing_list,
    n_cores=N_CORES,
)

In [None]:
# Attach the preprocessed texts as a new 'proc' column; proc is expected to
# hold one item per row of df_data_en, in the same order as the 'text' column.
df_data_en.loc[:, 'proc'] = proc
print(df_data_en.shape)
display(df_data_en.head())

In [None]:
# Persist only the site name and its preprocessed text. index=False is the
# documented way to omit the row index (None only worked by being falsy).
df_data_en[['site_name', 'proc']].to_csv('data_white_en.csv', index=False)

In [None]:
# Read the exported file back as a sanity check of the export.
# NOTE(review): rows whose 'proc' is an empty string will presumably be read
# back as NaN by read_csv — confirm before relying on this file downstream.
df_tmp = pd.read_csv('data_white_en.csv')
print(df_tmp.shape)
display(df_tmp.head())