In [1]:
from collections import defaultdict

from pandas import read_csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import gensim.downloader
from gensim.models import Word2Vec, KeyedVectors
from natasha import Doc, Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger
import mlxtend
from mlxtend.evaluate import paired_ttest_kfold_cv
from plotly.offline import iplot
import cufflinks as cf
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import datetime
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
%matplotlib inline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

[nltk_data] Downloading package stopwords to /Users/joy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
RND_STATE = 73

In [5]:
df = pd.read_csv("/data/4000_per_month_article.csv")

In [6]:
df.dropna(inplace=True)

In [7]:
cats = ['Деньги', 'Свое дело', 'Авто', 'Недвижимость']
df.drop(df[df['category'].isin(cats)].index, inplace=True)

In [3]:
def text_prep(text) -> str:
    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    stop_words = stopwords.words('russian')
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    for token in doc.tokens:
        token.lemmatize(morph_vocab)

    lemmas = [_.lemma for _ in doc.tokens]
    words = [lemma for lemma in lemmas if lemma.isalpha() and len(lemma) > 2]
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

In [12]:
df['tokenised_text'] = df.text.apply(text_prep)


/Users/joy/ml_sandbox/t_nlp/tests


In [18]:
dft = pd.read_csv('tests/4000_per_month_article_tokens.csv')
dft.dropna(inplace=True)
dft.columns

Index(['date', 'tags', 'category', 'text', 'tokenised_text'], dtype='object')

In [22]:
X_train, X_test, y_train, y_test = train_test_split(dft.tokenised_text.str.split(),
                                                    dft.category.values,
                                                    random_state=RND_STATE)

In [23]:
model = Word2Vec(sentences=X_train,
                 vector_size=200,
                 min_count=10,
                 window=2,
                 seed=RND_STATE)

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [27]:
model.wv.most_similar(positive=["родина"])

[('белорус', 0.6354150772094727),
 ('кобахидзе', 0.6200356483459473),
 ('грузинский', 0.6056091785430908),
 ('курец', 0.5962410569190979),
 ('мечта', 0.5945427417755127),
 ('сдпг', 0.5923218131065369),
 ('опзж', 0.5920796394348145),
 ('социалист', 0.5899027585983276),
 ('коммунист', 0.5895706415176392),
 ('провозгласить', 0.5887891054153442)]

In [4]:
class MeanEmbeddingVectorizer(object):
    """Get mean of vectors"""
    def __init__(self, model):
        self.word2vec = model.wv
        self.dim = model.vector_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec.get_vector(w)
                for w in words if w in self.word2vec] or
                [np.zeros(self.dim)], axis=0)
            for words in X])

In [30]:
import joblib
pipe = Pipeline([('w2v', MeanEmbeddingVectorizer(model)),
                 ('clf', RandomForestClassifier(random_state=RND_STATE))])

In [31]:
pipe.fit(X_train, y_train)
joblib.dump(pipe, 'models/4000_doc.pkl')

['4000_doc.pkl']

In [10]:
import joblib
# make sure W2VMeanVectorizer class is defined/importable before this line
pipe = joblib.load("/models/4000_doc.pkl")

In [33]:
print(classification_report(y_test, pipe.predict(X_test)))

                    precision    recall  f1-score   support

            Бизнес       0.65      0.60      0.62       405
          Общество       0.84      0.91      0.87      2971
          Политика       0.85      0.84      0.84      1941
Технологии и медиа       0.73      0.53      0.62       244
           Финансы       0.80      0.67      0.73       159
         Экономика       0.54      0.29      0.38       189

          accuracy                           0.82      5909
         macro avg       0.73      0.64      0.68      5909
      weighted avg       0.81      0.82      0.82      5909



In [36]:
# %pip install selenium
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
from IPython import display
from urllib.parse import urlencode
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from dateutil.relativedelta import relativedelta

class RBCparser:
    def __init__(self,
                 query='РБК',
                 project='rbcnews',
                 category='TopRbcRu_economics',
                 material='',
                 dateFrom='2025-01-01',
                 dateTo='2025-01-01',
                 page=0):

        self.query = query
        self.project = project
        self.category = category
        self.material = material
        self.dateFrom = dateFrom
        self.dateTo = dateTo
        self.page = page

        self.param_dict = {
            'query': query,
            'project': project,
            'category': category,
            'dateFrom': datetime.strptime(dateFrom, '%Y-%m-%d').strftime('%d.%m.%Y'),
            'dateTo': datetime.strptime(dateTo, '%Y-%m-%d').strftime('%d.%m.%Y'),
            'page': str(page),
            'material': material
        }
    @staticmethod
    def _get_url(parameters: dict) -> str:
        base_url = "https://www.rbc.ru/search/ajax/"
        return base_url + "?" + urlencode(parameters, encoding="utf-8")

    @staticmethod
    def fetch_article(session, url, ar_date, ar_category):
        r_page = session.get(url)
        soup = bs(r_page.text, "html.parser")

        container_text = soup.find('div', class_='article__text article__text_free')
        ar_text = " ".join(
            p.get_text(strip=True) for p in container_text.find_all('p')
        ) if container_text else ""

        container_tags = soup.find('div', class_='article__tags__container')
        ar_tags = ". ".join(
            tag.get_text(strip=True) for tag in container_tags.find_all('a', class_='article__tags__item')
        ) if container_tags else ""

        return {
            'date': ar_date,
            'tags': ar_tags,
            'category': ar_category,
            'text': ar_text
        }

    def _get_data(self, max_articles=2000):
        res = []
        session = rq.Session()
        page = 0

        with ThreadPoolExecutor(max_workers=10) as executor:
            while len(res) < max_articles:
                d = {**self.param_dict, "page": str(page)}
                try:
                    response = session.get(self._get_url(d))
                    response.raise_for_status()
                    items = response.json().get('items', [])
                except Exception as e:
                    print(f"[Ошибка на странице {page}]: {e}")
                    break

                if not items:
                    break

                futures = [
                    executor.submit(
                        self.fetch_article,
                        session,
                        item.get('fronturl'),
                        item.get('publish_date'),
                        item.get('category')
                    )
                    for item in items
                ]

                for f in futures:
                    res.append(f.result())

                page += 1
                if len(res) >= max_articles:
                    break

        return pd.DataFrame(res[:max_articles])

    def get_range_data(self, save_csv=False, csv_name="default_name.csv", max_articles_per_month=2000):

        start = datetime.strptime(self.dateFrom, "%Y-%m-%d")
        end = datetime.strptime(self.dateTo, "%Y-%m-%d")

        all_dfs = []

        while start <= end:
            month_start = start.replace(day=1)
            month_end = (month_start + relativedelta(months=1)) - relativedelta(days=1)

            if month_end > end:
                month_end = end

            print(f"Статьи за {month_start.strftime('%Y-%m')}")

            parser = RBCparser(
                query=self.query,
                project=self.project,
                category=self.category,
                material=self.material,
                dateFrom=month_start.strftime("%Y-%m-%d"),
                dateTo=month_end.strftime("%Y-%m-%d")
            )

            df_month = parser._get_data(max_articles=max_articles_per_month)
            if not df_month.empty:
                all_dfs.append(df_month)

            start = month_start + relativedelta(months=1)

        final_df = pd.concat(all_dfs, ignore_index=True)

        if save_csv:
            final_df.to_csv(csv_name, index=False, encoding="utf-8-sig")

        return final_df


In [43]:
p = RBCparser(dateFrom = '2020-01-01', dateTo = '2020-01-20')

In [44]:
test_df = p.get_range_data(max_articles_per_month=200)

Статьи за 2020-01


In [52]:
test_df.dropna(inplace=True)

(199, 4)

In [53]:
test_df['tokenise_text'] = test_df.text.apply(text_prep)

In [55]:
X = test_df['tokenise_text']
y = test_df['category']

In [56]:
print(classification_report(y, pipe.predict(X)))

                    precision    recall  f1-score   support

            Бизнес       0.00      0.00      0.00        13
          Общество       0.53      1.00      0.69       105
          Политика       0.00      0.00      0.00        58
Технологии и медиа       0.00      0.00      0.00         5
           Финансы       0.00      0.00      0.00         9
         Экономика       0.00      0.00      0.00         9

          accuracy                           0.53       199
         macro avg       0.09      0.17      0.12       199
      weighted avg       0.28      0.53      0.36       199




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

