In [1]:
import nltk, razdel
from pymystem3 import Mystem
import string, regex as re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
# Download nltk packages used in this example
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /home/aliak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aliak/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
df = pd.read_json("../dataset/enumerated_shuffled_rbc_dataset.json")
df = df.transpose()

In [4]:
df.category.value_counts()

finance      339
business     338
economics    261
politics     258
tech         233
society      177
Name: category, dtype: int64

In [5]:
def get_article_sentences(article_text):
    sentences = list()
    for sentence in razdel.sentenize(article_text):
        sentences.append(sentence.text)
    return sentences

In [6]:
def get_article_tokens(article_sentences):
    tokens = list()

    for sentence in article_sentences:
        for token in razdel.tokenize(sentence):
            if token.text not in stop_words:
                tokens.append(token.text.lower())
    return tokens

In [7]:
def get_article_lemmas(article_sentences):
    mystem = Mystem()
    lemmas = list()
    
    for sentence in article_sentences:
        sentence_lemmas = mystem.lemmatize(sentence.lower())
        sentence_lemmas = [lemma for lemma in sentence_lemmas if lemma not in stop_words\
          and lemma != " "\
          and lemma.strip() not in punctuations]
    lemmas+=sentence_lemmas
    return set(lemmas)

In [8]:
# Customize list of stopwords as needed. Here, we append common
# punctuation and contraction artifacts.
with open('../aux/stopwords-ru.txt', 'r') as f:
    ru_stop_words_extensive = f.read().splitlines()
    
punctuations = list(string.punctuation) + ["—", "«", "»", "\n"]
stop_words = list(set(ru_stop_words_extensive + stopwords.words('russian'))) + punctuations

In [9]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r"[^а-яА-Я]", " ", doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    sentences = get_article_sentences(doc)
    # tokenize document
    lemmas = get_article_lemmas(sentences)
    #filter stopwords out of document
    tokens = get_article_tokens(sentences)
    # re-create document from filtered tokens
    doc = ' '.join(lemmas)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
def calculate_cv_matrix(corpus, gram_count, min_df=0.5, max_df=1):
    ngrams = {'only_unigrams':(1,1), 'unigrams_bigrams':(1,2), 'only_bigrams':(2,2), 'bigrams_trigrams':(2,3), 'only_trigrams':(3,3)}
    cv = CountVectorizer(ngram_range=ngrams[gram_count], min_df=min_df, max_df=max_df, stop_words=stop_words)
    cv_matrix = cv.fit_transform(corpus)
    cv_matrix.shape
    return cv_matrix, cv

In [11]:
from sklearn.cluster import KMeans
def apply_kmeans(cv_matrix, cluster_count):
    NUM_CLUSTERS = cluster_count
    km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=100, random_state=32).fit(cv_matrix)
    return km

In [13]:
norm_corpus = normalize_corpus(list(df['article_text']))
len(norm_corpus)

1606

In [None]:
from collections import Counter
cv_matrix, cv = calculate_cv_matrix(norm_corpus)

In [15]:
km = apply_kmeans(cv_matrix, 6)
Counter(km.labels_)

Counter({2: 447, 4: 918, 1: 236, 3: 1, 0: 2, 5: 2})

In [18]:
def print_clusters(km, cv, df, NUM_CLUSTERS):
    df['kmeans_cluster'] = km.labels_
    article_clusters = (df[['category', 'kmeans_cluster',]]
                  .sort_values(by=['kmeans_cluster'], 
                               ascending=False)
                  .groupby('kmeans_cluster').head(20))
    article_clusters = article_clusters.copy(deep=True)
    
    feature_names = cv.get_feature_names()
    topn_features = 15
    ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
    counts = dict()

    # get key features for each cluster
    # get articles belonging to each cluster
    for cluster_num in range(NUM_CLUSTERS):
        key_features = [feature_names[index] 
                            for index in ordered_centroids[cluster_num, :topn_features]]
        articles = article_clusters[article_clusters['kmeans_cluster'] == cluster_num]['category'].values.tolist()
        for category in articles:
            counts[category] = counts.get(category, 0) + 1

        counts = dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))
        print('CLUSTER #'+str(cluster_num+1))
        print('Key Features:', key_features, "\n")
        print('Corresponding categories: \n', counts)
        print('-'*80)

In [19]:
print_clusters(km, cv, df, 6)

CLUSTER #1
Key Features: ['технический', 'тыс baring', 'соловьев', 'подросток', 'счет', 'пятачок рбк', 'пятачок', 'оставаться', 'взрослый', 'счет венчурный', 'взрослый топ', 'принцип сад', 'определять', 'сделка', 'тыс'] 

Corresponding categories: 
 {'tech': 2}
--------------------------------------------------------------------------------
CLUSTER #2
Key Features: ['www', 'adv', 'rbc', 'ru', 'составлять', 'rbc ru', 'рынок', 'данные', 'российский', 'начало', 'рост', 'компания', 'отмечать', 'цена', '2021'] 

Corresponding categories: 
 {'economics': 9, 'business': 4, 'society': 4, 'tech': 3, 'politics': 1, 'finance': 1}
--------------------------------------------------------------------------------
CLUSTER #3
Key Features: ['ru', 'rbc', 'adv', 'www', 'rbc ru', 'сообщать', 'российский', 'компания', 'заявлять', 'президент', 'становиться', 'рбк', 'данные', 'начало', 'число'] 

Corresponding categories: 
 {'economics': 10, 'business': 10, 'society': 8, 'politics': 7, 'tech': 6, 'finance': 

In [22]:
df.to_pickle('../dataset/pickles/clustered_articles.pkl')

As we can see it does pretty good job, even for man-kind attempt, distinguishing between the categories (finance, business, economics) isn't that straightforward.
It might be interesting to try clustering only articles from these categories and figure out how successful it performs.

In [23]:
money_related_df = df.loc[(df.category == "finance") | (df.category == "business") | (df.category == "economics")]

In [26]:
norm_corpus = normalize_corpus(list(money_related_df['article_overview']))
cv_matrix, cv = calculate_cv_matrix(norm_corpus, 'only_bigrams')
NUM_CLUSTERS = 3
km = apply_kmeans(cv_matrix, NUM_CLUSTERS)
Counter(km.labels_)
print_clusters(km, cv, money_related_df,NUM_CLUSTERS)

  'stop_words.' % sorted(inconsistent))


CLUSTER #1
Key Features: ['снижать цб', 'поставка компания', 'компания российский', 'цб кредит', 'рассказывать рбк', 'российский бизнес', 'риск рбк', 'банк эксперт', 'ставка руб', 'компания бизнес', 'продажа компания', 'цена достигать', 'санкция сообщать', 'втб банк', 'ес решение'] 

Corresponding categories: 
 {'economics': 9, 'business': 7, 'finance': 4}
--------------------------------------------------------------------------------
CLUSTER #2
Key Features: ['поставка российский', 'ес решение', 'поставка компания', 'банк эксперт', 'банка банк', 'втб банк', 'газпром компания', 'данные сша', 'инфляция рост', 'компания бизнес', 'компания российский', 'малый бизнес', 'нефть рынок', 'объем млрд', 'пандемия фон'] 

Corresponding categories: 
 {'economics': 17, 'business': 12, 'finance': 4}
--------------------------------------------------------------------------------
CLUSTER #3
Key Features: ['российский компания', 'цена достигать', 'пандемия фон', 'объем млрд', 'нефть рынок', 'малый би

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [30]:
norm_corpus = normalize_corpus(list(money_related_df['article_text']))
cv_matrix, cv = calculate_cv_matrix(norm_corpus, 'only_trigrams', 3)
NUM_CLUSTERS = 3
km = apply_kmeans(cv_matrix, NUM_CLUSTERS)
Counter(km.labels_)
print_clusters(km, cv, money_related_df,NUM_CLUSTERS)

  'stop_words.' % sorted(inconsistent))


CLUSTER #1
Key Features: ['принадлежать rbc ru', 'находиться принадлежать rbc', 'неустойка информация суд', 'указывать принадлежать rbc', 'санация принадлежать rbc', 'rbc ru объединять', 'втб акция банк', 'центробанк финансовый собеседник', 'начало банкротство мера', 'руб ограничение покидать', 'признание 2020 продавать', 'крупный adv возможность', 'российский компания имущество', 'финансовый собеседник означать', 'фон директор рбк'] 

Corresponding categories: 
 {'business': 9, 'finance': 5, 'economics': 2}
--------------------------------------------------------------------------------
CLUSTER #2
Key Features: ['rbc ru власть', 'ru власть санкция', 'находиться rbc ru', 'начинать оказываться заявлять', 'срок rbc ru', 'экономика российский компания', 'рамка пресс служба', 'торговаться rbc ru', 'российский компания 20', 'вернуть денежный российский', 'пакет руб апрель', 'повышать цена достигать', 'повышать прошлый цена', 'переставать являться насколько', 'отмечать упасть нефть'] 

Corre

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


#### We can notice how the results would get a better divergence as the window size of ngrams exapands, and the min-occurencing frequency increases.