In [1]:
import nltk, razdel
from pymystem3 import Mystem
import string, regex as re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
# Download nltk packages used in this example
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /home/aliak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aliak/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
df = pd.read_json("../dataset/rbc_2k_hashed_shuffled.json")
df = df.transpose()

In [7]:
df.category.value_counts()

politics     524
economics    404
finance      339
business     338
tech         233
society      177
Name: category, dtype: int64

In [8]:
def get_article_sentences(article_text):
    sentences = list()
    for sentence in razdel.sentenize(article_text):
        sentences.append(sentence.text)
    return sentences

In [9]:
def get_article_tokens(article_sentences):
    tokens = list()

    for sentence in article_sentences:
        for token in razdel.tokenize(sentence):
            if token.text not in stop_words:
                tokens.append(token.text.lower())
    return tokens

In [10]:
def get_article_lemmas(article_sentences):
    mystem = Mystem()
    lemmas = list()
    
    for sentence in article_sentences:
        sentence_lemmas = mystem.lemmatize(sentence.lower())
        sentence_lemmas = [lemma for lemma in sentence_lemmas if lemma not in stop_words\
          and lemma != " "\
          and lemma.strip() not in punctuations]
    lemmas+=sentence_lemmas
    return set(lemmas)

In [11]:
# Customize list of stopwords as needed. Here, we append common
# punctuation and contraction artifacts.
with open('../aux/stopwords-ru.txt', 'r') as f:
    ru_stop_words_extensive = f.read().splitlines()
    
punctuations = list(string.punctuation) + ["—", "«", "»", "\n"]
stop_words = list(set(ru_stop_words_extensive + stopwords.words('russian'))) + punctuations

In [12]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r"[^а-яА-Я]", " ", doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    sentences = get_article_sentences(doc)
    # tokenize document
    lemmas = get_article_lemmas(sentences)
    #filter stopwords out of document
    tokens = get_article_tokens(sentences)
    # re-create document from filtered tokens
    doc = ' '.join(lemmas)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def calculate_cv_matrix(corpus, gram_count, min_df=10, max_df=0.009):
    ngrams = {'only_unigrams':(1,1), 'unigrams_bigrams':(1,2), 'only_bigrams':(2,2), 'bigrams_trigrams':(2,3), 'only_trigrams':(3,3)}
    cv = TfidfVectorizer(ngram_range=ngrams[gram_count], min_df=min_df, max_df=max_df, stop_words=stop_words)
    cv_matrix = cv.fit_transform(corpus)
    cv_matrix.shape
    return cv_matrix, cv

In [28]:
from sklearn.cluster import KMeans
def apply_kmeans(cv_matrix, cluster_count):
    NUM_CLUSTERS = cluster_count
    km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=100, random_state=32).fit(cv_matrix)
    return km

In [15]:
norm_corpus = normalize_corpus(list(df['article_text']))
len(norm_corpus)

2015

In [29]:
from collections import Counter
cv_matrix, cv = calculate_cv_matrix(norm_corpus, 'only_unigrams', 10, 0.009)

  'stop_words.' % sorted(inconsistent))


In [30]:
km = apply_kmeans(cv_matrix, 6)
Counter(km.labels_)

Counter({2: 1310, 5: 65, 4: 76, 0: 404, 3: 135, 1: 25})

In [31]:
def print_clusters(km, cv, df, NUM_CLUSTERS):
    df['kmeans_cluster'] = km.labels_
    article_clusters = (df[['category', 'kmeans_cluster',]]
                  .sort_values(by=['kmeans_cluster'], 
                               ascending=False)
                  .groupby('kmeans_cluster').head(20))
    article_clusters = article_clusters.copy(deep=True)
    
    feature_names = cv.get_feature_names()
    topn_features = 15
    ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
    counts = dict()

    # get key features for each cluster
    # get articles belonging to each cluster
    for cluster_num in range(NUM_CLUSTERS):
        key_features = [feature_names[index] 
                            for index in ordered_centroids[cluster_num, :topn_features]]
        articles = article_clusters[article_clusters['kmeans_cluster'] == cluster_num]['category'].values.tolist()
        for category in articles:
            counts[category] = counts.get(category, 0) + 1

        counts = dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))
        print('CLUSTER #'+str(cluster_num+1))
        print('Key Features:', key_features, "\n")
        print('Corresponding categories: \n', counts)
        print('-'*80)

In [32]:
print_clusters(km, cv, df, 6)

CLUSTER #1
Key Features: ['роскосмос', 'согласование', 'односторонний', 'совкомбанк', 'ценовый', 'беспрецедентный', 'пошлина', 'африка', 'ран', 'держать', 'констатировать', 'послабление', 'ученый', 'абрамченко', 'фтс'] 

Corresponding categories: 
 {'economics': 7, 'finance': 4, 'business': 3, 'tech': 3, 'politics': 3}
--------------------------------------------------------------------------------
CLUSTER #2
Key Features: ['вкусно', 'франшиза', 'говор', 'весело', 'пушкинский', 'касса', 'логотип', 'юрлицо', 'гребенщиков', 'сооснователь', 'зеленый', 'антимонопольный', 'обратно', 'отель', 'фас'] 

Corresponding categories: 
 {'business': 17, 'economics': 7, 'finance': 5, 'tech': 4, 'society': 4, 'politics': 3}
--------------------------------------------------------------------------------
CLUSTER #3
Key Features: ['траст', 'роснано', 'безусловно', 'прокуратура', 'маск', 'ежегодный', 'потребкредитование', 'ускорять', 'райффайзенбанк', 'прирост', 'понести', 'обходиться', 'евробонд', 'взам

In [22]:
df.to_pickle('../dataset/pickles/clustered_articles.pkl')

As we can see it does pretty good job, even for man-kind attempt, distinguishing between the categories (finance, business, economics) isn't that straightforward.
It might be interesting to try clustering only articles from these categories and figure out how successful it performs.

In [33]:
money_related_df = df.loc[(df.category == "finance") | (df.category == "business") | (df.category == "economics")]

In [None]:
norm_corpus = normalize_corpus(list(money_related_df['article_overview']))
cv_matrix, cv = calculate_cv_matrix(norm_corpus, 'only_bigrams', min_df=20, max_df=0.009)
NUM_CLUSTERS = 3
km = apply_kmeans(cv_matrix, NUM_CLUSTERS)
Counter(km.labels_)
print_clusters(km, cv, money_related_df,NUM_CLUSTERS)

In [None]:
norm_corpus = normalize_corpus(list(money_related_df['article_text']))
cv_matrix, cv = calculate_cv_matrix(norm_corpus, 'only_trigrams', min_df=10, max_df=0.009)
NUM_CLUSTERS = 3
km = apply_kmeans(cv_matrix, NUM_CLUSTERS)
Counter(km.labels_)
print_clusters(km, cv, money_related_df,NUM_CLUSTERS)

#### We can notice how the results would get a better divergence as the window size of ngrams exapands, and the min-occurencing frequency increases.

In [None]:
social_related_df = df.loc[(df.category != "finance") | (df.category != "business") | (df.category != "economics")]

In [None]:
norm_corpus = normalize_corpus(list(social_related_df['article_overview']))
cv_matrix, cv = calculate_cv_matrix(norm_corpus, 'only_bigrams')
NUM_CLUSTERS = 3
km = apply_kmeans(cv_matrix, NUM_CLUSTERS)
Counter(km.labels_)
print_clusters(km, cv, money_related_df,NUM_CLUSTERS)

In [None]:
norm_corpus = normalize_corpus(list(social_related_df['article_text']))
cv_matrix, cv = calculate_cv_matrix(norm_corpus, 'only_trigrams', 3)
NUM_CLUSTERS = 3
km = apply_kmeans(cv_matrix, NUM_CLUSTERS)
Counter(km.labels_)
print_clusters(km, cv, money_related_df,NUM_CLUSTERS)