## Dataset Statistics

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import numpy as np
import json
import random
import string
import re
import nltk, razdel
from nltk.corpus import stopwords
from pymystem3 import Mystem

# Download nltk packages used in this example
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /home/aliak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aliak/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
df = pd.read_json('../dataset/enumerated_shuffled_rbc.json')
df = df.transpose()

In [70]:
# Customize list of stopwords as needed. Here, we append common
# punctuation and contraction artifacts.
with open('../aux/stopwords-ru.txt', 'r') as f:
    ru_stop_words_extensive = f.read().splitlines()
    
punctuations = list(string.punctuation) + ["—", "«", "»", "\n"]
stop_words = list(set(ru_stop_words_extensive + stopwords.words('russian'))) + punctuations



def get_normalized_sentences(doc):
    doc = re.sub(r"[^а-яА-Я]", " ", doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    sentences = get_article_sentences(doc)
    return sentences
    
    
def get_article_sentences(article_text):
    sentences = list()
    for sentence in razdel.sentenize(article_text):
        sentences.append(sentence.text)
    return sentences

def get_article_tokens(article_sentences):
    tokens = list()
    for sentence in article_sentences:
        for token in razdel.tokenize(sentence):
            if token.text not in stop_words:
                tokens.append(token.text.lower().strip())
    return tokens

def get_article_lemmas(article_sentences):
    mystem = Mystem()
    lemmas = list()
    for sentence in article_sentences:
        sentence_lemmas = mystem.lemmatize(sentence.lower())
        sentence_lemmas = [lemma for lemma in sentence_lemmas if lemma not in stop_words\
          and lemma != " "\
          and not lemma.isdigit()
          and lemma.strip() not in punctuations]
    lemmas+=sentence_lemmas
    return lemmas

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r"[^а-яА-Я]", " ", doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    sentences = get_article_sentences(doc)
    # tokenize document
    lemmas, unique_lemmas = get_article_lemmas(sentences)
    #filter stopwords out of document
    tokens, unique_tokens = get_article_tokens(sentences)
    # re-create document from filtered tokens
    doc = ' '.join(lemmas)
    return doc, 

normalize_corpus = np.vectorize(normalize_document)

In [31]:
corpus = normalize_corpus(list(df['article_text']))
len(corpus)

1606

In [75]:
df.head(3)

Unnamed: 0,headline,article_overview,article_text,category,tags
0,Песков заявил об отсутствии соглашений по укра...,Об отправке 11 вагонов с зерном для продажи со...,Об отправке 11 вагонов с зерном для продажи ...,politics,"[Украина, зерно, поставки, переговоры]"
1,В Киеве попросили уточнить слова Байдена об от...,Западные страны не стали вводить превентивные ...,Западные страны не стали вводить превентивны...,politics,"[Украина, Джо Байден, санкции, Владимир Зеленс..."
2,Владелец бондов «Роснано» назвал возможный деф...,"По мнению гендиректора «Арикапитала», если гос...","По мнению гендиректора «Арикапитала», если г...",economics,"[«Роснано», дефолт, облигации, рублевые облига..."


In [207]:
train, validate, test = \
              np.split(df.sample(frac=1, random_state=777), 
                       [int(.6*len(df)), int(.8*len(df))])

In [218]:
def stats_per_group(df):
    
    results = pd.DataFrame()
    words_for_all_headlines = []
    lemmas_for_all_headlines = []
    words_for_all_articles = []
    lemmas_for_all_articles = []
    
    for index, row in df.iterrows():
        
        headline = df['headline'][index]
        norm_headline = get_normalized_sentences(headline)
        headline_words = get_article_tokens(norm_headline)
        results.loc[index,"headline_word_count"] = len(headline_words)
        results.loc[index,"headline_unique_words_count"] = len(set(headline_words))
        results.loc[index,"headline_sentence_count"] = len(norm_headline)
        headline_lemmas = get_article_lemmas(norm_headline)
        results.loc[index,"headline_lemmas_count"] = len(headline_lemmas)
        results.loc[index,"headline_unique_lemmas_count"] = len(set(headline_lemmas))
        words_for_all_headlines += headline_words
        lemmas_for_all_headlines += headline_lemmas
        
        article = df['article_text'][index]
        norm_article = get_normalized_sentences(article)
        article_words = get_article_tokens(norm_article)
        results.loc[index,"article_word_count"] = len(article_words)
        results.loc[index,"article_unique_words_count"] = len(set(article_words))
        results.loc[index,"article_sentence_count"] = len(norm_article)
        article_lemmas = get_article_lemmas(norm_article)
        results.loc[index,"article_lemmas_count"] = len(article_lemmas)
        results.loc[index,"article_unique_lemmas_count"] = len(set(article_lemmas))
        words_for_all_articles += article_words
        lemmas_for_all_articles += article_lemmas

        
#     return results, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles
    return results

In [221]:
train_stats = stats_per_group(train)

In [227]:
train_stats.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,963.0,6.46729,1.235191,2.0,6.0,7.0,7.0,10.0
headline_unique_words_count,963.0,6.463136,1.233389,2.0,6.0,7.0,7.0,10.0
headline_sentence_count,963.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,963.0,6.988577,1.639201,1.0,6.0,7.0,8.0,14.0
headline_unique_lemmas_count,963.0,6.760125,1.399283,1.0,6.0,7.0,8.0,11.0
article_word_count,963.0,343.263759,279.043393,18.0,197.0,257.0,386.0,3189.0
article_unique_words_count,963.0,253.218069,184.508602,18.0,153.0,197.0,284.5,2119.0
article_sentence_count,963.0,2.660436,2.383232,1.0,1.0,2.0,3.0,28.0
article_lemmas_count,963.0,139.322949,123.23436,3.0,38.0,108.0,214.0,1289.0
article_unique_lemmas_count,963.0,90.161994,67.66833,3.0,35.0,81.0,129.0,668.0


In [222]:
validate.reset_index(drop=True, inplace=True)

In [223]:
v_stats = stats_per_group(validate)

In [226]:
v_stats.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,321.0,6.401869,1.174147,3.0,6.0,6.0,7.0,10.0
headline_unique_words_count,321.0,6.401869,1.174147,3.0,6.0,6.0,7.0,10.0
headline_sentence_count,321.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,321.0,6.816199,1.575353,3.0,6.0,7.0,8.0,11.0
headline_unique_lemmas_count,321.0,6.623053,1.373169,3.0,6.0,7.0,7.0,10.0
article_word_count,321.0,399.834891,619.928621,19.0,197.0,257.0,409.0,6322.0
article_unique_words_count,321.0,279.993769,325.769023,19.0,149.0,198.0,291.0,3081.0
article_sentence_count,321.0,3.274143,7.127823,1.0,1.0,2.0,3.0,73.0
article_lemmas_count,321.0,136.261682,117.75767,1.0,39.0,105.0,214.0,1115.0
article_unique_lemmas_count,321.0,87.971963,65.421727,1.0,36.0,78.0,126.0,627.0


In [225]:
t_stats = stats_per_group(test)

Unnamed: 0,19,1312,466,1524,379,118,169,442,988,570,...,814,985,1447,116,639,71,934,1595,815,103
headline_word_count,7.0,7.0,6.0,6.0,7.0,6.0,6.0,8.0,5.0,7.0,...,8.0,7.0,7.0,10.0,7.0,6.0,8.0,5.0,7.0,5.0
headline_unique_words_count,7.0,7.0,6.0,6.0,7.0,6.0,6.0,8.0,5.0,7.0,...,8.0,7.0,7.0,10.0,7.0,6.0,8.0,5.0,7.0,5.0
headline_sentence_count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,9.0,7.0,7.0,7.0,8.0,6.0,7.0,9.0,5.0,6.0,...,7.0,6.0,9.0,10.0,4.0,6.0,7.0,6.0,7.0,3.0
headline_unique_lemmas_count,9.0,7.0,7.0,7.0,8.0,6.0,6.0,9.0,5.0,6.0,...,7.0,6.0,8.0,10.0,4.0,6.0,7.0,6.0,7.0,3.0
article_word_count,256.0,713.0,245.0,190.0,164.0,171.0,114.0,200.0,277.0,307.0,...,343.0,24.0,155.0,188.0,184.0,316.0,265.0,266.0,758.0,776.0
article_unique_words_count,199.0,515.0,197.0,135.0,118.0,125.0,90.0,139.0,229.0,239.0,...,248.0,23.0,132.0,149.0,153.0,265.0,222.0,208.0,541.0,630.0
article_sentence_count,1.0,6.0,2.0,1.0,2.0,2.0,1.0,1.0,6.0,2.0,...,5.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,10.0,2.0
article_lemmas_count,262.0,49.0,75.0,223.0,7.0,40.0,143.0,222.0,67.0,72.0,...,73.0,28.0,173.0,205.0,197.0,316.0,17.0,89.0,13.0,352.0
article_unique_lemmas_count,165.0,40.0,61.0,119.0,7.0,36.0,78.0,112.0,57.0,64.0,...,59.0,22.0,108.0,123.0,135.0,201.0,17.0,74.0,13.0,259.0


In [228]:
t_stats.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,322.0,6.537267,1.199757,3.0,6.0,7.0,7.0,11.0
headline_unique_words_count,322.0,6.531056,1.202528,3.0,6.0,7.0,7.0,11.0
headline_sentence_count,322.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,322.0,7.024845,1.666792,3.0,6.0,7.0,8.0,12.0
headline_unique_lemmas_count,322.0,6.791925,1.452315,3.0,6.0,7.0,8.0,11.0
article_word_count,322.0,348.689441,271.468735,24.0,192.0,262.0,400.5,2674.0
article_unique_words_count,322.0,256.732919,179.811004,23.0,152.0,202.5,290.75,1719.0
article_sentence_count,322.0,2.913043,2.617676,1.0,1.0,2.0,4.0,15.0
article_lemmas_count,322.0,134.574534,125.754182,2.0,38.25,100.5,208.75,1125.0
article_unique_lemmas_count,322.0,87.108696,69.170321,2.0,33.25,77.5,124.75,593.0


In [63]:
def print_stats(dataset_name ,words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles):
    headline_lemmas = [i.strip() for i in lemmas_for_all_headlines]
    headline_words = [i.strip() for i in words_for_all_headlines]
    articles_lemmas = [i.strip() for i in lemmas_for_all_articles]
    articles_words = [i.strip() for i in words_for_all_articles]
    
    count_article_unique_words = len(set(articles_words))
    count_article_unique_lemmas = len(set(articles_lemmas))
    count_headline_unique_words = len(set(headline_words))
    count_headline_unique_lemmas = len(set(headline_lemmas))
    
    common_unique_lemmas = set.intersection(set(headline_lemmas), set(articles_lemmas))
    print('+' * 10, dataset_name, '+' * 10)
    print("="*5,'Summary Lemmas {}'.format(len(headline_lemmas)))
    print("="*5,'Summary Words {}'.format(len(headline_words)))
    print("="*5,'Article Lemmas {}'.format(len(articles_lemmas)))
    print("="*5,'Article Words {}'.format(len(articles_words)))
    print('.'*20)
    print("="*5,'Common Unique Lemmas {}'.format(len(common_unique_lemmas)))
    print("="*5,'Summary Unique Lemmas {}'.format(count_headline_unique_lemmas))
    print("="*5,'Summary Unique Words {}'.format(count_headline_unique_words))
    print("="*5,'Article Unique Lemmas {}'.format(count_article_unique_lemmas))
    print("="*5,'Article Unique Words {}'.format(count_article_unique_words))

In [35]:
train_stats, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles = stats_per_group(train)

In [64]:
print_stats('Train Dataset', words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles)

++++++++++ Train Dataset ++++++++++
===== Number of Summary Lemmas 6730
===== Number of Summary Words 6228
===== Number of Article Lemmas 134168
===== Number of Article Words 6228
===== Number of Common Unique Lemmas 2142
....................
===== Number of Summary Unique Lemmas 2345
===== Number of Summary Unique Words 3530
===== Number of Article Unique Lemmas 13140
===== Number of Article Unique Words 3530


In [36]:
validate_stats, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles = stats_per_group(validate)

In [65]:
print_stats('Validate Dataset', words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles)

++++++++++ Validate Dataset ++++++++++
===== Number of Summary Lemmas 2188
===== Number of Summary Words 2055
===== Number of Article Lemmas 43740
===== Number of Article Words 128347
===== Number of Common Unique Lemmas 1012
....................
===== Number of Summary Unique Lemmas 1114
===== Number of Summary Unique Words 1468
===== Number of Article Unique Lemmas 7112
===== Number of Article Unique Words 28137


In [37]:
test_stats, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles = stats_per_group(test)

In [66]:
print_stats('Test Dataset', words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles)

++++++++++ Test Dataset ++++++++++
===== Number of Summary Lemmas 2262
===== Number of Summary Words 2105
===== Number of Article Lemmas 43333
===== Number of Article Words 112278
===== Number of Common Unique Lemmas 1020
....................
===== Number of Summary Unique Lemmas 1139
===== Number of Summary Unique Words 1488
===== Number of Article Unique Lemmas 6983
===== Number of Article Unique Words 26504
