## Dataset Statistics

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import numpy as np
import json
import random
import string
import re
import nltk, razdel
from nltk.corpus import stopwords
from pymystem3 import Mystem

# Download nltk packages used in this example
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /home/aliak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aliak/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Customize list of stopwords as needed. Here, we append common
# punctuation and contraction artifacts.
with open('../aux/stopwords-ru.txt', 'r') as f:
    ru_stop_words_extensive = f.read().splitlines()
    
punctuations = list(string.punctuation) + ["—", "«", "»", "\n"]
stop_words = list(set(ru_stop_words_extensive + stopwords.words('russian'))) + punctuations



def get_normalized_sentences(doc):
    doc = re.sub(r"[^а-яА-Я]", " ", doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    sentences = get_article_sentences(doc)
    return sentences
    
    
def get_article_sentences(article_text):
    sentences = list()
    for sentence in razdel.sentenize(article_text):
        sentences.append(sentence.text)
    return sentences

def get_article_tokens(article_sentences):
    tokens = list()
    for sentence in article_sentences:
        for token in razdel.tokenize(sentence):
            if token.text not in stop_words:
                tokens.append(token.text.lower().strip())
    return tokens

def get_article_lemmas(article_sentences):
    mystem = Mystem()
    lemmas = list()
    for sentence in article_sentences:
        sentence_lemmas = mystem.lemmatize(sentence.lower())
        sentence_lemmas = [lemma for lemma in sentence_lemmas if lemma not in stop_words\
          and lemma != " "\
          and not lemma.isdigit()
          and lemma.strip() not in punctuations]
    lemmas+=sentence_lemmas
    return lemmas

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r"[^а-яА-Я]", " ", doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    sentences = get_article_sentences(doc)
    # tokenize document
    lemmas, unique_lemmas = get_article_lemmas(sentences)
    #filter stopwords out of document
    tokens, unique_tokens = get_article_tokens(sentences)
    # re-create document from filtered tokens
    doc = ' '.join(lemmas)
    return doc, 

normalize_corpus = np.vectorize(normalize_document)

In [3]:
def stats_per_group(df, ref_column, text_column):
    
    results = pd.DataFrame()
    words_for_all_headlines = []
    lemmas_for_all_headlines = []
    words_for_all_articles = []
    lemmas_for_all_articles = []
    
    for index, row in df.iterrows():
        
        headline = df[ref_column][index]
        norm_headline = get_normalized_sentences(headline)
        headline_words = get_article_tokens(norm_headline)
        results.loc[index,"headline_word_count"] = len(headline_words)
        results.loc[index,"headline_unique_words_count"] = len(set(headline_words))
        results.loc[index,"headline_sentence_count"] = len(norm_headline)
        headline_lemmas = get_article_lemmas(norm_headline)
        results.loc[index,"headline_lemmas_count"] = len(headline_lemmas)
        results.loc[index,"headline_unique_lemmas_count"] = len(set(headline_lemmas))
        words_for_all_headlines += headline_words
        lemmas_for_all_headlines += headline_lemmas
        
        article = df[text_column][index]
        norm_article = get_normalized_sentences(article)
        article_words = get_article_tokens(norm_article)
        results.loc[index,"article_word_count"] = len(article_words)
        results.loc[index,"article_unique_words_count"] = len(set(article_words))
        results.loc[index,"article_sentence_count"] = len(norm_article)
        article_lemmas = get_article_lemmas(norm_article)
        results.loc[index,"article_lemmas_count"] = len(article_lemmas)
        results.loc[index,"article_unique_lemmas_count"] = len(set(article_lemmas))
        words_for_all_articles += article_words
        lemmas_for_all_articles += article_lemmas

        
    return results, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles
#     return results

In [41]:
df = pd.read_json('../dataset/rbc_2k.json')
df = df.transpose()

In [42]:
train, validate, test = \
              np.split(df.sample(frac=1, random_state=777), 
                       [int(.6*len(df)), int(.8*len(df))])

In [57]:
print(len(train.index))
print(len(validate.index))
print(len(test.index))

1209
403
403


In [4]:
def print_stats(dataset_name ,words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles):
    headline_lemmas = [i.strip() for i in lemmas_for_all_headlines]
    headline_words = [i.strip() for i in words_for_all_headlines]
    articles_lemmas = [i.strip() for i in lemmas_for_all_articles]
    articles_words = [i.strip() for i in words_for_all_articles]
    
    count_article_unique_words = len(set(articles_words))
    count_article_unique_lemmas = len(set(articles_lemmas))
    count_headline_unique_words = len(set(headline_words))
    count_headline_unique_lemmas = len(set(headline_lemmas))
    
    common_unique_lemmas = set.intersection(set(headline_lemmas), set(articles_lemmas))
    print('+' * 10, dataset_name, '+' * 10)
    print("="*5,'Summary Lemmas {}'.format(len(headline_lemmas)))
    print("="*5,'Summary Words {}'.format(len(headline_words)))
    print("="*5,'Article Lemmas {}'.format(len(articles_lemmas)))
    print("="*5,'Article Words {}'.format(len(articles_words)))
    print('.'*20)
    print("="*5,'Common Unique Lemmas {}'.format(len(common_unique_lemmas)))
    print("="*5,'Summary Unique Lemmas {}'.format(count_headline_unique_lemmas))
    print("="*5,'Summary Unique Words {}'.format(count_headline_unique_words))
    print("="*5,'Article Unique Lemmas {}'.format(count_article_unique_lemmas))
    print("="*5,'Article Unique Words {}'.format(count_article_unique_words))

In [46]:
train_stats, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles = stats_per_group(train, 'headline', 'article_text')

In [51]:
train_stats.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,1209.0,6.497105,1.218728,2.0,6.0,7.0,7.0,11.0
headline_unique_words_count,1209.0,6.494624,1.21872,2.0,6.0,7.0,7.0,11.0
headline_sentence_count,1209.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,1209.0,6.931348,1.629433,2.0,6.0,7.0,8.0,14.0
headline_unique_lemmas_count,1209.0,6.721257,1.413939,2.0,6.0,7.0,8.0,11.0
article_word_count,1209.0,347.717122,398.970819,18.0,193.0,250.0,372.0,6314.0
article_unique_words_count,1209.0,253.045492,228.354774,18.0,150.0,195.0,278.0,3077.0
article_sentence_count,1209.0,2.837883,4.409335,1.0,1.0,2.0,3.0,80.0
article_lemmas_count,1209.0,136.900744,120.617252,1.0,38.0,110.0,215.0,1283.0
article_unique_lemmas_count,1209.0,87.770058,66.639711,1.0,34.0,81.0,125.0,664.0


In [47]:
print_stats('Train Dataset', words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles)

++++++++++ Train Dataset ++++++++++
===== Summary Lemmas 8380
===== Summary Words 7855
===== Article Lemmas 165513
===== Article Words 420390
....................
===== Common Unique Lemmas 2315
===== Summary Unique Lemmas 2512
===== Summary Unique Words 3859
===== Article Unique Lemmas 13641
===== Article Unique Words 54135


In [48]:
validate_stats, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles = stats_per_group(validate, 'headline', 'article_text')

In [52]:
v_stats.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,403.0,6.635236,1.17115,3.0,6.0,7.0,7.0,10.0
headline_unique_words_count,403.0,6.630273,1.169588,3.0,6.0,7.0,7.0,10.0
headline_sentence_count,403.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,403.0,7.01737,1.641258,2.0,6.0,7.0,8.0,12.0
headline_unique_lemmas_count,403.0,6.808933,1.396769,2.0,6.0,7.0,8.0,10.0
article_word_count,403.0,312.397022,356.157969,48.0,185.0,237.0,325.5,5907.0
article_unique_words_count,403.0,228.791563,197.631451,41.0,143.5,180.0,242.5,2885.0
article_sentence_count,403.0,2.501241,4.233469,1.0,1.0,2.0,3.0,75.0
article_lemmas_count,403.0,148.57072,104.935691,4.0,52.5,163.0,222.0,634.0
article_unique_lemmas_count,403.0,92.950372,55.877821,4.0,42.5,95.0,133.0,294.0


In [49]:
print_stats('Validate Dataset', words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles)

++++++++++ Validate Dataset ++++++++++
===== Summary Lemmas 2828
===== Summary Words 2674
===== Article Lemmas 59874
===== Article Words 125896
....................
===== Common Unique Lemmas 1233
===== Summary Unique Lemmas 1335
===== Summary Unique Words 1805
===== Article Unique Lemmas 8214
===== Article Unique Words 28717


In [50]:
test_stats, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles = stats_per_group(test, 'headline', 'article_text')

In [53]:
t_stats.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,403.0,6.476427,1.14445,2.0,6.0,7.0,7.0,10.0
headline_unique_words_count,403.0,6.473945,1.144396,2.0,6.0,7.0,7.0,10.0
headline_sentence_count,403.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,403.0,7.024814,1.496058,1.0,6.0,7.0,8.0,11.0
headline_unique_lemmas_count,403.0,6.794045,1.265605,1.0,6.0,7.0,8.0,10.0
article_word_count,403.0,343.573201,388.08241,24.0,187.0,241.0,364.0,6298.0
article_unique_words_count,403.0,251.684864,222.179385,23.0,146.5,191.0,267.5,3077.0
article_sentence_count,403.0,2.677419,4.465474,1.0,1.0,2.0,3.0,81.0
article_lemmas_count,403.0,146.444169,130.96224,3.0,40.0,129.0,213.5,1125.0
article_unique_lemmas_count,403.0,92.282878,69.73949,3.0,36.0,83.0,129.0,588.0


In [54]:
print_stats('Test Dataset', words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles)

++++++++++ Test Dataset ++++++++++
===== Summary Lemmas 2831
===== Summary Words 2610
===== Article Lemmas 59017
===== Article Words 138460
....................
===== Common Unique Lemmas 1243
===== Summary Unique Lemmas 1353
===== Summary Unique Words 1794
===== Article Unique Lemmas 8164
===== Article Unique Words 30616


In [6]:
def get_results(dataset_file, ref_column, text_column):
    df = pd.read_json(dataset_file)
    df = df.transpose()
    train, validate, test = \
              np.split(df.sample(frac=1, random_state=777), 
                       [int(.6*len(df)), int(.8*len(df))])
    
    print(len(train.index))
    print(len(validate.index))
    print(len(test.index))
    
    train_stats, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles = stats_per_group(train, ref_column, text_column)
    print_stats('Train Dataset', words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles)
    train_stats.describe().transpose()
    
    validate_stats, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles = stats_per_group(validate, ref_column, text_column)
    print_stats('Validate Dataset', words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles)
    validate_stats.describe().transpose()
    
    test_stats, words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles = stats_per_group(test, ref_column, text_column)
    print_stats('Test Dataset', words_for_all_headlines, lemmas_for_all_headlines, words_for_all_articles, lemmas_for_all_articles)
    test_stats.describe().transpose()
    
    return train_stats, validate_stats, test_stats

In [7]:
%timeit
train_stats_ria, validate_stats_ria, test_stats_ria = get_results('../dataset/ria_2k.json', 'title', 'text')

1200
400
400
++++++++++ Train Dataset ++++++++++
===== Summary Lemmas 8816
===== Summary Words 7750
===== Article Lemmas 201366
===== Article Words 213224
....................
===== Common Unique Lemmas 3273
===== Summary Unique Lemmas 3447
===== Summary Unique Words 4769
===== Article Unique Lemmas 21506
===== Article Unique Words 51374
++++++++++ Validate Dataset ++++++++++
===== Summary Lemmas 2846
===== Summary Words 2530
===== Article Lemmas 67183
===== Article Words 73540
....................
===== Common Unique Lemmas 1459
===== Summary Unique Lemmas 1561
===== Summary Unique Words 1924
===== Article Unique Lemmas 11779
===== Article Unique Words 26185
++++++++++ Test Dataset ++++++++++
===== Summary Lemmas 2961
===== Summary Words 2576
===== Article Lemmas 66489
===== Article Words 73054
....................
===== Common Unique Lemmas 1476
===== Summary Unique Lemmas 1624
===== Summary Unique Words 1993
===== Article Unique Lemmas 12043
===== Article Unique Words 26375


In [9]:
train_stats_ria.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,1200.0,6.458333,1.452592,1.0,6.0,7.0,7.0,13.0
headline_unique_words_count,1200.0,6.4475,1.449941,1.0,6.0,7.0,7.0,13.0
headline_sentence_count,1200.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,1200.0,7.346667,1.93208,1.0,6.0,7.0,9.0,16.0
headline_unique_lemmas_count,1200.0,6.9875,1.64152,1.0,6.0,7.0,8.0,14.0
article_word_count,1200.0,177.686667,176.096679,0.0,84.0,133.5,206.25,2880.0
article_unique_words_count,1200.0,144.365,127.280486,0.0,73.0,111.0,168.0,1779.0
article_sentence_count,1200.0,1.166667,0.832346,1.0,1.0,1.0,1.0,14.0
article_lemmas_count,1200.0,167.805,122.015737,0.0,91.0,146.0,211.0,1411.0
article_unique_lemmas_count,1200.0,103.886667,71.381019,0.0,60.75,88.0,131.0,624.0


In [10]:
validate_stats_ria.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,400.0,6.325,1.435104,1.0,5.0,6.0,7.0,10.0
headline_unique_words_count,400.0,6.32,1.434484,1.0,5.0,6.0,7.0,10.0
headline_sentence_count,400.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,400.0,7.115,1.925108,0.0,6.0,7.0,8.0,13.0
headline_unique_lemmas_count,400.0,6.8,1.693271,0.0,6.0,7.0,8.0,11.0
article_word_count,400.0,183.85,170.475003,0.0,87.75,136.5,215.25,1161.0
article_unique_words_count,400.0,147.8875,125.614818,0.0,74.0,112.5,173.5,816.0
article_sentence_count,400.0,1.2325,1.039782,1.0,1.0,1.0,1.0,12.0
article_lemmas_count,400.0,167.9575,108.128651,0.0,95.5,151.0,213.25,696.0
article_unique_lemmas_count,400.0,103.96,64.673372,0.0,62.75,92.0,129.0,421.0


In [11]:
test_stats_ria.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,400.0,6.44,1.323169,2.0,6.0,6.0,7.0,11.0
headline_unique_words_count,400.0,6.4325,1.317111,2.0,6.0,6.0,7.0,11.0
headline_sentence_count,400.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,400.0,7.4025,1.962172,2.0,6.0,7.0,9.0,16.0
headline_unique_lemmas_count,400.0,7.005,1.611355,2.0,6.0,7.0,8.0,12.0
article_word_count,400.0,182.635,261.994101,0.0,75.0,122.5,206.25,4227.0
article_unique_words_count,400.0,144.975,167.440063,0.0,64.0,106.0,169.25,2414.0
article_sentence_count,400.0,1.19,0.990633,1.0,1.0,1.0,1.0,16.0
article_lemmas_count,400.0,166.2225,130.05057,0.0,84.0,143.0,209.5,1111.0
article_unique_lemmas_count,400.0,102.705,75.854583,0.0,56.75,84.0,127.0,600.0


In [67]:
train_stats_gazeta, validate_stats_gazeta, test_stats_gazeta = get_results('../dataset/gazeta_2k.json', 'title', 'text')

1200
400
400
++++++++++ Train Dataset ++++++++++
===== Summary Lemmas 7736
===== Summary Words 7553
===== Article Lemmas 111418
===== Article Words 466193
....................
===== Common Unique Lemmas 2624
===== Summary Unique Lemmas 3003
===== Summary Unique Words 4546
===== Article Unique Lemmas 15697
===== Article Unique Words 74616
++++++++++ Validate Dataset ++++++++++
===== Summary Lemmas 2529
===== Summary Words 2487
===== Article Lemmas 35162
===== Article Words 156203
....................
===== Common Unique Lemmas 1151
===== Summary Unique Lemmas 1404
===== Summary Unique Words 1848
===== Article Unique Lemmas 8464
===== Article Unique Words 40844
++++++++++ Test Dataset ++++++++++
===== Summary Lemmas 2544
===== Summary Words 2489
===== Article Lemmas 30924
===== Article Words 153920
....................
===== Common Unique Lemmas 1152
===== Summary Unique Lemmas 1434
===== Summary Unique Words 1872
===== Article Unique Lemmas 7910
===== Article Unique Words 39367


In [8]:
train_stats_gazeta.describe().transpose()

NameError: name 'train_stats' is not defined

In [73]:
validate_stats_gazeta.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,400.0,6.2175,1.358236,3.0,5.0,6.0,7.0,10.0
headline_unique_words_count,400.0,6.2175,1.358236,3.0,5.0,6.0,7.0,10.0
headline_sentence_count,400.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,400.0,6.3225,1.498033,2.0,5.0,6.0,7.0,11.0
headline_unique_lemmas_count,400.0,6.235,1.408807,2.0,5.0,6.0,7.0,11.0
article_word_count,400.0,390.5075,122.82114,214.0,314.0,358.0,423.0,840.0
article_unique_words_count,400.0,316.7375,91.345847,185.0,256.75,291.0,354.0,672.0
article_sentence_count,400.0,4.05,2.052567,1.0,3.0,4.0,5.0,12.0
article_lemmas_count,400.0,87.905,95.389827,4.0,28.75,55.0,98.0,550.0
article_unique_lemmas_count,400.0,69.2675,61.699416,4.0,27.0,49.0,85.0,313.0


In [75]:
test_stats_gazeta.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
headline_word_count,400.0,6.2225,1.271627,2.0,5.75,6.0,7.0,11.0
headline_unique_words_count,400.0,6.2225,1.271627,2.0,5.75,6.0,7.0,11.0
headline_sentence_count,400.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
headline_lemmas_count,400.0,6.36,1.417895,2.0,5.0,6.0,7.0,13.0
headline_unique_lemmas_count,400.0,6.2675,1.360357,2.0,5.0,6.0,7.0,13.0
article_word_count,400.0,384.8,117.826281,199.0,315.0,345.0,409.75,839.0
article_unique_words_count,400.0,310.5025,87.823901,178.0,255.0,282.0,339.25,658.0
article_sentence_count,400.0,4.075,2.021036,1.0,3.0,4.0,5.0,14.0
article_lemmas_count,400.0,77.31,83.138251,3.0,23.0,48.5,95.0,491.0
article_unique_lemmas_count,400.0,61.9025,55.550451,3.0,22.0,43.5,80.25,295.0
