### Process R data files

In [None]:
%load_ext autoreload
%autoreload 2

import sys, os

# Derive the project root from the working directory: keep everything before
# the first occurrence of the project name and re-append the name.
root_folder = os.path.join(os.getcwd().split('welfare_state_analytics')[0], 'welfare_state_analytics')

# Make the project root importable. Append only when missing instead of
# rebuilding sys.path through a set: a set has no defined order, so the old
# form shuffled the import search order on every run.
if root_folder not in sys.path:
    sys.path.append(root_folder)

corpus_folder = os.path.join(root_folder, "data/textblock_politisk")

import notebooks.corpus_data as corpus_data

import ipywidgets
#from beakerx import *
#from beakerx.object import beakerx
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import penelope.corpus.vectorized_corpus as vectorized_corpus

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def flatten(l):
    """Return a flat list holding the items of every sub-iterable in `l`, in order.

    Only one level of nesting is removed; the items themselves are not
    flattened any further.
    """
    return [item for sublist in l for item in sublist]


In [None]:
import pandas as pd
import notebooks.political_in_newspapers.corpus_data as corpus_data

def load_meta_text_blocks_as_data_frame(corpus_folder):
    """Read the zipped text-block metadata CSV into a data frame.

    The file name is taken from `corpus_data.meta_textblocks_filename` and is
    resolved relative to `corpus_folder`. The first row is used as the header,
    and NA detection is switched off (`na_filter=False`), so empty fields are
    not converted to NaN.
    """
    meta_path = os.path.join(corpus_folder, corpus_data.meta_textblocks_filename)
    read_options = dict(compression='zip', header=0, sep=',', quotechar='"', na_filter=False)
    return pd.read_csv(meta_path, **read_options)

def load_reconstructed_text_corpus(corpus_folder):
    """Return one reconstructed text per document, using a zipped CSV as cache.

    The cache file is `corpus_data.reconstructed_text_corpus_file` inside
    `corpus_folder`.

    * Cache missing: each document's text is rebuilt by repeating every token
      `tf` times and joining the result with spaces; the outcome of the
      group-wise `apply` is written to the cache (without a header row) and
      returned.
    * Cache present: the file is read back as a data frame with the single
      column `'test'`, indexed by `document_id`.

    Both branches are therefore keyed by `document_id`. The freshly computed
    result is whatever `groupby(...).apply(...)` yields, while the cached one
    is always a data frame.
    """
    filename = os.path.join(corpus_folder, corpus_data.reconstructed_text_corpus_file)
    if not os.path.isfile(filename):
        # NOTE(review): these two loaders are called unqualified and are not
        # defined in the visible part of this file -- confirm they are in
        # scope (they may be meant to come from `corpus_data`).
        df_corpus = load_corpus_dtm_as_data_frame(corpus_folder)
        df_vocabulary = load_vocabulary_file_as_data_frame(corpus_folder)
        id2token = df_vocabulary['token'].to_dict()
        # `[token] * tf` repeats each token tf times; flatten + join then
        # turns the per-row lists into one space-separated string.
        df_reconstructed_text_corpus = df_corpus.groupby('document_id').apply(
            lambda x: ' '.join(flatten(x['tf'] * (x['token_id'].apply(lambda y: [id2token[y]]))))
        )
        df_reconstructed_text_corpus.to_csv(filename, compression='zip', header=False, sep=',', quotechar='"')
    else:
        df_reconstructed_text_corpus = pd.read_csv(filename, compression='zip', header=None, sep=',', quotechar='"')
        # NOTE(review): 'test' looks like a typo for 'text'; kept as-is
        # because callers may already rely on the column name.
        df_reconstructed_text_corpus.columns = ['document_id', 'test']
        # set_index returns a new frame; the result was previously discarded,
        # which left the cached frame un-indexed.
        df_reconstructed_text_corpus = df_reconstructed_text_corpus.set_index('document_id')

    return df_reconstructed_text_corpus

# Load the text-block metadata and the reconstructed text corpus from the
# configured corpus folder.
df = load_meta_text_blocks_as_data_frame(corpus_folder)
rt = load_reconstructed_text_corpus(corpus_folder)

### Load DTM, document index and vocabulary
This data is loaded from CSV files exported from R (drm1)

In [None]:
import notebooks.political_in_newspapers.corpus_data as corpus_data
# Load the corpus frame, the document index and the vocabulary in one call.
df_corpus, df_document, df_vocabulary = corpus_data.load(corpus_folder)

# Lookup from the vocabulary frame's index to the token string.
id2token = df_vocabulary['token'].to_dict()

# Sum of `tf` per document, stored under the column name `term_count`.
df_tf = df_corpus.groupby(['document_id']).agg(term_count=('tf', 'sum'))

# Index-to-index inner merge: only documents present in both frames are kept.
df_document = df_document.merge(df_tf, how='inner', left_index=True, right_index=True)


### Document size distribution

In [None]:
import pandas as pd
import numpy as np

def plot_document_size_distribution():
    """Plot the number of documents for each document size and return the counts.

    Reads the module-level `df_document` frame and groups it by its
    `term_count` column.

    Returns:
        A data frame indexed by `term_count` (every integer from 0 up to the
        largest observed value) with a single integer column `'x'` holding
        the number of documents of that size.
    """
    size_counts = df_document.groupby('term_count').size()

    # Join onto the full 0..max range so that sizes no document has appear as
    # explicit zeros instead of being missing from the chart.
    all_sizes = pd.DataFrame({'term_count': list(range(0, size_counts.index.max() + 1))}).set_index('term_count')
    # The builtin `int` replaces `np.int`, which was only an alias for it and
    # has been removed from NumPy (1.24).
    df = all_sizes.join(size_counts.rename('x'), how='left').fillna(0).astype(int)

    ax = df.plot.bar(figsize=(20, 10), rot=45)

    # One tick per bar is unreadable; keep every 100th tick and its label.
    ticks = ax.xaxis.get_ticklocs()
    ticklabels = [label.get_text() for label in ax.xaxis.get_ticklabels()]
    ax.xaxis.set_ticks(ticks[::100])
    ax.xaxis.set_ticklabels(ticklabels[::100])

    return df


df = plot_document_size_distribution()

#print(df.describe())



### Number of documents per year and publication

In [None]:
# Number of distinct `doc_id` values for every (year, publication) pair.
df = (
    df_document
    .groupby(['year', 'publication'])
    .agg(document_count=('doc_id', 'nunique'))
    .reset_index()
    .set_index(['year', 'publication'])
)

# Pivot publications into columns and draw one bar chart per column on a
# 2x2 grid. The trailing semicolon keeps the notebook from echoing the
# value returned by plot().
(
    df
    .unstack('publication')
    .plot(kind='bar', subplots=True, figsize=(20,20), layout=(2,2), rot=45)
);



### Mean number of tokens per document, by year and publication

In [None]:
# Mean `term_count` for every (year, publication) pair, with publications
# pivoted into columns.
df = (
    df_document
    .groupby(['year', 'publication'])
    .agg(term_count=('term_count', 'mean'))
    .reset_index()
    .set_index(['year', 'publication'])
    .unstack('publication')
)

# Export the table to an Excel file in the current working directory.
df.to_excel('mean_tokens_per_year.xlsx')
#display(df)    
#df.plot(kind='bar', subplots=True, figsize=(25,25), layout=(2,2), rot=45);


### Print data sizes

In [None]:

# Each section prints a heading followed by one line per (template, value)
# pair; the templates are filled in with str.format.

print('Corpus metrics, source "dtm1.rds", arrays drm$i, drm$j, drm$v')
for template, value in [
    ('  {} max document ID', df_corpus.document_id.max()),
    ('  {} unique document ID', len(df_corpus.document_id.unique())),
    ('  {} max token ID', df_corpus.token_id.max()),
    ('  {} unique token ID', len(df_corpus.token_id.unique())),
]:
    print(template.format(value))

print('Document metrics, source "dtm1.rds", arrays drm$dimnames[1]')
for template, value in [
    ('  {} max ID', df_document.index.max()),
    ('  {} unique ID', len(df_document.index.unique())),
    ('  {} unique names', len(df_document.doc_id.unique())),
]:
    print(template.format(value))

print('Vocabulary metrics, source "dtm1.rds", arrays drm$dimnames[2]')
for template, value in [
    ('  {} max ID', df_vocabulary.index.max()),
    ('  {} unique ID', len(df_vocabulary.index.unique())),
    ('  {} unique token', len(df_vocabulary.token.unique())),
]:
    print(template.format(value))

#df_document.groupby('doc_id').filter(lambda x: len(x) > 1).head()
