### Process R data files

In [33]:
%load_ext autoreload
%autoreload 2

import sys, os
import westac.notebooks.political_in_newspapers.corpus_data as corpus_data

root_folder = os.path.abspath(os.path.join(globals()['_dh'][-1], "../../.."))
corpus_folder = os.path.join(root_folder, "data/textblock_politisk")

sys.path = [ root_folder ] + sys.path

import ipywidgets
#from beakerx import *
#from beakerx.object import beakerx
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import westac.corpus.vectorized_corpus as vectorized_corpus

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def flatten(l):
    """Return a single flat list with the items of every sub-iterable of ``l``.

    Only one level of nesting is removed; the items keep their original order.
    """
    return [item for sublist in l for item in sublist]


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load DTM, document index and vocabulary
The document-term matrix (DTM), document index and vocabulary are loaded from CSV files exported from R (drm1).

In [None]:
import westac.notebooks.political_in_newspapers.corpus_data as corpus_data

# Load the corpus, document index and vocabulary frames from `corpus_folder`.
df_corpus, df_document, df_vocabulary = corpus_data.load(corpus_folder)

# Lookup from vocabulary index to token string.
id2token = df_vocabulary['token'].to_dict()

# Sum the `tf` values per document and attach the total to the document index
# as `term_count`. The inner join keeps only documents that have corpus rows.
df_tf = df_corpus.groupby(['document_id']).agg(term_count=('tf', 'sum'))
df_document = df_document.merge(df_tf, how='inner', left_index=True, right_index=True)


### Document size distribution

In [None]:

def plot_document_size_distribution(tick_step=100):
    """Plot the number of documents for each document size (total term count).

    Reads the notebook-level `df_document` frame, which must already carry a
    `term_count` column. Documents are counted per distinct `term_count` value
    and drawn as a bar chart with one bar for every integer size from 0 up to
    the largest size found; sizes without any document get a count of 0.

    Parameters
    ----------
    tick_step : int, default 100
        Only every `tick_step`-th x tick and label is kept on the axis.

    Returns
    -------
    pandas.DataFrame
        Indexed by `term_count`, with a single integer column `x` holding the
        number of documents of that size.
    """
    # Local import: on a fresh kernel this cell runs before any notebook-level
    # pandas import has been executed.
    import pandas as pd

    size_counts = df_document.groupby('term_count').size()

    all_sizes = pd.DataFrame(
        {'term_count': list(range(0, size_counts.index.max() + 1))}
    ).set_index('term_count')

    # `np.int` was removed in NumPy 1.24; it was only an alias of the builtin `int`.
    df = all_sizes.join(size_counts.rename('x'), how='left').fillna(0).astype(int)

    ax = df.plot.bar(figsize=(20, 10), rot=45)

    # Thin out the x axis: one tick per bar is unreadable for long ranges.
    ticks = ax.xaxis.get_ticklocs()
    ticklabels = [label.get_text() for label in ax.xaxis.get_ticklabels()]
    ax.xaxis.set_ticks(ticks[::tick_step])
    ax.xaxis.set_ticklabels(ticklabels[::tick_step])

    return df

df = plot_document_size_distribution()

#print(df.describe())



### Number of documents per year and publication

In [None]:
# Count the distinct `doc_id` values for each (year, publication) pair.
df = (
    df_document
    .groupby(['year', 'publication'])
    .agg(document_count=('doc_id', 'nunique'))
    .reset_index()
    .set_index(['year', 'publication'])
)

# Pivot the publications into columns and draw one bar-chart panel per column
# on a 2x2 grid.
df.unstack('publication').plot(
    kind='bar', subplots=True, figsize=(20,20), layout=(2,2), rot=45
);



In [None]:
# Sum the per-document `term_count` for each (year, publication) pair and
# pivot the publications into columns.
df = (
    df_document
    .groupby(['year', 'publication'])
    .agg(term_count=('term_count', 'sum'))
    .reset_index()
    .set_index(['year', 'publication'])
    .unstack('publication')
)

# One bar-chart panel per column on a 2x2 grid.
df.plot(kind='bar', subplots=True, figsize=(25,25), layout=(2,2), rot=45);


### Print data sizes

In [None]:

# Corpus table: largest id and number of distinct ids, for documents and tokens.
print('Corpus metrics, source "dtm1.rds", arrays drm$i, drm$j, drm$v')
corpus_document_ids = df_corpus.document_id
print('  {} max document ID'.format(corpus_document_ids.max()))
print('  {} unique document ID'.format(len(corpus_document_ids.unique())))
corpus_token_ids = df_corpus.token_id
print('  {} max token ID'.format(corpus_token_ids.max()))
print('  {} unique token ID'.format(len(corpus_token_ids.unique())))

# Document index: largest index value, distinct index values, distinct names.
print('Document metrics, source "dtm1.rds", arrays drm$dimnames[1]')
document_index = df_document.index
print('  {} max ID'.format(document_index.max()))
print('  {} unique ID'.format(len(document_index.unique())))
print('  {} unique names'.format(len(df_document.doc_id.unique())))

# Vocabulary: largest index value, distinct index values, distinct tokens.
print('Vocabulary metrics, source "dtm1.rds", arrays drm$dimnames[2]')
vocabulary_index = df_vocabulary.index
print('  {} max ID'.format(vocabulary_index.max()))
print('  {} unique ID'.format(len(vocabulary_index.unique())))
print('  {} unique token'.format(len(df_vocabulary.token.unique())))

#df_document.groupby('doc_id').filter(lambda x: len(x) > 1).head()


### Create a smaller test corpus




In [22]:
import pandas as pd
import os

# Built from `root_folder` (defined in the setup cell) in the same way as the
# full corpus folder there, instead of a user-specific absolute path.
# NOTE(review): assumes `root_folder` resolves to the repository root — confirm.
corpus_folder = os.path.join(root_folder, "data/textblock_politisk_test")

corpus_dataset_filename = os.path.join(corpus_folder, "corpus_dataset.zip")
document_dataset_filename = os.path.join(corpus_folder, "document_dataset.zip")
censored_corpus_filename = os.path.join(corpus_folder, "text1_utantext.zip")
meta_textblocks_filename = os.path.join(corpus_folder, "meta_textblocks.zip")

#vocabulary_dataset_filename = os.path.join(corpus_folder, "vocabulary_dataset.zip")
#sparse_matrix_filename = os.path.join(corpus_folder, "corpus_sparse_doc_term_matrx.npz")

# All four files are read with the same zipped-CSV options. `na_filter=False`
# turns off missing-value detection, so empty fields stay empty strings.
csv_options = dict(compression='zip', header=0, sep=',', quotechar='"', na_filter=False)

corpus = pd.read_csv(corpus_dataset_filename, **csv_options)
corpus.columns = ['document_id', 'token_id', 'token_count']
documents = pd.read_csv(document_dataset_filename, **csv_options)
censured_text = pd.read_csv(censored_corpus_filename, **csv_options)
meta_textblocks = pd.read_csv(meta_textblocks_filename, **csv_options)




In [27]:
import os
import csv

# Output folder for the single-day subset, built from `root_folder` (defined in
# the setup cell) instead of a user-specific absolute path.
# NOTE(review): assumes `root_folder` resolves to the repository root — confirm.
target_folder = os.path.join(root_folder, "data/textblock_politisk_19590616")

corpus_dataset_filename = os.path.join(target_folder, "corpus_dataset.zip")
document_dataset_filename = os.path.join(target_folder, "document_dataset.zip")
meta_textblocks_filename = os.path.join(target_folder, "meta_textblocks.zip")

# Creates missing parent folders as well and does nothing if the folder exists.
os.makedirs(target_folder, exist_ok=True)

# Keep only the rows dated 1959-06-16. `.copy()` yields an independent frame,
# so renaming its columns below cannot affect `censured_text`.
censured_text2 = censured_text[censured_text.date.isin(["1959-06-16"])].copy()
censured_text2.columns = ["document_id","doc_id","publication","date"]

# The file is written with an empty header for the first column.
# NOTE(review): presumably this mirrors the layout of the source export — confirm.
censured_text2.to_csv(
    os.path.join(target_folder, "text1_utantext.zip"),
    compression='zip',
    index=False,
    header=["","doc_id","publication","date"],
    sep=',',
    quotechar='"',
    quoting=csv.QUOTE_ALL,
)


In [30]:
#document2 = documents[documents.Docs.isin(censured_text2.doc_id)].head()
#documents[documents.Docs.isin(censured_text2.doc_id)].head()

# Corpus rows whose document id occurs in the single-day subset.
# NOTE(review): `.head()` keeps only the first five matching rows — confirm
# that this truncation is intended and not a leftover preview.
in_subset = corpus.document_id.isin(censured_text2.document_id)
corpus2 = corpus[in_subset].head()


In [67]:

# Point `corpus_folder` back at the full corpus, built from `root_folder`
# exactly as in the setup cell instead of a user-specific absolute path.
corpus_folder = os.path.join(root_folder, "data/textblock_politisk")

# Presumably builds the document-term matrix for documents dated 1959-06-16
# only — confirm against `load_dates_subset_as_dtm`.
# Note: this rebinds the notebook-level `documents` and `id2token` names.
dtm, documents, id2token = corpus_data.load_dates_subset_as_dtm(corpus_folder, ["1959-06-16"])


In [66]:
# Dimensions of the matrix returned by `load_dates_subset_as_dtm`.
dtm.shape

(24, 1570)

In [57]:
# Column indices whose column sum over the subset DTM is non-zero. The summed
# result is indexed as 2-D here, so `[1]` selects the column part of nonzero().
non_zero_token_ids = dtm.sum(axis=0).nonzero()[1].tolist()

# Renumber the surviving tokens 0..n-1, reusing the ids computed above instead
# of summing and scanning the matrix a second time.
id2token2 = { i: id2token[k] for i, k in enumerate(non_zero_token_ids) }

In [58]:
# Display the renumbered index -> token mapping built in the previous cell.
id2token2

{0: '-årige',
 1: '-mannagruppen',
 2: '-och',
 3: 'abbé',
 4: 'acceptera',
 5: 'accepterade',
 6: 'adenauer',
 7: 'adenauers',
 8: 'affärsbanker',
 9: 'affischerna',
 10: 'äga',
 11: 'ägde',
 12: 'agrarpolitik',
 13: 'agronom',
 14: 'air',
 15: 'åka',
 16: 'ake',
 17: 'aktiv',
 18: 'ålägger',
 19: 'albert',
 20: 'ålder',
 21: 'ali',
 22: 'alldeles',
 23: 'allmän',
 24: 'allmänhet',
 25: 'allmänna',
 26: 'allmänt',
 27: 'allra',
 28: 'alltför',
 29: 'allvarliga',
 30: 'allvarligt',
 31: 'alternativet',
 32: 'ambassadör',
 33: 'amerikansk',
 34: 'anbud',
 35: 'anda',
 36: 'andre',
 37: 'ändring',
 38: 'angår',
 39: 'angelägen',
 40: 'angelägna',
 41: 'angrepp',
 42: 'anklagas',
 43: 'änklingar',
 44: 'anledning',
 45: 'anmält',
 46: 'annars',
 47: 'anordnas',
 48: 'anseende',
 49: 'anser',
 50: 'anses',
 51: 'anslutning',
 52: 'anställning',
 53: 'ansvar',
 54: 'ansvaret',
 55: 'antagligen',
 56: 'antagna',
 57: 'antal',
 58: 'antogs',
 59: 'antyddes',
 60: 'använda',
 61: 'arbetare',
 