# Word frequency distribution trends

In [2]:
%load_ext autoreload
%autoreload 2
import os, sys
import westac.common.corpus_vectorizer as corpus_vectorizer
import westac.common.text_corpus as text_corpus
import westac.common.utility as utility
import numpy as np
import sklearn

# Analysis
Reference notebooks: [davidmcclure/lint-analysis (notebooks/2017)](https://github.com/davidmcclure/lint-analysis/tree/master/notebooks/2017)


## Goodness-of-fit to uniform distribution (chi-square)

See [scipy.stats.chisquare](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html): 
"*When just f_obs is given, it is assumed that the expected frequencies are uniform...*"


In [2]:
import os
import westac.common.corpus_vectorizer as corpus_vectorizer
import westac.common.text_corpus as text_corpus

from scipy import stats

def vectorize_corpus(filename):
    """Return a CorpusVectorizer for the corpus archive `filename`.

    The dump tag is the base name of `filename` up to its first dot. If the
    vectorizer reports an existing dump for that tag, the dump is loaded from
    ``./output``. Otherwise a corpus is created from `filename`, vectorized
    with ``fit_transform`` and dumped to ``./output`` under the same tag.

    Parameters
    ----------
    filename : str
        Path to the corpus archive.

    Returns
    -------
    corpus_vectorizer.CorpusVectorizer
        The vectorizer, either freshly fitted or loaded from a dump.

    Raises
    ------
    FileNotFoundError
        If `filename` is not an existing regular file.
    """
    # Raise explicitly rather than `assert`: assertions are stripped when
    # Python runs with -O, which would let a missing file slip through.
    if not os.path.isfile(filename):
        raise FileNotFoundError('error: no such file: {}'.format(filename))

    dump_tag = os.path.basename(filename).split('.')[0]

    vectorizer = corpus_vectorizer.CorpusVectorizer()

    # NOTE(review): dump_exists() is called without a folder while dump() and
    # load() use './output' -- confirm that its default folder is the same.
    if not vectorizer.dump_exists(dump_tag):

        # Regular expressions used to extract per-document metadata;
        # each has one capture group (a 4-digit year, a serial number).
        meta_extract = {
            'year': r"SOU (\d{4})\_.*",
            'serial_no': r"SOU \d{4}\_(\d+).*"
        }

        print('Creating new corpus...')
        corpus = text_corpus.create_corpus(filename, meta_extract)

        print('Creating document-term matrix...')
        # The returned matrix is not needed here; the call is made to fit the vectorizer.
        _ = vectorizer.fit_transform(corpus)

        print('Saving data matrix...')
        vectorizer.dump(tag=dump_tag, folder='./output')

    else:

        print('Loading data matrix...')

        vectorizer.load(dump_tag, folder='./output')

    print('Done!')
    return vectorizer

# Corpus archive to vectorize; vectorize_corpus() takes the path as its only argument.
filename = './data/SOU_1945-1989.zip'

vectorizer = vectorize_corpus(filename)

# Transpose of the per-year grouping, then its L1-normalised version.
# NOTE(review): assumes `normalize` follows sklearn's axis convention
# (axis=1 -> each row scaled independently) -- confirm in CorpusVectorizer.
term_year_matrix = vectorizer.group_by_year().T
term_year_matrix_n = vectorizer.normalize(term_year_matrix, norm='l1', axis=1)


Loading data matrix...
Done!


In [4]:
# Display the vectorizer's document index; the recorded output lists
# filename, year, serial_no and document_id for each document.
vectorizer.document_index


Unnamed: 0,filename,year,serial_no,document_id
0,SOU 1945_1 - Betänkande med förslag till utlän...,1945,1,0
1,SOU 1945_10 - Betänkande och förslag rörande e...,1945,10,1
2,SOU 1945_11 - Utredningar angående ekonomisk e...,1945,11,2
3,SOU 1945_14 - Socialpolitikens ekonomiska verk...,1945,14,3
4,SOU 1945_15 - Stadsplaneutredningen 1942..txt,1945,15,4
...,...,...,...,...
3124,SOU 1989_95 - Riksgäldskontoret - en finansför...,1989,95,3124
3125,SOU 1989_96 - Förenklad handläggning av ansökn...,1989,96,3125
3126,SOU 1989_97 - Vad händer med folkhögskolan_.txt,1989,97,3126
3127,SOU 1989_98 - Transplantation.txt,1989,98,3127


In [3]:

# Per-year grouping of the vectorized corpus, plus its L1-normalised version.
# NOTE(review): assumes axis=1 normalises each row -- confirm in CorpusVectorizer.normalize.
year_term_matrix = vectorizer.group_by_year()
year_term_matrix_n = vectorizer.normalize(year_term_matrix, norm='l1', axis=1)

# --- Disabled experiments, kept for reference ---
# Y, categories = vectorizer.collapse_by_category('year')
# (Y == year_term_matrix).all()

# Ynw = vectorizer.slice_tokens_by_count_threshold(Yn, 1)

# Yx2, imap = vectorizer.pick_by_top_variance(500)

# data = stats.chisquare(Ynw, f_exp=None, ddof=0, axis=0)


copy_x : boolean, optional
When pre-computing distances it is more numerically accurate to center the data first. If copy_x is True (default), then the original data is not modified, ensuring X is C-contiguous. If False, the original data is modified, and put back before the function returns, but small numerical differences may be introduced by subtracting and then adding the data mean, in this case it will also not ensure that data is C-contiguous which may cause a significant slowdown.

algorithm : "auto", "full" or "elkan", default="auto"
K-means algorithm to use. The classical EM-style algorithm is “full”. The “elkan” variation is more efficient by using the triangle inequality, but currently doesn’t support sparse data. “auto” chooses “elkan” for dense data and “full” for sparse data.


In [13]:
from sklearn.cluster import KMeans
import numpy as np

# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

print(year_term_matrix_n.shape)

# K-means with 6 clusters and a fixed seed so reruns give the same labels.
# `precompute_distances`, `n_jobs` and `algorithm='auto'` are deliberately not
# passed: they are deprecated and subsequently removed in scikit-learn, where
# passing them raises TypeError. The first and last were at their default
# values, and `n_jobs` only controlled parallelism.
# NOTE(review): the shape printed above is that of `year_term_matrix_n`, but the
# model is fitted on `term_year_matrix_n`. The recorded output shows 45 labels,
# which suggests the recorded run fitted a 45-row matrix -- confirm which matrix
# is intended before relying on the labels.
kmeans = KMeans(n_clusters=6, random_state=1337, n_init=10).fit(term_year_matrix_n)

print(kmeans.labels_)
print(kmeans.inertia_)
print(len(kmeans.cluster_centers_))

# kmeans.predict()

(45, 5248040)
[1 1 1 1 1 1 1 1 3 3 3 3 3 4 3 3 3 3 4 4 4 4 4 0 0 0 0 0 0 2 2 2 2 2 2 2 2
 5 5 5 5 5 5 5 5]
0.0007008333416259124
6


# Clustering (Ward and K-means)

See the tutorial [Hierarchical Clustering with Python and Scikit-Learn](https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/).


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

# Ward-linkage hierarchical clustering of the rows of Z, drawn as a dendrogram.
# NOTE(review): `Z` and `tokens_of_interest` are not defined in any cell shown
# here, so this cell fails on a fresh kernel unless they are created elsewhere.
linked = linkage(Z.todense(), method='ward')

labelList = tokens_of_interest

plt.figure(figsize=(10, 7))
dendrogram(
    linked,
    labels=labelList,
    orientation='top',
    distance_sort='descending',
    show_leaf_counts=True,
)
plt.show()


In [None]:
# df_Zy.sum().where(lambda x: x>= 10000).sort_values().dropna()

In [None]:
#Xn = normalize(X, axis=1, norm='l1')
#Y = collapse_to_year_matrix(X, df_documents)
#df = pd.DataFrame(Y, columns=list(vectorizer.get_feature_names()))
#df.to_excel('test.xlsx')

# Disabled scratch code: the constant-False guard means this body never runs.
# NOTE(review): `pd`, `X` and `documents` are not defined in the cells shown
# here; they would have to be supplied before this block could be enabled.
if False:
    
    # Dense frame with one column per feature name reported by the vectorizer.
    df = pd.DataFrame(X.toarray(), columns=list(vectorizer.get_feature_names()))
    # Provisional 'year' column derived from the row position, used as the index...
    df['year'] = df.index + 45
    df = df.set_index('year')
    # ...then a new 'year' column is filled from documents[x][0] for each index value x.
    df['year'] =  pd.Series(df.index).apply(lambda x: documents[x][0])
    %matplotlib inline
    # Plot the column for the token 'krig'.
    df[['krig']].plot() #.loc[df["000"]==49]
