In [1]:
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Select matplotlib's interactive "notebook" backend; figures created below are rendered through it.
%matplotlib notebook

In [3]:
# Make the parent directory importable (presumably where the local `plsa`
# package imported in the next cell lives — confirm).
# Guarded so that re-running this cell does not append a duplicate entry
# to sys.path every time.
if '..' not in sys.path:
    sys.path.append('..')

In [4]:
from plsa import Corpus, Pipeline, Visualize
from plsa.pipeline import DEFAULT_PIPELINE
from plsa.algorithms import PLSA, ConditionalPLSA

In [5]:
# Where the economic-news CSV lives, and the keyword arguments that are
# unpacked into pandas.read_csv when it is loaded.
data_dir = '../data'
file = 'Full-Economic-News-DFE-839861.csv'
options = dict(encoding='latin_1')

In [6]:
# Assemble the CSV path from the configured directory and file name, then
# load it with the configured read_csv keyword arguments.
source = data_dir + '/' + file
raw = pd.read_csv(source, **options)

# Show the first rows as the cell output.
raw.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,positivity,positivity:confidence,relevance,relevance:confidence,articleid,date,headline,positivity_gold,relevance_gold,text
0,842613455,False,finalized,3,12/5/15 17:48,3.0,0.64,yes,0.64,wsj_398217788,8/14/91,Yields on CDs Fell in the Latest Week,,,NEW YORK -- Yields on most certificates of dep...
1,842613456,False,finalized,3,12/5/15 16:54,,,no,1.0,wsj_399019502,8/21/07,The Morning Brief: White House Seeks to Limit ...,,,The Wall Street Journal Online</br></br>The Mo...
2,842613457,False,finalized,3,12/5/15 1:59,,,no,1.0,wsj_398284048,11/14/91,Banking Bill Negotiators Set Compromise --- Pl...,,,WASHINGTON -- In an effort to achieve banking ...
3,842613458,False,finalized,3,12/5/15 2:19,,0.0,no,0.675,wsj_397959018,6/16/86,Manager's Journal: Sniffing Out Drug Abusers I...,,,The statistics on the enormous costs of employ...
4,842613459,False,finalized,3,12/5/15 17:48,3.0,0.3257,yes,0.64,wsj_398838054,10/4/02,Currency Trading: Dollar Remains in Tight Rang...,,,NEW YORK -- Indecision marked the dollar's ton...


In [7]:
# Use the text of the first 1000 rows only.
docs = raw['text'].iloc[:1000].to_list()

# Build the preprocessing pipeline from the package defaults and feed the
# documents through it; the bare name on the last line displays the corpus.
pipeline = Pipeline(*DEFAULT_PIPELINE)
corpus = Corpus(docs, pipeline)
corpus

Corpus:
Number of documents: 1000
Number of words:     6431

In [8]:
# Number of topics: passed to both PLSA models and reused in the Gini normalisation further down.
n_topics = 5

In [None]:
# Fit the conditional PLSA variant on the corpus.
# NOTE(review): the positional `True` is presumably the tf-idf switch — confirm
# against the plsa API and consider passing it by keyword.
conditional_plsa = ConditionalPLSA(corpus, n_topics, True)
conditional_result = conditional_plsa.fit()
# Bare name: displays the model's repr as the cell output.
conditional_plsa

In [None]:
# Wrap the conditional-PLSA result for plotting.
# NOTE(review): a later cell rebinds `visualize` to the standard-PLSA result,
# so the figures drawn afterwards depend on which of the two cells ran last.
visualize = Visualize(conditional_result)

In [17]:
# Fit the standard PLSA model on the same corpus with the same number of topics.
# NOTE(review): the positional `True` is presumably the tf-idf switch — confirm.
# NOTE(review): no random seed is set anywhere in the visible notebook; if
# fit() initialises randomly, results will differ between runs — confirm.
plsa = PLSA(corpus, n_topics, True)
result = plsa.fit()
# Bare name: displays the fitted model's summary as the cell output.
plsa

PLSA:
====
Number of topics:     5
Number of documents:  1000
Number of words:      6431
Number of iterations: 70

In [18]:
# Plotting helper bound to the standard-PLSA result; the figure cells below draw through it.
visualize = Visualize(result)

In [19]:
# Draw the convergence plot of the fit onto a fresh axes.
# NOTE(review): `susi` is never read in the visible cells; the binding looks like a leftover.
fig, ax = plt.subplots()
susi = visualize.convergence(ax)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [20]:
# Draw the topics plot onto a fresh axes (presumably the relative weight of each topic — confirm).
# NOTE(review): `susi` is never read in the visible cells; the binding looks like a leftover.
fig, ax = plt.subplots()
susi = visualize.topics(ax)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [21]:
# Draw the words-in-topic plot for topic index 0.
# NOTE(review): this cell is repeated below for topics 1 and 3 with only the
# index changed; a small helper or loop would remove the duplication.
fig, ax = plt.subplots()
susi = visualize.words_in_topic(0, ax)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [22]:
# Draw the words-in-topic plot for topic index 1.
fig, ax = plt.subplots()
susi = visualize.words_in_topic(1, ax)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [23]:
# Draw the words-in-topic plot for topic index 3.
fig, ax = plt.subplots()
susi = visualize.words_in_topic(3, ax)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [24]:
# Draw the topics-in-document plot for document index 6.
fig, ax = plt.subplots()
susi = visualize.topics_in_doc(6, ax)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [None]:
# Per-document Gini coefficient of the topic distribution, computed as
# 1 - (area under the Lorenz curve) / 0.5 with the trapezoidal rule.
# assumes result.topic_given_doc is (n_topics, n_docs) with each column
# summing to 1 — TODO confirm
array = result.topic_given_doc

# Sort every document's topic probabilities in ascending order.
ordered = np.sort(array, axis=0)

# Lorenz curve: cumulative share after each topic, preceded by a row of zeros
# for the origin. The earlier draft stored the sorted probabilities here
# instead of their running sum, so the integrated curve was not a Lorenz curve
# and a perfectly uniform document scored 0.64 (with 5 topics) rather than 0.
lorenz = np.zeros((n_topics + 1, array.shape[1]))
lorenz[1:, :] = ordered.cumsum(axis=0)

# Each trapezoid has width 1/n_topics and height equal to the mean of its two
# neighbouring curve values. 0.5 is the area under the line of equality, so
# the result lies in [0, 1 - 1/n_topics].
area = ((lorenz[1:, :] + lorenz[:-1, :]) / 2).sum(axis=0) / n_topics
gini = 1 - area / 0.5

In [None]:
# Histogram of the per-document Gini coefficients on a fixed [0, 1] axis.
fig, ax = plt.subplots()
ax.hist(gini, bins=20)
ax.set_xlim(0, 1)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('Gini coefficient of topic distribution')
ax.set_ylabel('Number of documents')
# Finish on a call that returns None so the cell does not echo a stray
# repr such as "(0, 1)" underneath the figure.
fig.tight_layout()

In [25]:
# Per-document Gini coefficient of the topic distribution, scaled so that a
# uniform document gives 0 and a single-topic document gives 1.
# assumes result.topic_given_doc is (n_topics, n_docs) with each column
# summing to 1 — TODO confirm

# Sum of the partial cumulative shares for a perfectly uniform document:
# (1 + 2 + ... + (n_topics - 1)) / n_topics.
ideal = (n_topics - 1) / 2

array = result.topic_given_doc

# Ascending sort of every column, expressed through its argsort indices.
sorting_indices = np.argsort(array, axis=0)
ordered = np.take_along_axis(array, sorting_indices, axis=0)

# Running totals over all but the largest share of each document, summed per
# document and compared with the uniform reference.
partial_cumulative = np.cumsum(ordered[:-1], axis=0)
gini = 1 - partial_cumulative.sum(axis=0) / ideal

In [26]:
# Histogram of the per-document Gini coefficients on a fixed [0, 1] axis.
fig, ax = plt.subplots()
ax.hist(gini, bins=20)
ax.set_xlim(0, 1)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('Gini coefficient of topic distribution')
ax.set_ylabel('Number of documents')
# Finish on a call that returns None so the cell does not echo the
# "(0, 1)" repr of set_xlim underneath the figure.
fig.tight_layout()

<IPython.core.display.Javascript object>

(0, 1)

In [None]:
# Histogram of the per-document Gini coefficients with matplotlib's default
# binning and axis limits.
fig, ax = plt.subplots()
ax.hist(gini)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('Gini coefficient of topic distribution')
ax.set_ylabel('Number of documents')
# Finish on a call that returns None so the cell does not echo the tuple
# returned by hist() underneath the figure.
fig.tight_layout()

In [None]:
# Hand-worked check of the trapezoidal formula on an extreme case: the curve
# stays at zero for the first four of five equal-width steps and jumps to one
# on the last. (Presumably x is the cumulative share of topics and y the
# Lorenz curve — confirm.)
x = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
y = [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]

# One minus twice the trapezoid area: every neighbouring pair of points
# contributes width * (right height + left height).
1 - sum(
    (x_right - x_left) * (y_right + y_left)
    for x_left, x_right, y_left, y_right in zip(x, x[1:], y, y[1:])
)

In [None]:
import os


def extract_posts(lines):
    """Return the text of every <post>...</post> element found in `lines`.

    `lines` is any iterable of strings (for example an open text file).
    Lines carrying the opening or closing tag are treated as markers only and
    contribute no text. The stripped, non-empty lines in between are joined
    with single spaces so that words on adjacent lines are not glued together.
    A closing tag without a preceding opening tag is ignored instead of
    raising or re-appending a stale post.
    """
    posts = []
    fragments = None  # None while we are outside a <post> element
    for line in lines:
        if '<post>' in line:
            fragments = []
        elif '</post>' in line:
            if fragments is not None:
                posts.append(' '.join(fragments))
            fragments = None
        elif fragments is not None:
            stripped = line.strip()
            if stripped:
                fragments.append(stripped)
    return posts


blog_dir = '../data/blogs'
blogposts = []

# os.listdir returns entries in arbitrary order; sorting first makes the
# choice of the 100 files reproducible across machines and runs.
for filename in sorted(os.listdir(blog_dir))[:100]:
    print(filename)
    # The handle is deliberately not called `file`: that name holds the CSV
    # file name used by the data-loading cell, and rebinding it here would
    # break that cell on a re-run.
    with open(os.path.join(blog_dir, filename), encoding='latin_1') as blog_file:
        blogposts.extend(extract_posts(blog_file))

In [None]:
# Three toy vectors for checking the Gini arithmetic by hand.
x = np.full(100, 0.01)                        # 100 equal entries of 0.01
y = np.array([0.0, 0.0, 0.0, 0.0, 1.0])       # five entries, only the last non-zero
z = np.append(np.full(99, 0.04 / 99), 0.96)   # 99 small equal entries plus one of 0.96

In [None]:
# Sum of the running totals of `x` (last entry excluded), each scaled by 0.2.
# NOTE(review): 0.2 looks like a step width for 5 bins although x has 100
# entries; the factor cancels in the `1 - c/a` ratio evaluated further down.
a = (x[:-1].cumsum()*0.2).sum()
a

In [None]:
# Same quantity for `y`: sum of the running totals (last entry excluded), each scaled by 0.2.
b = (y[:-1].cumsum()*0.2).sum()
b

In [None]:
# Same quantity for `z`: sum of the running totals (last entry excluded), each scaled by 0.2.
c = (z[:-1].cumsum()*0.2).sum()
c

In [None]:
# One minus the ratio of the `z` sum to the `x` sum, i.e. the concentration of
# `z` measured against the equal-entries vector `x` as reference.
1 - c/a

In [None]:
# Scratch: reciprocal of 2 * 25 * (x.sum() / 5).
# NOTE(review): looks like the 1 / (2 * n**2 * mean) prefactor of the pairwise
# Gini formula with n = 5, although x has 100 entries — confirm.
1/(2*25*x.sum()/5)

In [None]:
# Gini coefficient of `x` from the mean absolute difference:
#   G = sum_i sum_j |x_i - x_j| / (2 * n**2 * mean(x))
# The earlier version summed over range(4) while normalising by 25 = 5**2, and
# `x` has 100 entries, so the summation and the normalisation disagreed.
# Deriving n from the data keeps the two consistent for any length.
# The result gets its own name so that the per-document `gini` array used by
# the histogram cells is not overwritten by a scalar.
values = np.asarray(x, dtype=float)
n_values = values.size
pairwise_gini = (
    np.abs(values[:, None] - values[None, :]).sum()
    / (2 * n_values**2 * values.mean())
)
pairwise_gini

In [None]:
# Sum of the partial cumulative shares of a perfectly uniform distribution:
# (1 + 2 + ... + (n_topics - 1)) / n_topics, which equals (n_topics - 1) / 2.
# The earlier version spelled the terms out for n_topics = 5 and assigned the
# value twice (the first assignment was immediately overwritten); this form
# follows n_topics and gives the same 2.0 for n_topics = 5.
perfect = sum(range(1, n_topics)) / n_topics
perfect

In [None]:
# NOTE(review): re-assigns n_topics to the value already set near the top of
# the notebook, after cells that use it; redundant on a top-to-bottom run.
n_topics = 5

In [None]:
# Scratch check: algebraically n_topics / 2 (2.5 for n_topics = 5).
(n_topics-1)*(n_topics)/2/(n_topics-1)

In [None]:
# Scratch check: the expression used for `ideal` in the normalised Gini cell (2.0 for n_topics = 5).
(n_topics-1)/2

In [None]:
# Scratch arithmetic: approximately 0.04.
8 * 0.005

In [None]:
# Scratch arithmetic: approximately 0.04.
4 * 0.01

In [None]:
# Scratch arithmetic: 0.48, half of the 0.96 entry used in the toy vector `z`.
0.96/2

In [None]:
# Scratch check: algebraically (n_topics - 1) / 2 (2.0 for n_topics = 5).
(n_topics-1)*(n_topics)/2/n_topics