In [1]:
# until ttk is installed, add parent dir to path
import sys
sys.path.insert(0, '..')

In [2]:
# typical imports
import pandas as pd
import numpy as np
import re

import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

import matplotlib.pyplot as plt

import spacy
import ete3
import seaborn

from ttk.corpus import load_headline_corpus

In [3]:
%%time
# Load the headline corpus through ttk's loader. verbose=True is passed so the
# loader reports progress (the recorded output shows the source path).
corpus = load_headline_corpus(verbose=True)

# Report how many headlines (corpus sentences) were loaded.
print ('Headlines:', len(corpus.sents()))

Loading corpus from: S:\git\tacticsiege\tactictoolkit\ttk\..\env\corpus\dated\2017_08_22\corpus
Corpus loaded.
Headlines: 190447
Wall time: 2.7 s


In [4]:
from ttk.preprocessing import Seq2WordVecTransformer, SeqPaddingTransformer
from ttk.corpus import CategorizedDatedCorpusReporter

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Restrict the corpus to the two categories being compared and flatten it
# into a DataFrame.
reporter = CategorizedDatedCorpusReporter()
corpus_df = reporter.to_data_frame(corpus, categories=['Washington Post', 'CNN'])

# Whitespace-tokenize every headline: X becomes a list of lists of str.
X = [headline.split() for headline in corpus_df['content'].values]

# Sanity-check the nesting and element types of X.
print ('Fitting on len(X):', len(X))
print ('X is', type(X))
print ('Elements of X are', type(X[0]))
print ('X[0] =', X[0])
print ('Elements of the Elements of X are', type(X[0][0]))
print ('Elements:', X[0][0], X[0][1], X[0][2])

# Encode the category names as integer class labels.
label_enc = LabelEncoder()
y = label_enc.fit_transform(corpus_df['category'].values)

Using TensorFlow backend.


Fitting on len(X): 34613
X is <class 'list'>
Elements of X are <class 'list'>
X[0] = ['"', 'Uninsured', 'ranks', 'still', 'to', 'grow', 'by', 'tens', 'of', 'millions', 'under', 'latest', 'House', 'health', '-', 'care', 'bill', ',', 'CBO', 'says', '"']
Elements of the Elements of X are <class 'str'>
Elements: " Uninsured ranks


In [5]:
# Eyeball the first five tokenized headlines.
for i in range(5):
    print (i, X[i])
    
# Total number of tokenized headlines.
print ('len(X):', len(X))

0 ['"', 'Uninsured', 'ranks', 'still', 'to', 'grow', 'by', 'tens', 'of', 'millions', 'under', 'latest', 'House', 'health', '-', 'care', 'bill', ',', 'CBO', 'says', '"']
1 ['"', 'Republican', 'candidate', 'in', 'Montana', 'race', 'allegedly', "'", 'body', '-', 'slammed', "'", 'reporter', ',', 'prompting', 'police', 'investigation', '"']
2 ['"', 'Prepare', 'for', 'the', 'weirdest', 'Election', 'Day', 'in', 'history', ',', 'after', 'a', 'candidate', 'allegedly', 'body', '-', 'slams', 'a', 'reporter', '"']
3 ['Audio', ':', 'Guardian', 'reporter', 'allegedly', 'body', '-', 'slammed', 'by', 'Greg', 'Gianforte']
4 ['"', 'The', 'new', 'GOP', 'health', '-', 'care', 'bill', 'isn', "'", 't', 'any', 'better', ',', 'in', 'four', 'charts', '"']
len(X): 34613


In [6]:
# Fit Seq2WordVecTransformer on the tokenized headlines and transform them in
# one step. verbose='debug' is passed to request detailed logging (the
# recorded output shows vocabulary-building progress messages).
vect = Seq2WordVecTransformer()
X_vect = vect.fit_transform(X, verbose='debug')
print ('len(X_vect):', len(X_vect))
# Optional peek at the first few transformed sequences (left disabled).
#for i in range(5):
#    print (i, X_vect[i])

Fitting on len(X): 34613
X is <class 'list'>
Elements of X are <class 'list'>
X[0] = ['"', 'Uninsured', 'ranks', 'still', 'to', 'grow', 'by', 'tens', 'of', 'millions', 'under', 'latest', 'House', 'health', '-', 'care', 'bill', ',', 'CBO', 'says', '"']
Elements of the Elements of X are <class 'str'>
Elements: " Uninsured ranks


2017-09-16 22:36:51,270 : INFO : collecting all words and their counts
2017-09-16 22:36:51,271 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-16 22:36:51,298 : INFO : PROGRESS: at sentence #10000, processed 110351 words, keeping 12012 word types
2017-09-16 22:36:51,320 : INFO : PROGRESS: at sentence #20000, processed 222094 words, keeping 16758 word types
2017-09-16 22:36:51,339 : INFO : PROGRESS: at sentence #30000, processed 333851 words, keeping 20202 word types
2017-09-16 22:36:51,349 : INFO : collected 21533 word types from a corpus of 385756 raw words and 34613 sentences
2017-09-16 22:36:51,351 : INFO : Loading a fresh vocabulary
2017-09-16 22:36:51,385 : INFO : min_count=0 retains 21533 unique words (100% of original 21533, drops 0)
2017-09-16 22:36:51,386 : INFO : min_count=0 leaves 385756 word corpus (100% of original 385756, drops 0)
2017-09-16 22:36:51,433 : INFO : deleting the raw counts dictionary of 21533 items
2017-09-16 22:36:51,435 :

len(X_vect): 34613


In [7]:
from ttk.preprocessing import Seq2IndexTransformer

# Encode each token sequence with Seq2IndexTransformer, then pass the result
# through SeqPaddingTransformer. The recorded X_train shape is (24229, 32),
# i.e. the padded result is a 2-D array.
indexer = Seq2IndexTransformer(add_delimiters=False)
pad = SeqPaddingTransformer()

# Word-vector features are not used in this experiment (left disabled).
#X_vect = vect.fit_transform(X)
X_indexed = indexer.fit_transform(X)
X_pad = pad.fit_transform(X_indexed)

# split: hold out 30% of the examples for testing; random_state=0 makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.3, random_state=0)
print ('%i Training examples, %i Testing examples' % (len(X_train), len(X_test)))

24229 Training examples, 10384 Testing examples


In [8]:
# Shape of the padded training array (rows are examples).
print (X_train.shape)

(24229, 32)


In [9]:
# Inspect a single element of the first padded training sequence.
seq = X_train[0]
t0 = seq[0]
print ('t0 type:', type(t0))

# True only when the element's type is exactly numpy.ndarray (as opposed to a
# numpy scalar), followed by the element's shape.
print (type(t0) is np.ndarray)
print ('dim', t0.shape)

t0 type: <class 'numpy.float64'>
False
dim ()


In [10]:
from ttk.sandbox.udemy import SimpleRNNClassifier

N, t = X_train.shape

# Per-time-step targets: every position of a sequence carries that sequence's
# class label. The (N, 1) label column is broadcast across all t steps.
Y_t = np.zeros(X_train.shape, dtype=np.int32)
Y_t[:] = np.ravel(y_train)[:N].reshape(N, 1)

# Add a trailing singleton feature axis: (N, t) -> (N, t, 1), as float32.
X_train_shaped = X_train.reshape(N, t, 1).astype(np.float32)

Using cuDNN version 5110 on context None
Mapped name None to device cuda0: GeForce GTX 1080 Ti (0000:01:00.0)


In [14]:
def words_and_class_labels(X, y):
    """Build next-token targets whose final time step carries the class label.

    Parameters
    ----------
    X : numpy.ndarray of shape (N, t)
        One sequence per row (in this notebook: padded token indices).
    y : array-like with at least N entries
        Class label for each row of ``X``; entries beyond the first N are
        ignored.

    Returns
    -------
    X_shaped : numpy.ndarray of shape (N, t, 1), dtype float32
        ``X`` with a trailing singleton feature axis.
    Y_t : numpy.ndarray of shape (N, t), dtype int32
        ``Y_t[n, i] == X[n, i + 1]`` for every step except the last, and
        ``Y_t[n, t - 1] == y[n]``. Values are truncated to integers.

    Raises
    ------
    IndexError
        If ``y`` supplies fewer than N labels.
    """
    N, t = X.shape
    Y_t = np.zeros(X.shape, dtype=np.int32)
    if t > 0:
        labels = np.ravel(y)[:N]
        if labels.shape[0] < N:
            raise IndexError('y has %d labels but X has %d rows' % (labels.shape[0], N))
        # Shift each sequence left by one step so position i predicts token i+1 ...
        Y_t[:, :-1] = X[:, 1:]
        # ... and let the last position predict the class label instead.
        Y_t[:, -1] = labels
    X_shaped = X.reshape(N, t, 1).astype(np.float32)

    return X_shaped, Y_t
    
    
def only_class_labels(X, y):
    """Build per-time-step targets that repeat each sequence's class label.

    Parameters
    ----------
    X : numpy.ndarray of shape (N, t)
        One sequence per row (in this notebook: padded token indices).
    y : array-like with at least N entries
        Class label for each row of ``X``; entries beyond the first N are
        ignored.

    Returns
    -------
    X_shaped : numpy.ndarray of shape (N, t, 1), dtype float32
        ``X`` with a trailing singleton feature axis.
    Y_t : numpy.ndarray of shape (N, t), dtype int32
        ``Y_t[n, i] == y[n]`` for every time step ``i``.

    Raises
    ------
    IndexError
        If ``y`` supplies fewer than N labels.
    """
    N, t = X.shape
    labels = np.ravel(y)[:N]
    if labels.shape[0] < N:
        raise IndexError('y has %d labels but X has %d rows' % (labels.shape[0], N))
    Y_t = np.zeros(X.shape, dtype=np.int32)
    # Broadcast the (N, 1) label column across all t time steps.
    Y_t[:] = labels.reshape(N, 1)
    X_shaped = X.reshape(N, t, 1).astype(np.float32)

    return X_shaped, Y_t

In [15]:
# Build the training inputs and per-step targets: next-token ids with the
# class label in the final position (see words_and_class_labels).
X_t, Y_t = words_and_class_labels(X_train, y_train)

# Label each shape with the variable actually being printed.
print ('Y_t.shape:', Y_t.shape)
print ('X_t.shape:', X_t.shape)

Y_t.shape: (24229, 32)
X_train.shape: (24229, 32, 1)


In [17]:
# Print the first six reshaped training sequences. Each is a (t, 1) column;
# in the recorded output the real token ids are followed by zeros.
for i in range(6):
    print (X_t[i])

[[ 3423.]
 [ 2199.]
 [  776.]
 [ 1098.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]]
[[  160.]
 [   51.]
 [ 2123.]
 [   65.]
 [ 6957.]
 [ 2055.]
 [ 1805.]
 [  164.]
 [   24.]
 [   96.]
 [ 1587.]
 [ 5524.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]]
[[  7.30000000e+01]
 [  2.80000000e+01]
 [  8.90000000e+01]
 [  7.11000000e+02]
 [  6.00000000e+00]
 [  1.92600000e+03]
 [  1.79100000e+04]
 [  1.28570000e+04]
 [  9.25000000e+02]
 [  2.40000000e+01]
 [  2.80000000e+01]
 [  1.28900000e+04]
 [  2.80000000e+01]
 [  1.70940000e+04]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00

In [16]:
# Train the sandbox RNN on the inputs/targets held in X_t and Y_t.
# NOTE(review): the recorded run of this cell ends in
#   IndexError: index 20908 is out of bounds for axis 1 with size 17485
# Y_t holds raw token ids as targets, so presumably the classifier's output
# width is smaller than the largest target id -- confirm against
# SimpleRNNClassifier before re-running.
# NOTE(review): the meaning of the positional argument 4 is not visible
# here -- TODO confirm and name it.
clf = SimpleRNNClassifier(4)
clf.fit(X_t, Y_t, show_fig=True)

iteration: 0
X[j]: [[  485.]
 [  146.]
 [ 1224.]
 [  146.]
 [   33.]
 [ 5952.]
 [  206.]
 [   28.]
 [ 3592.]
 [ 2747.]
 [   28.]
 [   72.]
 [ 2686.]
 [   16.]
 [ 3174.]
 [ 9365.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]] Y[j]: [ 146 1224  146   33 5952  206   28 3592 2747   28   72 2686   16 3174 9365
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    1]
c: 9.76809047784846
p[-1]: 17134 Y[j,-1]: 1
X[j]: [[  2.00000000e+00]
 [  7.68900000e+03]
 [  1.90000000e+01]
 [  1.15700000e+04]
 [  1.44900000e+03]
 [  8.56000000e+02]
 [  9.32200000e+03]
 [  2.40000000e+01]
 [  2.93000000e+02]
 [  9.82000000e+02]
 [  2.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]


IndexError: index 20908 is out of bounds for axis 1 with size 17485
Apply node that caused the error: AdvancedIncSubtensor{inplace=False,  set_instead_of_inc=False}(Alloc.0, HostFromGpu(gpuarray).0, ARange{dtype='int64'}.0, Y)
Toposort index: 165
Inputs types: [TensorType(float64, matrix), TensorType(float64, vector), TensorType(int64, vector), TensorType(int32, vector)]
Inputs shapes: [(32, 17485), (32,), (32,), (32,)]
Inputs strides: [(139880, 8), (8,), (8,), (4,)]
Inputs values: ['not shown', 'not shown', 'not shown', 'not shown']
Outputs clients: [[GpuFromHost<None>(AdvancedIncSubtensor{inplace=False,  set_instead_of_inc=False}.0)]]

Backtrace when the node is created(use Theano flag traceback.limit=N to make it longer):
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 1272, in access_grad_cache
    term = access_term_cache(node)[idx]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 967, in access_term_cache
    output_grads = [access_grad_cache(var) for var in node.outputs]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 967, in <listcomp>
    output_grads = [access_grad_cache(var) for var in node.outputs]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 1272, in access_grad_cache
    term = access_term_cache(node)[idx]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 967, in access_term_cache
    output_grads = [access_grad_cache(var) for var in node.outputs]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 967, in <listcomp>
    output_grads = [access_grad_cache(var) for var in node.outputs]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 1272, in access_grad_cache
    term = access_term_cache(node)[idx]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 1108, in access_term_cache
    new_output_grads)

HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.