In [1]:
# Until ttk is installed as a package, make the parent directory importable.
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate '..' entries on sys.path.
import sys
if '..' not in sys.path:
    sys.path.insert(0, '..')

In [2]:
# typical imports
import pandas as pd
import numpy as np
import re

import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

import matplotlib.pyplot as plt

import spacy
import ete3
import seaborn

from ttk.corpus import load_headline_corpus

In [3]:
%%time
# load the corpus
corpus = load_headline_corpus(verbose=True)

print ('Headlines:', len(corpus.sents()))

Loading corpus from: S:\git\tacticsiege\tactictoolkit\ttk\..\env\corpus\dated\2017_08_22\corpus
Corpus loaded.
Headlines: 190447
Wall time: 2.82 s


In [4]:
from ttk.preprocessing import Seq2WordVecTransformer
from ttk.preprocessing import SeqPaddingTransformer

from ttk.corpus import CategorizedDatedCorpusReporter

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from ttk.preprocessing import Seq2IndexTransformer

# Encoder used to turn the 'category' strings into integer class ids
label_enc = LabelEncoder()

# Build a DataFrame of the corpus for the two requested categories
reporter = CategorizedDatedCorpusReporter()
corpus_df = reporter.to_data_frame(
    corpus,
    categories=['Washington Post', 'CNN'],
)

# Features: every 'content' string split on whitespace into a token list
X = [headline.split() for headline in corpus_df['content'].values]
# Targets: integer-encoded 'category' column
y = label_enc.fit_transform(corpus_df['category'].values)


def token_filter(t, s):
    """Decide whether token ``t`` should be kept.

    A token passes only when ``t.isalpha()`` is true, so punctuation, digits
    and empty strings are rejected.  The second argument ``s`` is accepted
    but never used by this function.
    """
    keep = t.isalpha()
    return keep

def token_processor(t, s):
    """Return token ``t`` converted to lower case; ``s`` is ignored."""
    lowered = t.lower()
    return lowered

# Sequence-to-index transformer, wired with token_processor as the mapping
# function and token_filter as the filter function; delimiters are disabled.
indexer = Seq2IndexTransformer(
    add_delimiters=False,
    token_mapping_func=token_processor,
    token_filter_func=token_filter,
)
# Padding transformer with its default settings
pad = SeqPaddingTransformer()

Using TensorFlow backend.


In [5]:
# Peek at the first five tokenised headlines next to their encoded labels
for row in range(5):
    tokens, label = X[row], y[row]
    print('X:', tokens, 'y:', label)

X: ['"', 'Uninsured', 'ranks', 'still', 'to', 'grow', 'by', 'tens', 'of', 'millions', 'under', 'latest', 'House', 'health', '-', 'care', 'bill', ',', 'CBO', 'says', '"'] y: 1
X: ['"', 'Republican', 'candidate', 'in', 'Montana', 'race', 'allegedly', "'", 'body', '-', 'slammed', "'", 'reporter', ',', 'prompting', 'police', 'investigation', '"'] y: 1
X: ['"', 'Prepare', 'for', 'the', 'weirdest', 'Election', 'Day', 'in', 'history', ',', 'after', 'a', 'candidate', 'allegedly', 'body', '-', 'slams', 'a', 'reporter', '"'] y: 1
X: ['Audio', ':', 'Guardian', 'reporter', 'allegedly', 'body', '-', 'slammed', 'by', 'Greg', 'Gianforte'] y: 1
X: ['"', 'The', 'new', 'GOP', 'health', '-', 'care', 'bill', 'isn', "'", 't', 'any', 'better', ',', 'in', 'four', 'charts', '"'] y: 1


In [6]:
# Fit the indexer on the token lists and convert them to integer sequences
X_indexed = indexer.fit_transform(X)

# Show the first five indexed headlines
for row in range(5):
    indexed_headline = X_indexed[row]
    print('X[%i]:' % row, indexed_headline)

Max index: 17565
X[0]: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
X[1]: [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
X[2]: [31, 32, 33, 34, 35, 36, 21, 37, 38, 39, 20, 24, 25, 40, 39, 27]
X[3]: [41, 42, 27, 24, 25, 26, 7, 43, 44]
X[4]: [33, 45, 46, 14, 15, 16, 47, 48, 49, 50, 21, 51, 52]


In [7]:
# Fit the padder and bring every index sequence to a common length
X_padded = pad.fit_transform(X_indexed)

# Show the first five padded headlines
for row in range(5):
    padded_headline = X_padded[row]
    print('X[%i]' % row, padded_headline)

X[0] [  2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.  15.  16.
  17.  18.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[1] [ 19.  20.  21.  22.  23.  24.  25.  26.  27.  28.  29.  30.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[2] [ 31.  32.  33.  34.  35.  36.  21.  37.  38.  39.  20.  24.  25.  40.  39.
  27.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[3] [ 41.  42.  27.  24.  25.  26.   7.  43.  44.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[4] [ 33.  45.  46.  14.  15.  16.  47.  48.  49.  50.  21.  51.  52.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]


In [8]:
# split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.3, random_state=0)
print ('%i Training examples, %i Testing examples, Shape: %s' % (len(X_train), len(X_test), X_train.shape))

24229 Training examples, 10384 Testing examples, Shape: (24229, 27)


In [9]:
def reshape_and_seq_target(X, y, verbose=True):
    """Build next-step targets for ``X`` and add a trailing feature axis.

    For every row ``n`` the target at step ``i`` is the input at step
    ``i + 1``; the target at the final step is ``y[n]``.

    NOTE(review): the final-step target is a class label stored in the same
    array as the next-token indices, so the two share one numeric range --
    confirm this is what the consumer expects.

    Parameters
    ----------
    X : numpy.ndarray, shape (N, t)
        Two-dimensional array of sequence values.
    y : sequence
        One value per row of ``X``; only the first ``N`` entries are read.
    verbose : bool, default True
        When true, print the ``N, t`` shape line (the original behaviour).

    Returns
    -------
    X_shaped : numpy.ndarray, shape (N, t, 1), dtype int32
        ``X`` reshaped and cast to int32.
    Y_t : numpy.ndarray, shape (N, t), dtype int32
        Targets as described above.

    Raises
    ------
    IndexError
        If ``t > 0`` and ``y`` has fewer than ``N`` entries.
    """
    N, t = X.shape
    if verbose:
        print ('N, t', N, t)
    Y_t = np.zeros(X.shape, dtype=np.int32)
    # With zero time steps there is nothing to fill and y is never consulted.
    if t > 0:
        labels = np.asarray(y)[:N]
        if labels.shape[0] != N:
            raise IndexError(
                'y has %d entries but X has %d rows' % (labels.shape[0], N))
        # Slice assignment replaces the per-element Python loop: targets are
        # the inputs shifted left by one step ...
        Y_t[:, :-1] = X[:, 1:]
        # ... and the last step carries the per-row value from y.
        Y_t[:, -1] = labels.reshape(N)
    X_shaped = X.reshape(N, t, 1).astype(np.int32)
    return X_shaped, Y_t

def reshape_and_class_target(X, y, verbose=True):
    """Repeat each row's class value across all steps and reshape ``X``.

    Every position of row ``n`` in the target array is set to ``y[n]``.

    Parameters
    ----------
    X : numpy.ndarray, shape (N, t)
        Two-dimensional array of sequence values.
    y : sequence
        One value per row of ``X``; only the first ``N`` entries are read.
    verbose : bool, default True
        When true, print the ``N, t`` shape line (the original behaviour).

    Returns
    -------
    X_shaped : numpy.ndarray, shape (N, t, 1), dtype int32
        ``X`` reshaped and cast to int32.
    Y_t : numpy.ndarray, shape (N, t), dtype int32
        ``y[n]`` repeated ``t`` times for each row ``n``.

    Raises
    ------
    IndexError
        If ``t > 0`` and ``y`` has fewer than ``N`` entries.
    """
    N, t = X.shape
    if verbose:
        print ('N, t', N, t)
    Y_t = np.zeros(X.shape, dtype=np.int32)
    # With zero time steps there is nothing to fill and y is never consulted.
    if t > 0:
        labels = np.asarray(y)[:N]
        if labels.shape[0] != N:
            raise IndexError(
                'y has %d entries but X has %d rows' % (labels.shape[0], N))
        # Broadcasting a column of labels across the time axis replaces the
        # per-element Python loop.
        Y_t[:] = labels.reshape(N, 1)
    X_shaped = X.reshape(N, t, 1).astype(np.int32)
    return X_shaped, Y_t

In [10]:
# Per-step class targets plus the reshaped integer training inputs
X_t, Y_t = reshape_and_class_target(X_train, y_train)
# float32 copy of the reshaped inputs
X_t_f = X_t.astype(np.float32)

# Print the first three examples, first from the integer array and then from
# the float copy, each next to its target row
for inputs in (X_t, X_t_f):
    for i in range(3):
        print('X[%i]:' % i, inputs[i], 'Y_t[%i]:' % i, Y_t[i])

N, t 24229 27
X[0]: [[ 743]
 [1967]
 [ 720]
 [1013]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]] Y_t[0]: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
X[1]: [[  39]
 [  45]
 [1903]
 [  59]
 [6006]
 [1847]
 [1631]
 [ 152]
 [  21]
 [  90]
 [1450]
 [4833]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]] Y_t[1]: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
X[2]: [[   67]
 [   83]
 [  661]
 [    5]
 [ 1150]
 [14811]
 [10816]
 [  407]
 [   21]
 [10843]
 [ 8066]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]] Y_t[2]: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
X[0]: [[  743.]
 [ 1967.]
 [  720.]
 [ 1013.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [

In [11]:
from ttk.sandbox.udemy import SimpleRNNClassifier

import theano

# Set Theano's exception verbosity to 'high' before building the model
theano.config.exception_verbosity = 'high'

# NOTE(review): the recorded run logs `cost: nan` on every iteration and the
# classification rate stays at ~0.53, so the model does not appear to learn.
# Presumably the raw vocabulary indices fed in as a single float feature are
# the cause -- TODO confirm before trusting any result from this cell.
clf = SimpleRNNClassifier(4)  # the training log reports this value as M
clf.fit(X_t_f, Y_t, show_fig=True)


Using cuDNN version 5110 on context None
Mapped name None to device cuda0: GeForce GTX 1080 Ti (0000:01:00.0)


D, K, N, M: 1 2 24229 4
iteration: 0
shape y: (27, 1, 2)
i: 0 cost: nan classification rate: 0.5298609104791778
duration: 177.09498119354248
iteration: 1
shape y: (27, 1, 2)
i: 1 cost: nan classification rate: 0.5324611003343102
duration: 175.81551504135132
iteration: 2
shape y: (27, 1, 2)
i: 2 cost: nan classification rate: 0.5324611003343102
duration: 175.8624029159546
iteration: 3
shape y: (27, 1, 2)
i: 3 cost: nan classification rate: 0.5324611003343102
duration: 175.87741804122925
iteration: 4
shape y: (27, 1, 2)
i: 4 cost: nan classification rate: 0.5324611003343102
duration: 175.4425926208496
iteration: 5
shape y: (27, 1, 2)
i: 5 cost: nan classification rate: 0.5324611003343102
duration: 176.13470792770386
iteration: 6
shape y: (27, 1, 2)
i: 6 cost: nan classification rate: 0.5324611003343102
duration: 177.26166033744812
iteration: 7
shape y: (27, 1, 2)
i: 7 cost: nan classification rate: 0.5324611003343102
duration: 178.00531554222107
iteration: 8
shape y: (27, 1, 2)
i: 8 cost

KeyboardInterrupt: 