In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import spacy
import empath
from util import const, reader



In [3]:
%%time
df = reader.load_df(clean=True)
fm = const.FileManager()

CPU times: user 1min 6s, sys: 1.49 s, total: 1min 7s
Wall time: 1min 8s


In [4]:
nlp = spacy.load('en')
doc = nlp(df.iloc[1].body_content)
doc.print_tree()[0]

  "__main__", mod_spec)


{'word': '@pammaysmcdonald',
 'lemma': '@pammaysmcdonald',
 'NE': 'GPE',
 'POS_fine': 'NN',
 'POS_coarse': 'NOUN',
 'arc': 'ROOT',
 'modifiers': [{'word': '@docrocktex26',
   'lemma': '@docrocktex26',
   'NE': '',
   'POS_fine': 'CC',
   'POS_coarse': 'CCONJ',
   'arc': 'pobj',
   'modifiers': [{'word': '@chesterbadger3',
     'lemma': '@chesterbadger3',
     'NE': '',
     'POS_fine': 'NN',
     'POS_coarse': 'NOUN',
     'arc': 'amod',
     'modifiers': []}]},
  {'word': '@NPR',
   'lemma': '@npr',
   'NE': '',
   'POS_fine': 'RB',
   'POS_coarse': 'ADV',
   'arc': 'punct',
   'modifiers': []}]}

# Empath
Lexicon substitute for the commercial LIWC.

In [5]:
lexicon = empath.Empath()
s = 'he hit the other person'
l = lexicon.analyze(s, normalize=True)
[(k,v) for k, v in l.items() if v > 0]

[('movement', 0.2),
 ('violence', 0.2),
 ('pain', 0.2),
 ('negative_emotion', 0.2)]

# Word collections

`;` indicates comments, one empty line between comments and start of words.

In [6]:
!ls ./collections/
!file ./collections/negative-words.txt

negative-words.txt       verbs-factive.txt        verbs-nonimplicative.txt
positive-words.txt       verbs-implicative.txt    verbs-reporting.txt
subjclues.json           verbs-nonfactive.txt
./collections/negative-words.txt: ISO-8859 text


In [7]:
def words_from_file(fname, encoding='UTF-8'):
    with open(fname, encoding=encoding) as f:
        s = set()
        include = lambda w: not (len(w.strip()) == 0 or w.startswith(';'))
        return set(w.strip().lower() for w in f if include(w))

fnames = [
    'negative-words.txt', 'verbs-factive.txt', 'verbs-nonimplicative.txt', 
    'positive-words.txt', 'verbs-implicative.txt', 'verbs-reporting.txt']

words = {}
for fname in fnames:
    key = fname[:fname.index('.')]
    path = fm.collection(fname)
    
    try:
        words[key] = words_from_file(path)
    except:
        words[key] = words_from_file(path, encoding='ISO-8859-1')

# Subjclues

Used vim to transfer them to JSON; removed 'polarity' and 'mpqpolarity' since only one entry uses these.

In [8]:
import pandas as pd
df_subjclues = pd.read_json(fm.collection('subjclues.json'))

In [9]:
df_subjclues.head()

Unnamed: 0,len,pos1,priorpolarity,stemmed1,type,word1
0,1,adj,negative,n,weaksubj,abandoned
1,1,noun,negative,n,weaksubj,abandonment
2,1,verb,negative,y,weaksubj,abandon
3,1,verb,negative,y,strongsubj,abase
4,1,anypos,negative,y,strongsubj,abasement


In [10]:
subjclues_set = set(df_subjclues.word1)

In [11]:
import sklearn.feature_extraction as fe

sentence = "this is a test"
dv = fe.DictVectorizer()

In [12]:
dv.fit_transform([{'x': 2, 'y': 1}, {'y': 200}]).todense()

matrix([[  2.,   1.],
        [  0., 200.]])

In [13]:
words.values()



In [168]:
from collections import defaultdict
import util
import numpy as np


def dict_features(tokens, feature_set):
    ret = defaultdict(int) #dict((f, 0) for f in feature_set)
    for t in tokens:
        if t in feature_set:
            ret[t] += 1
    return ret


feature_set = set(np.reshape(words.values(), -1))
feature_set = feature_set.union(subjclues_set)




In [169]:
import nltk
from nltk.tokenize import RegexpTokenizer
from util import data_process, functions
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import TransformerMixin


class TokenizerTransformer(TransformerMixin):
    
    def __init__(self):
        self.normalizer = data_process.TweetNormalizer()
        self.regex_word_tokenizer = RegexpTokenizer(r'[@]?\w+')
        stemmer = PorterStemmer()
        stem = lambda ws: [stemmer.stem(w) for w in ws]
        self.pipeline = util.compose(
            self.normalizer.transform,
            self.regex_word_tokenizer.tokenize,
            # 10 times faster without stemming 
            # stem,
        )
        
        self.fit = functions.const(self)
        self.transform = self.pipeline


def tokenize(xs):
    regex_word_tokenizer = RegexpTokenizer(r'[@]?\w+')
    xs = data_process.normalize(xs)
    pipeline = data_process.compose(
        regex_word_tokenizer.tokenize,)
    return map(pipeline, xs)


xs_tok = tokenize(['this is a TEST. @user and a sentence. insolent, gutless'])
xs_feat = map(lambda t: dict_features(t, feature_set), xs_tok)
print(list(xs_feat))
xs_tok = tokenize(['this is a TEST. @user and a sentence. insolent, gutless'])
print(list(xs_tok))
print(TokenizerTransformer().transform('this is a TEST. @user and a sentence. insolent, gutless'))

[defaultdict(<class 'int'>, {'insolent': 1})]
[['this', 'is', 'a', 'test', '@user', 'and', 'a', 'sentence', 'insolent', 'gutless']]
['this', 'is', 'a', 'test', '@user', 'and', 'a', 'sentence', 'insolent', 'gutless']


In [161]:
%%time
tweet_tokenizer = TokenizerTransformer()
full_data_count_vectorizer = fe.text.CountVectorizer(
    analyzer=tweet_tokenizer.transform)
full_data_count_vectorizer.fit_transform(df.body_content)


CPU times: user 6.6 s, sys: 281 ms, total: 6.88 s
Wall time: 7.5 s


In [192]:
texts_transformed = df.body_content.apply(TokenizerTransformer().transform)
text_lens = texts_transformed.apply(len)

In [202]:
uniq_words = set([i for sublist in texts_transformed.values.tolist() for i in sublist])
feature_set_limited = feature_set.intersection(uniq_words)

In [204]:
print(len(feature_set))
print(len(feature_set_limited))

6887
4114


In [190]:
text_lens.max()

53

In [173]:
import sklearn.pipeline as pipe
from util import pipeutil

def lexical_feat_pipeline():
    dict_vectorizer_lex = fe.DictVectorizer()
    dict_vectorizer_empath = fe.DictVectorizer()
    
    lex_empath_pipe = pipeutil.union(
        pipeutil.pipe(
            lambda x: dict_features(x, feature_set),
            dict_vectorizer_lex),
        pipeutil.pipe(
            lexicon.analyze,
            dict_vectorizer_empath
        ))
    
    return lex_empath_pipe


xs_tok = tokenize(df.body_content.iloc[:4])
pipeline = lexical_feat_pipeline()
print(pipeline.fit_transform(list(xs_tok)).todense())
print(pipeline.transform(list(tokenize(['insolent']))).todense())


[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 

In [53]:
from util.pipeutil import GlobalTransform


def network_features(df):
    maybe_empty_list = lambda obj: obj if isinstance(obj, list) else []
    filter_tweeting_user = lambda atreplies, username: [
        (r, 1) for r in maybe_empty_list(atreplies) 
        if r[1:] != username]
    
    return map(dict, 
        (filter_tweeting_user(atreplies, username) 
        for atreplies, username in zip(df.body_atreplies, df.user_handle)))

def network_feat_pipeline():
    dict_vectorizer = fe.DictVectorizer()
    return pipeutil.pipe(
        GlobalTransform(network_features), 
        dict_vectorizer)

In [44]:
list(network_features(df.iloc[20:30]))

[{'@el_pais': 1},
 {},
 {},
 {'@clarincom': 1,
  '@la100fm': 1,
  '@telefecom': 1,
  '@bairesdirecto': 1,
  '@dariobarassi': 1},
 {},
 {},
 {},
 {},
 {'@Ebooksreport': 1, '@USATODAY': 1},
 {'@guardian': 1}]

In [21]:
import keras.preprocessing.text as kpt

tok = kpt.Tokenizer()
texts = "this is the first text", "some other second document"
tok.fit_on_texts(texts)


In [22]:
print(util.compose(tokenize, list)(df.body_content.head(n=2)))


[['@user', 'a', 'korean', 'american', 'man', 'detained', 'in', 'north', 'korea', 'has', 'reportedly', 'confessed', 'to', 'trying', '@url'], ['@user', '@user', '@user', '@user', 'meanwhile', 'culture', 'economic', 'shifts', 'left', 'both', 'irrelevant', 'but', 'still', 'sizeable', 'demographics']]


In [23]:
%%time
from util import gloveutil

# fit CountVectorizer on entire dataset and later use it for
# word-to-index mapping?

glove_200 = gloveutil.load_glove(const.FileManager.GLOVE_200)

CPU times: user 1min 15s, sys: 2.81 s, total: 1min 18s
Wall time: 1min 19s


In [39]:
%%time 
embedding_matrix = gloveutil.glove_to_embedding_matrix(
    glove_200, full_data_count_vectorizer.vocabulary_)


CPU times: user 164 ms, sys: 12.9 ms, total: 177 ms
Wall time: 178 ms


In [155]:
def network_cues_model(input_shape):
    #model = Sequential()
    #model.add(Dense(100, input_shape=input_shape, activation='relu'))
    #model.add(Dense(100, input_shape=input_shape, activation='relu'))
    xs = keras.Input(shape=input_shape)
    ys = keras.layers.Dense(100)(xs)
    ys = keras.layers.Dense(100)(ys)
    return xs, ys, Model(xs, ys)

from keras import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from util import pipeutil

def sequence_model(embedding_matrix, word_index, max_sequence_length):
    """
    embedding_matrix: matrix of 200-dim GloVe embeddings
    word_index: word-to-index dictionary
    """
    embedding_dim = 200
    num_words = len(word_index)
    
    sequence_input = Input(
      shape=(max_sequence_length,), dtype='int32')
    embedding_layer = Embedding(
      num_words, embedding_dim, weights=[embedding_matrix],
      input_length=max_sequence_length, trainable=False)
    
    embedded_sequences = embedding_layer(sequence_input)

    x = Conv1D(100, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(100, 5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    # x = keras.layers.Flatten()(x)
    x = Dense(100, activation='relu')(x)

    return sequence_input, x, Model(sequence_input, x)


import keras
def full_model(ys_left, ys_right):
    ys_merged = keras.layers.concatenate([ys_left, ys_right])
    ys_out = keras.layers.Dense(100)(ys_merged)
    return keras.Model(
      inputs=[xs_left, xs_right], outputs=ys_merged)


def get_network_lex_cues_pipe():
    # TODO this can be done prior to iteration
    # network_feat_pipeline is too slow
    #net_pipe = network_feat_pipeline()
    #net_pipe = pipeutil.pipe(
    #    GlobalTransform(lambda df: df.body_atreplies))
    
    cues_pipe = pipeutil.pipe(
        GlobalTransform(lambda df: df.body_content),
        TokenizerTransformer().transform,
        lexical_feat_pipeline())
    return pipeutil.union(cues_pipe)



In [156]:
loh = pd.get_dummies(df.label)
df = df.assign(label_onehot=loh.values.tolist())

In [157]:
def to_sequence(index, text):
    return [index[w] for w in text if w in index]
to_sequence({'a': 0, 'b': 1}, ['b', 'a'])

[1, 0]

In [207]:
#preprocess_nl = get_network_lex_cues_pipe()
#%time xs_nl = preprocess_nl.fit_transform(df)

In [None]:

preprocess_nl = get_network_lex_cues_pipe()

In [191]:
from sklearn.model_selection import KFold, StratifiedKFold
kfold = StratifiedKFold(n_splits=2, random_state=const.SEED)

for i, (itrain, itest) in enumerate(kfold.split(df, y=df.label)):
    print(i)
    
    df_train = df.iloc[itrain]
    ys_train = df.label.iloc[itrain]
    df_test = df.iloc[itest]
    ys_test = df.label.iloc[itest]
    
    print("Train stats\n%s" % df_train.label.value_counts())
    print("Test stats\n%s" % df_test.label.value_counts())
    
    print("Preprocessing")
    preprocess_nl = get_network_lex_cues_pipe()
    %time xs_train_nl = preprocess_nl.fit_transform(df_train)
    %time xs_test_nl = preprocess_nl.transform(df_test)
    
    print("Preprocessed network & lexical cues.")
    lmap = lambda f, l: list(map(f, l))
    tweet_tokenizer = TokenizerTransformer()
    
    xs_train_cnn = lmap(tweet_tokenizer.transform, df_train.body_content)
    xs_test_cnn = lmap(tweet_tokenizer.transform, df_test.body_content)
    
    print(xs_train_cnn[:5])
    print(xs_test_cnn[:5])

    count_vectorizer = fe.text.CountVectorizer(
        tokenizer=lambda x: x,
        analyzer=lambda x: x)
    count_vectorizer.fit(xs_train_cnn)
    
    to_seq_vocab = lambda l: lmap(
        lambda x: to_sequence(count_vectorizer.vocabulary_, x), l)
    print(to_seq_vocab(xs_train_cnn[:5]))
    
    max_seq_len = text_lens.max()
    xs_train_cnn = pad_sequences(
        to_seq_vocab(xs_train_cnn), maxlen=max_seq_len, padding='post')
    xs_test_cnn = pad_sequences(
        to_seq_vocab(xs_test_cnn), maxlen=max_seq_len, padding='post')
    
    embedding_matrix = gloveutil.glove_to_embedding_matrix(
        glove_200, count_vectorizer.vocabulary_)
    
    print("Preprocessed CNN inputs, building model")
    print("NC shape", xs_train_nl.shape[1:])
    nl_in, nl_out, model_nl = network_cues_model(xs_train_nl.shape[1:])
    cnn_in, cnn_out, model_cnn = sequence_model(
        embedding_matrix, count_vectorizer.vocabulary_, max_seq_len)
    
    out_merged = keras.layers.concatenate([nl_out, cnn_out])
    out = Dense(len(df.label.unique()), activation='softmax')(out_merged)
    model = keras.Model(
      inputs=[nl_in, cnn_in], outputs=out)
    optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
    
    print("Compiling model")
    model.compile(
      optimizer, 
      loss=keras.losses.categorical_crossentropy,
      metrics=['accuracy'])
    
    print("Starting training.")
    model.fit([xs_train_nl, xs_train_cnn], ys_train,
          batch_size=128,
          epochs=10,
          validation_data=([xs_test_nl, xs_test_cnn], ys_test),)
    

0
Train stats
1    26917
0    23752
Name: label, dtype: int64
Test stats
1    26917
0    23752
Name: label, dtype: int64
Preprocessing
Preprocessed network & lexical cues.


KeyboardInterrupt: 

In [None]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split

x, y, vocabulary, vocabulary_inv = load_data()
X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42)

sequence_length = x.shape[1]
vocabulary_size = len(vocabulary_inv)
embedding_dim = 256
filter_sizes = [1, 1]
num_filters = 512
drop = 0.5

epochs = 100
batch_size = 30

# this returns a tensor
print("Creating Model...")
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(
    input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
reshape = Reshape((sequence_length,embedding_dim,1))(embedding)

conv_0 = Conv2D(
    num_filters, kernel_size=(filter_sizes[0], embedding_dim), 
    padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(
    num_filters, kernel_size=(filter_sizes[1], embedding_dim), 
    padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(
    num_filters, kernel_size=(filter_sizes[2], embedding_dim), 
    padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(
    sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(
    sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(
    sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='softmax')(dropout)

# this creates a model that includes
model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint(
    'weights.{epoch:03d}-{val_acc:.4f}.hdf5', 
    monitor='val_acc', 
    verbose=1, 
    save_best_only=True, 
    mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
print("Traning Model...")
model.fit(
    X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, 
    callbacks=[checkpoint], validation_data=(X_test, y_test))

In [None]:
from keras import Sequential
test_model = Sequential()
test_model.add(Dense(200, input_shape=(200,)))

In [None]:
test_model.summary()