In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation, Dropout
from keras.utils.np_utils import to_categorical
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
import numpy as np
import string, re
import nltk
from sklearn.utils import shuffle
import os.path
from keras.utils.visualize_util import plot

Using Theano backend.


In [2]:
from tweet_analysis.Tools import data_processing as dp

In [3]:
from keras.regularizers import l1l2

In [44]:
tweet_type = ['raw_tweet', 'transformed', 'lemmatized']

#Getting pandas dataframes
airlines_data = []
pol_data = []

pol_data.append(dp.get_clean_data(lemmatize=False))
airlines_data.append(dp.get_clean_data('Tweets', lemmatize=False))

pol_data.append(dp.get_clean_data(lemmatize=True))
airlines_data.append(dp.get_clean_data('Tweets', lemmatize=True))

In [45]:
airline_tweets = []
pol_tweets = []

pol_sentiments = []
airline_sentiments = []

# First the tweets without any tansformations
airline_tweets.append(pol_data[0]['text'].values)
pol_tweets.append(airlines_data[0]['text'].values)

pol_sentiments.append(to_categorical(pd.Categorical.from_array(pol_data[0]['sentiment'].values).codes))
airline_sentiments.append(to_categorical(pd.Categorical.from_array(airlines_data[0]['airline_sentiment'].values).codes))

for airlines_df, pol_df in zip(airlines_data, pol_data):
    airline_tweets.append(airlines_df['processed_text'].values)
    pol_tweets.append(pol_df['processed_text'].values)
    
    pol_sentiments.append(to_categorical(pd.Categorical.from_array(pol_df['sentiment'].values).codes))
    airline_sentiments.append(to_categorical(pd.Categorical.from_array(airlines_df['airline_sentiment'].values).codes))
    
del pol_data[:]
del airlines_data[:]

In [52]:
# Data we want to experiment with

tweets = airline_tweets[1]
sentiment = airline_sentiments[1]

In [75]:
np.random.seed(123)
n_train = (tweets.shape[0]*2) // 3
inds = [i for i in range(tweets.shape[0])]
np.random.shuffle(inds)
train_inds = inds[:n_train]
test_inds = inds[n_train:]

X_train = tweets[train_inds]
X_test = tweets[test_inds]
y_train = sentiment[train_inds]
y_test = sentiment[test_inds]

In [76]:
vocab = set()
vocab.add('OOV') # For out of vocabulary words
# max number of timesteps (chars in our case)
max_len = 150

# learn vocab
for tweet in X_train:
    for c in tweet:
        vocab.add(c)

vocab = list(vocab)

char_id = {ch:i for i, ch in enumerate(vocab)}
id_char = {i:ch for i, ch in enumerate(vocab)}


In [77]:
# using bool to reduce memory usage
X = np.zeros((len(tweets), max_len, len(vocab)), dtype=np.bool)

print('Formating input and targets...')

# set the appropriate indices to 1 in each one-hot vector
for i, train_example in enumerate(tweets):
    for timestep, char in enumerate(train_example):
        if char in char_id:
            X[i, timestep, char_id[char]] = 1 # one hot encodings of tweet characters
        else:
            X[i, timestep, char_id['OOV']] = 1

X_train = X[train_inds]
X_test = X[test_inds]

Formating input and targets...


In [115]:
from keras.callbacks import EarlyStopping
from keras.layers.pooling import AveragePooling1D
from keras.layers.wrappers import TimeDistributed

from keras.engine.topology import Layer, InputSpec
from keras import backend as T

class TemporalMeanPooling(Layer):
    """
This is a custom Keras layer. This pooling layer accepts the temporal
sequence output by a recurrent layer and performs temporal pooling,
looking at only the non-masked portion of the sequence. The pooling
layer converts the entire variable-length hidden vector sequence
into a single hidden vector, and then feeds its output to the Dense
layer.

input shape: (nb_samples, nb_timesteps, nb_features)
output shape: (nb_samples, nb_features)
"""
def __init__(self, **kwargs):
    super(TemporalMeanPooling, self).__init__(**kwargs)
    self.supports_masking = True
    self.input_spec = [InputSpec(ndim=3)]

def get_output_shape_for(self, input_shape):
    return (input_shape[0], input_shape[2])

def call(self, x, mask=None): #mask: (nb_samples, nb_timesteps)
    if mask is None:
        mask = T.mean(T.ones_like(x), axis=-1)
    ssum = T.sum(x,axis=-2) #(nb_samples, np_features)
    mask = T.cast(mask,T.floatx())
    rcnt = T.sum(mask,axis=-1,keepdims=True) #(nb_samples)
    return ssum/rcnt
    #return rcnt

def compute_mask(self, input, mask):
    return None

In [123]:
callbacks = [EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='auto')]

In [124]:
print('Building model...')
#l1l2(l1=0.001, l2=0.001)

model = Sequential()
model.add(LSTM(15, 
               activation='tanh', 
               W_regularizer=None, 
               U_regularizer=None,
               return_sequences=True, 
               input_shape=(max_len, len(vocab))))

#model.add(LSTM(30, activation='tanh', return_sequences=False))
#model.add(TemporalMeanPooling())
#model.add(Dropout(0.2))
#model.add(LSTM(512, activation='tanh', return_sequences=False))
model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'precision', 'recall'])
plot(model, to_file='model.png', show_shapes=True)
print('Start fitting...')

Building model...
Start fitting...


In [None]:
hist = model.fit(X_train, y_train, batch_size=100, nb_epoch=100, validation_data=(X_test, y_test), callbacks=callbacks)

Train on 9358 samples, validate on 4679 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
 600/9358 [>.............................] - ETA: 11s - loss: 0.7083 - acc: 0.7117 - precision: 0.7854 - recall: 0.6217

In [62]:
scores = model.evaluate(X_test, y_test, batch_size=32, verbose=1, sample_weight=None)



In [63]:
scores

[0.91038657776130827,
 0.63688822398585221,
 0.63688822398585221,
 0.63688822398585221]

In [66]:
histories = []