Taken from https://www.kaggle.com/mkowoods/deep-learning-lstm-for-tweet-classification

In [1]:
import os
import re

import numpy as np
import tensorflow as tf

np.random.seed(1)
tf.random.set_seed(2)

import pandas as pd
import keras
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from sklearn.metrics import f1_score, classification_report, log_loss

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional, Flatten
from keras.layers import Dropout, Conv1D, GlobalMaxPool1D, GRU, GlobalAvgPool1D
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

#training constants
MAX_SEQ_LEN = 25 #this is based on a quick analysis of the len of sequences train['text'].apply(lambda x : len(x.split(' '))).quantile(0.95)
DEFAULT_BATCH_SIZE = 128

Using TensorFlow backend.


# First, we'll need to load the csv

In [2]:
LABEL_COLUMN = 'sentiment'
LABELS = [0, 4]

train_file_path='./models/training_data/training.1600000.processed.noemoticon.csv'
data = pd.read_csv(train_file_path)
train, test = train_test_split(data, random_state = 42, train_size=0.03, test_size=0.003)
print(train.shape)
print(test.shape)


(48000, 6)
(4800, 6)


## Next, we need to encode the text

In [3]:
def clean_text(text, mapping):
    replace_white_space = ["\n"]
    for s in replace_white_space:
        text = text.replace(s, " ")
    replace_punctuation = ["’", "‘", "´", "`", "\'", r"\'"]
    for s in replace_punctuation:
        text = text.replace(s, "'")
    
    # Random note: removing the URLs slightly degraded performance, it's possible the model learned that certain URLs were positive/negative
    # And was able to extrapolate that to retweets. Could also explain why re-training the Embeddings improves performance.
    # remove twitter url's
    #     text = re.sub(r"http[s]?://t.co/[A-Za-z0-9]*","TWITTERURL",text)
    mapped_string = []
    for t in text.split(" "):
        if t in mapping:
            mapped_string.append(mapping[t])
        elif t.lower() in mapping:
            mapped_string.append(mapping[t.lower()])
        else:
            mapped_string.append(t)
    return ' '.join(mapped_string)

CONTRACTION_MAPPING = {
    "don't": "do not",
    "isn't": "is not",
    "aren't": "are not",
}

# Get tweets from Data frame and convert to list of "texts" scrubbing based on clean_text function
# CONTRACTION_MAPPING is a map of common contractions(e.g don't => do not)
train_text_vec = [clean_text(text, CONTRACTION_MAPPING) for text in train['text'].values]
test_text_vec = [clean_text(text, CONTRACTION_MAPPING) for text in test['text'].values]


# tokenize the sentences
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(train_text_vec)
train_text_vec = tokenizer.texts_to_sequences(train_text_vec)
test_text_vec = tokenizer.texts_to_sequences(test_text_vec)

# pad the sequences
train_text_vec = pad_sequences(train_text_vec, maxlen=MAX_SEQ_LEN)
test_text_vec = pad_sequences(test_text_vec, maxlen=MAX_SEQ_LEN)

print('Number of Tokens:', len(tokenizer.word_index))
print("Max Token Index:", train_text_vec.max(), "\n")

print('Sample Tweet Before Processing:', train["text"].values[0])
print('Sample Tweet After Processing:', tokenizer.sequences_to_texts([train_text_vec[0]]), '\n')

print('What the model will interpret:', train_text_vec[0].tolist())

Number of Tokens: 68697
Max Token Index: 68697 

Sample Tweet Before Processing: @rahulsood When's the update to the Envy133 coming? I want ont but know that it's trailing edge speed w/o trailing edge price now 
Sample Tweet After Processing: ["rahulsood When's the update to the Envy133 coming I want ont but know that it's trailing edge speed w o trailing edge price now"] 

What the model will interpret: [0, 19900, 19901, 3, 637, 1, 3, 19902, 274, 2, 73, 12738, 21, 54, 18, 58, 12739, 3449, 3084, 285, 595, 12739, 3449, 2006, 30]


In [4]:
# One Hot Encode Y values:
encoder = LabelEncoder()

y_train = encoder.fit_transform(train['sentiment'].values)
y_train = to_categorical(y_train) 

y_test = encoder.fit_transform(test['sentiment'].values)
y_test = to_categorical(y_test) 

# get an idea of the distribution of the text values
from collections import Counter
ctr = Counter(train['sentiment'].values)
print('Distribution of Classes:', ctr)

# get class weights for the training data, this will be used data
y_train_int = np.argmax(y_train,axis=1)
cws = class_weight.compute_class_weight('balanced', np.unique(y_train_int), y_train_int)
print(cws)

print('Dominant Class: ', ctr.most_common(n = 1)[0][0])
print('Baseline Accuracy Dominant Class', (ctr.most_common(n = 1)[0][0] == test['sentiment'].values).mean())

preds = np.zeros_like(y_test)
preds[:, 0] = 1
preds[0] = 1 #done to suppress warning from numpy for f1 score
print('F1 Score:', f1_score(y_test, preds, average='weighted'))

Distribution of Classes: Counter({4: 24025, 0: 23975})
[1.00104275 0.99895942]
Dominant Class:  4
Baseline Accuracy Dominant Class 0.5020833333333333
F1 Score: 0.3310210941121928


In [5]:
def threshold_search(y_true, y_proba, average = None):
    best_threshold = 0
    best_score = 0
    for threshold in [i * 0.01 for i in range(100)]:
        score = f1_score(y_true=y_true, y_pred=y_proba > threshold, average=average)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'f1': best_score}
    return search_result


def train(model, 
          X_train, y_train, X_test, y_test, 
          checkpoint_path='model.hdf5', 
          epochs = 25, 
          batch_size = DEFAULT_BATCH_SIZE, 
          class_weights = None, 
          fit_verbose=2,
          print_summary = True
         ):
    m = model()
    if print_summary:
        print(m.summary())
    m.fit(
        X_train, 
        y_train, 
        #this is bad practice using test data for validation, in a real case would use a seperate validation set
        validation_data=(X_test, y_test),  
        epochs=epochs, 
        batch_size=batch_size,
        class_weight=class_weights,
         #saves the most accurate model, usually you would save the one with the lowest loss
        callbacks= [
            ModelCheckpoint(checkpoint_path, monitor='val_acc', verbose=1, save_best_only=True),
            EarlyStopping(patience = 2)
        ],
        verbose=fit_verbose
    ) 
    print("\n\n****************************\n\n")
    #print('Loading Best Model...')
    #m.load_weights(checkpoint_path)
    predictions = m.predict(X_test, verbose=1)
    print('Validation Loss:', log_loss(y_test, predictions))
    print('Test Accuracy', (predictions.argmax(axis = 1) == y_test.argmax(axis = 1)).mean())
    print('F1 Score:', f1_score(y_test.argmax(axis = 1), predictions.argmax(axis = 1), average='weighted'))
    return m #returns best performing model

In [6]:
# First model version
def model_1():
    model = Sequential()
    model.add(Embedding(input_dim = (len(tokenizer.word_counts) + 1), output_dim = 128, input_length = MAX_SEQ_LEN))
    model.add(LSTM(128))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

m1 = train(model_1, 
           train_text_vec,
           y_train,
           test_text_vec,
           y_test,
           epochs=5,
           checkpoint_path='/Users/tom/tikal/streamer/checkpoints/model_1.h5',
           class_weights=cws
          )

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 128)           12682240  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 12,822,210
Trainable params: 12,822,210
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 80000 samples, validate on 8000 samples
Epoch 1/5
 - 179s - loss: 0.4995 - accuracy: 0.7522 - val_loss: 0.4617 - val_accuracy: 0.7828
Epoch 2/5




 - 211s - loss: 0.3080 - accuracy: 0.8679 - val_loss: 0.5105 - val_accuracy: 0.7660
Epoch 3/5
 - 188s - loss: 0.1571 - accuracy: 0.9378 - val_loss: 0.6503 - val_accuracy: 0.7588
Epoch 4/5
 - 182s - loss: 0.0874 - accuracy: 0.9662 - val_loss: 0.8673 - val_accuracy: 0.7452
Epoch 5/5
 - 166s - loss: 0.0552 - accuracy: 0.9790 - val_loss: 1.0593 - val_accuracy: 0.7355


****************************


Validation Loss: 1.059322223015625
Test Accuracy 0.7355
F1 Score: 0.7354722744580906


In [9]:
def model_1b():
    """
    Using a Bidiretional LSTM. 
    """
    model = Sequential()
    model.add(Embedding(input_dim = (len(tokenizer.word_counts) + 1), output_dim = 128, input_length = MAX_SEQ_LEN))
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(LSTM(128, dropout=0.25, recurrent_dropout=0.25)))
    model.add(Dense(64, activation='relu'))
#     model.add(Dropout(0.3))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

m1b = train(model_1b, 
           train_text_vec,
           y_train,
           test_text_vec,
           y_test,
           checkpoint_path='model_1b.h5',
           class_weights=cws,
           print_summary = True
          )

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 25, 128)           12682240  
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 25, 128)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_7 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 130       
Total params: 12,961,986
Trainable params: 12,961,986
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 80000 samples, validate on 8000 samples
Epoch 1/25
 - 139s - loss: 0.5138 - accuracy: 0.7425 - val_loss: 0.4594 - val_accuracy: 0.7799
Epoch 2/25




 - 423s - loss: 0.3662 - accuracy: 0.8397 - val_loss: 0.4686 - val_accuracy: 0.7759
Epoch 3/25
 - 141s - loss: 0.2415 - accuracy: 0.9005 - val_loss: 0.5962 - val_accuracy: 0.7641
Epoch 4/25
 - 161s - loss: 0.1551 - accuracy: 0.9406 - val_loss: 0.6692 - val_accuracy: 0.7499
Epoch 5/25
 - 147s - loss: 0.1105 - accuracy: 0.9574 - val_loss: 0.8145 - val_accuracy: 0.7469
Epoch 6/25
 - 165s - loss: 0.0847 - accuracy: 0.9681 - val_loss: 0.8511 - val_accuracy: 0.7454
Epoch 7/25
 - 158s - loss: 0.0687 - accuracy: 0.9739 - val_loss: 1.0575 - val_accuracy: 0.7467
Epoch 8/25
 - 143s - loss: 0.0568 - accuracy: 0.9785 - val_loss: 1.1072 - val_accuracy: 0.7372
Epoch 9/25
 - 143s - loss: 0.0498 - accuracy: 0.9813 - val_loss: 1.1196 - val_accuracy: 0.7439
Epoch 10/25
 - 143s - loss: 0.0428 - accuracy: 0.9841 - val_loss: 1.1907 - val_accuracy: 0.7421
Epoch 11/25
 - 146s - loss: 0.0378 - accuracy: 0.9857 - val_loss: 1.2147 - val_accuracy: 0.7383
Epoch 12/25
 - 3063s - loss: 0.0333 - accuracy: 0.9876 - va

KeyboardInterrupt: 

In [12]:
def model_1c():
    """
    Adding dropout to reduce overfitting using a bidiretional LSTM
    """
    model = Sequential()
    model.add(Embedding(input_dim = (len(tokenizer.word_counts) + 1), output_dim = 128, input_length = MAX_SEQ_LEN))
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(LSTM(128, dropout=0.25, recurrent_dropout=0.25, return_sequences=True)))
    model.add(Conv1D(64, 4))
#     model.add(Flatten())
    model.add(GlobalMaxPool1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#     print(model.summary())
    return model


m1c = train(model_1c, 
           train_text_vec,
           y_train,
           test_text_vec,
           y_test,
           checkpoint_path='model_1c.h5',
           class_weights=cws,
           print_summary = True
          )

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 25, 128)           12682240  
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 25, 128)           0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 25, 256)           263168    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 22, 64)            65600     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_12 (Dense)             (None, 2)                

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 80000 samples, validate on 8000 samples
Epoch 1/25
 - 151s - loss: 0.5042 - accuracy: 0.7488 - val_loss: 0.4566 - val_accuracy: 0.7824
Epoch 2/25




 - 153s - loss: 0.3393 - accuracy: 0.8525 - val_loss: 0.4940 - val_accuracy: 0.7714
Epoch 3/25
 - 158s - loss: 0.1980 - accuracy: 0.9210 - val_loss: 0.6092 - val_accuracy: 0.7655


****************************


Validation Loss: 0.6091855999883811
Test Accuracy 0.7655
F1 Score: 0.7654859730140383


In [6]:
def model_1d():
    """
    Just for fun below is a model only using covolutions. This is pretty good and also trains very quickly(and predictions would also likely be fast) compared to the LSTM...
    It's equivalent to using an n-gram based approach.
    Usually in practice you would use a more complex architecture with multiple parallel convolutions that are combined before pooling(and usually both max and avg).
    Pure Convolutional NLP is definitely a solution worth exploring further.
    """
    model = Sequential()
    model.add(Embedding(input_dim = (len(tokenizer.word_counts) + 1), output_dim = 128, input_length = MAX_SEQ_LEN))
    model.add(SpatialDropout1D(0.3))
    model.add(Conv1D(64, 5))
    model.add(Conv1D(64, 3))
    model.add(Conv1D(64, 2))
    model.add(GlobalMaxPool1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


m1d = train(model_1d, 
           train_text_vec,
           y_train,
           test_text_vec,
           y_test,
           checkpoint_path='model_1d.h5',
           class_weights=cws,
           print_summary = True
          )

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 128)           8793344   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 25, 128)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 21, 64)            41024     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 19, 64)            12352     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 18, 64)            8256      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)               

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 48000 samples, validate on 4800 samples
Epoch 1/25
 - 42s - loss: 0.5413 - accuracy: 0.7178 - val_loss: 0.4866 - val_accuracy: 0.7640
Epoch 2/25




 - 41s - loss: 0.3567 - accuracy: 0.8436 - val_loss: 0.5210 - val_accuracy: 0.7504
Epoch 3/25
 - 41s - loss: 0.1785 - accuracy: 0.9301 - val_loss: 0.7180 - val_accuracy: 0.7277


****************************


Validation Loss: 0.7179547698535527
Test Accuracy 0.7277083333333333
F1 Score: 0.7265767092106544


In [66]:
def predict(model, text):
    text_vector = [clean_text(text, CONTRACTION_MAPPING)]
    text_vector = tokenizer.texts_to_sequences(text_vector)
    text_vector = pad_sequences(text_vector, maxlen=MAX_SEQ_LEN)
    prediction = model.predict(text_vector)
    arr_prediction = prediction.tolist()[0]
    max_prediction = max(arr_prediction)
    index = arr_prediction.index(max_prediction)
    if max_prediction > 0.75:
        if index == 0:
            return ("Negative", max_prediction)
        return ("Positive", max_prediction)
    return ("Unknown", max_prediction)

In [67]:
#positive
prediction = predict(m1d, "@siyab it has been long I have seen you on twitter... Your avatar rocks as always ")
print(prediction)

#negative
prediction = predict(m1d, "It's sunburns like these that make me hate being a red head. I'd trade it for the ability to tan instead of burn any day...")
print(prediction)

prediction = predict(m1d, "The need for independent journalism has never been greater. Become a Guardian supporter: http://gu.com/supporter/guardiannews")
print(prediction)

('Positive', 0.9623548984527588)
('Negative', 0.9230782389640808)
('Positive', 0.8561643958091736)


In [76]:
m1d.save("twitter_sentiment.hd5")

In [64]:
m1dloaded = tf.keras.models.load_model('twitter_sentiment')

In [65]:
prediction = predict(m1dloaded, "The need for independent journalism has never been greater. Become a Guardian supporter: http://gu.com/supporter/guardiannews")
print(prediction)

Positive


In [70]:
# save the tokenizer
import io
import json
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [71]:
from keras_preprocessing.text import tokenizer_from_json
#load tokenizer
with open('tokenizer.json') as f:
    data = json.load(f)
    tokenizerr = tokenizer_from_json(data)


In [75]:
model = m1d
export_path = 'models/sentiment_analyzer/model.pd'
print('Exporting trained model to', export_path)
builder = tf.saved_model.builder.SavedModelBuilder(export_path)
builder.add_meta_graph_and_variables(
      sess, [tf.saved_model.tag_constants.SERVING],
      signature_def_map={
           'predict_images':
               prediction_signature,
           signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
               classification_signature,
      },
      main_op=tf.tables_initializer())
builder.save()


Exporting trained model to models/sentiment_analyzer/model.pd


AttributeError: module 'tensorflow_core._api.v2.saved_model' has no attribute 'builder'