In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Scikit-learn
from sklearn.model_selection import train_test_split

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras.layers import Bidirectional
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.layers import BatchNormalization
from keras.utils import np_utils

import gensim

import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools


In [5]:
# load dataset
df = pd.read_csv('twitterA_train_data.txt', sep="\t",names=['Number','Label','Text'])

# split data
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))


TRAIN size: 12441
TEST size: 3111


In [None]:
# wotd2vec
def word_to_vector(df):
  docs = [_text.split() for _text in df.Text] 
  w2v_model = gensim.models.word2vec.Word2Vec(size=300, window=7, min_count=10, workers=8)
  w2v_model.build_vocab(docs)
  words = w2v_model.wv.vocab.keys()
  vocab_size = len(words)
  w2v_model.train(docs, total_examples=len(docs), epochs=8)
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(df.Text)
  vocab_size = len(tokenizer.word_index) + 1

  return tokenizer

In [7]:
# prepare train test data
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train.tidy), maxlen=300)
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.tidy), maxlen=300)
labels = df_train.Label.unique().tolist()
encoder = LabelEncoder()
encoder.fit(df_train.Label.tolist())

y_train = encoder.transform(df_train.Label.tolist())
y_test = encoder.transform(df_test.Label.tolist())

y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

def embedding_layer():
  # embedding layer
  embedding_matrix = np.zeros((vocab_size, 300))
  print(embedding_matrix.shape)
  for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
      embedding_matrix[i] = w2v_model.wv[word]
  print(embedding_matrix.shape)

  embedding_layer = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=300, trainable=False)
  # embedding_layer = Embedding(vocab_size, 300, input_length=300)

  return embedding_layer

def build_lstm_model(embedding_layer):
  
  model = Sequential()
  model.add(embedding_layer)
  model.add(Dropout(0.5))
  # model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))
  model.add(Bidirectional(LSTM(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), input_shape=(300, 300)))
  model.add(Bidirectional(LSTM(256, dropout=0.2, recurrent_dropout=0.2)))
  model.add(Dense(1024, activation='relu'))
  model.add(Dropout(0.5))
  model.add(BatchNormalization())
  model.add(Dense(3, activation='softmax'))
  model.summary()

  return model

def train_model(model, x_train, x_test, y_train, y_test):
  model.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])
  callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]
  
  y_train = np_utils.to_categorical(y_train, num_classes=3)
  y_test = np_utils.to_categorical(y_test, num_classes=3)

  y_train = np.array(y_train)
  X_train = np.array(x_train)
  y_test = np.array(y_test)
  X_test = np.array(x_test)

  # y_train = utils.to_categorical(y_train, num_classes=3)
  # y_test = utils.to_categorical(y_test, num_classes=3)

  history = model.fit(x_train, y_train,
                      batch_size=128,
                      epochs=20,
                      validation_split=0.1,
                      verbose=1,
                      callbacks=callbacks)
  

In [8]:
# build model
tokenizer = word_to_vector(df)
embedding_layer = embedding_layer()
model = build_lstm_model(embedding_layer)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 300)          10177500  
                                                                 
 dropout (Dropout)           (None, 300, 300)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 300, 512)         1140736   
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 512)              1574912   
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1024)              525312    
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0

In [None]:
# train lstm model
train_model(model, x_train, x_test, y_train, y_test)

Epoch 1/20


In [None]:
# final test accuracy
score = model.evaluate(x_test, y_test, batch_size=800)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])


ACCURACY: 0.6126647591590881
LOSS: 1.2511096000671387
