Code adapted from https://www.kaggle.com/chiranjeevbit/movie-review-prediction

In [47]:
# Mount Google Drive into the Colab runtime at /content/gdrive. The GloVe
# vectors and the dataset files read later in this notebook are addressed
# through paths under "/content/gdrive/My Drive/".
from google.colab import drive 
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Imports and Initializations

In [48]:
import os
import random
import re

import bs4
import keras
import nltk
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import tensorflow as tf

# Fetch the NLTK resources used by the preprocessing step: the Punkt models
# behind nltk.tokenize.word_tokenize and the WordNet corpus behind
# nltk.stem.WordNetLemmatizer.
# "punkt_tab" is the tokenizer data that newer NLTK releases load in place of
# the pickled "punkt" models; requesting both keeps the notebook runnable on
# old and new NLTK versions alike.
for nltk_resource in ("punkt", "punkt_tab", "wordnet"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# --- Paths (Google Drive must already be mounted under /content/gdrive) ---
BASE_DIR = ""
GLOVE_DIR = "/content/gdrive/My Drive/glove.6B/"
TEXT_DATA_DIR = "/content/gdrive/My Drive/rt/"

# --- Text / embedding hyper-parameters ---
MAX_SEQUENCE_LENGTH = 48  # every token sequence is padded/truncated to this length
MAX_NUM_WORDS = 13738     # vocabulary cap handed to the Keras tokenizer
EMBEDDING_DIM = 300       # must match the GloVe file loaded below (glove.6B.300d)
# NOTE(review): model.fit() further down passes batch_size=256 explicitly, so
# this value is currently not used by the training cell.
batch_size = 32

# --- Reproducibility ---
# Seed every RNG the pipeline touches. NumPy's global RNG was previously left
# unseeded; scikit-learn falls back to it whenever no random_state is given.
SEED = 123
tf.random.set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Read Inputs

In [50]:
# Load the labelled training phrases (tab-separated). The path is derived from
# TEXT_DATA_DIR so that relocating the dataset only requires changing the
# constant in the configuration cell.
train = pd.read_csv(os.path.join(TEXT_DATA_DIR, "train.tsv"), sep="\t")
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [0]:

# Inputs: the phrase text, forced to str.
X = train["Phrase"].astype("str")

# Targets: integer sentiment labels expanded into a categorical (one-hot) matrix.
y = keras.utils.to_categorical(np.asarray(train["Sentiment"].astype("int")))

In [52]:

# Build a word -> vector lookup from the pre-trained GloVe text file.
# Each line holds a token followed by its space-separated float coefficients.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, "glove.6B.300d.txt")
# The GloVe vocabulary contains non-ASCII tokens and the file is UTF-8 encoded;
# state the encoding explicitly instead of relying on the platform default.
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        # Split off the token only; the remainder is the coefficient string.
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


# Preprocess

In [0]:
def clean_sentences(sent, parser="html.parser"):
    """
    Turn one raw phrase into a list of lemmatized, lower-case word tokens.

    Steps:
      1. strip HTML markup and keep only the text content
      2. replace every non-alphabetic character with a space
      3. lower-case and tokenize the text
      4. lemmatize each token

    Input:
      sent(str): text to be preprocessed
      parser(str): name of the Beautiful Soup parser to use. It is passed
        explicitly because, when omitted, Beautiful Soup picks whichever
        parser happens to be installed, so results can differ between
        environments. "html.parser" ships with Python.
    Returns:
      lemma_words(list): lemmatized words
    """
    review_text = bs4.BeautifulSoup(sent, parser).get_text()

    # Anything that is not an ASCII letter (digits, punctuation, ...) becomes
    # a space, so it acts as a token separator.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    words = nltk.tokenize.word_tokenize(review_text.lower())

    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(word) for word in words]

    return lemma_words

In [54]:
# Clean every phrase into a list of lemmatized tokens. The result gets its own
# name so the raw phrases in X stay intact and this cell can be re-run safely.
X_clean = X.apply(clean_sentences)

# Stratified 80/20 train/validation split. random_state pins the shuffle so the
# split (and therefore the fitted vocabulary and the reported metrics) is the
# same on every run; 123 is the seed value used throughout this notebook.
x_train_tokens, x_val_tokens, y_train, y_val = sklearn.model_selection.train_test_split(
    X_clean, y, test_size=0.2, stratify=y, random_state=123
)

# The vocabulary is fitted on the training portion only.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(list(x_train_tokens))

# Map tokens to integer ids, then pad/truncate every sequence to a fixed length.
x_train = keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(x_train_tokens), maxlen=MAX_SEQUENCE_LENGTH
)
x_val = keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(x_val_tokens), maxlen=MAX_SEQUENCE_LENGTH
)


  ' Beautiful Soup.' % markup)


In [0]:
# Assemble the weight matrix for the embedding layer: row i receives the GloVe
# vector of the word that the tokenizer mapped to index i.
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    if row < MAX_NUM_WORDS:
        vector = embeddings_index.get(token)
        # Tokens missing from the GloVe index keep their all-zero row.
        if vector is not None:
            embedding_matrix[row] = vector

# The layer is frozen (trainable=False) so the pre-trained vectors stay fixed
# during training.
embedding_layer = keras.layers.Embedding(
    num_words,
    EMBEDDING_DIM,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)


# Train Model

In [65]:
# Stop once validation accuracy has failed to improve by at least 0.001 for
# 2 consecutive epochs. restore_best_weights=True rolls the model back to the
# best epoch when training is stopped early; without it the model keeps the
# weights of the last epoch, and those would be used for the submission.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    min_delta=0.001,
    patience=2,
    restore_best_weights=True,
)
callback = [early_stopping]

print("Build model...")

model = keras.Sequential()
model.add(keras.layers.InputLayer(input_shape=(MAX_SEQUENCE_LENGTH,), dtype="int32"))
model.add(embedding_layer)
# Three stacked LSTMs; only the last one collapses the sequence to one vector.
model.add(keras.layers.LSTM(256, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))
model.add(keras.layers.LSTM(128, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))
model.add(keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))
model.add(keras.layers.Dense(300, activation="relu"))
model.add(keras.layers.Dropout(0.5))
# Five output units, matching the one-hot sentiment targets.
model.add(keras.layers.Dense(5, activation="softmax"))
model.compile(
    loss="categorical_crossentropy",
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)
model.summary()

print("Train...")
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=15,
    batch_size=256,
    verbose=1,
    callbacks=callback,
)

Build model...
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 48, 300)           4121400   
_________________________________________________________________
lstm_17 (LSTM)               (None, 48, 256)           570368    
_________________________________________________________________
lstm_18 (LSTM)               (None, 48, 128)           197120    
_________________________________________________________________
lstm_19 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dense_17 (Dense)             (None, 300)               19500     
_________________________________________________________________
dropout_9 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 5) 

# Preparing submission file

In [66]:
# Load the unlabelled test phrases from the dataset folder (path derived from
# TEXT_DATA_DIR rather than repeated as a literal).
test = pd.read_csv(os.path.join(TEXT_DATA_DIR, "test.tsv"), sep="\t")

# Apply the same cleaning, tokenizer and padding that were used for training.
X_test = test.Phrase.astype("str")
X_test = X_test.apply(clean_sentences)

sequences = tokenizer.texts_to_sequences(X_test.to_list())
test_data = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

  ' Beautiful Soup.' % markup)


In [0]:
y_pred=model.predict_classes(test_data)

sub_file = pd.read_csv("/content/gdrive/My Drive/rt/sampleSubmission.csv",sep=",")
sub_file.Sentiment=y_pred
sub_file.to_csv("sub_exp.csv",index=False)
!cp sub_exp.csv "/content/gdrive/My Drive/rt/submissions/"