In [None]:
# Pin TensorFlow to 1.12.0; the later cells import the Keras API from `tensorflow.keras`.
# NOTE(review): `!pip` runs in a subshell and may install into a different
# environment than the running kernel — consider `%pip install` if the
# installed IPython version supports it.
!pip install tensorflow==1.12.0

In [None]:
import pandas as pd
import numpy as np

In [None]:
import pandas as pd
# Layout of the Sentiment140 CSV: column names, text encoding and location.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
DATASET_FILE_NAME = (
    "../input/sentiment140/"
    "training.1600000.processed.noemoticon.csv"
)

# Column names are passed explicitly, so pandas treats every row as data.
df = pd.read_csv(DATASET_FILE_NAME, names=DATASET_COLUMNS, encoding=DATASET_ENCODING)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Number of rows per distinct value of the `target` column.
df["target"].value_counts()

In [None]:
import re
from nltk.corpus import stopwords

# English stop words dropped by `preprocess`. Stored as a set so the per-token
# membership test is a hash lookup rather than a scan of the whole list.
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Clean one piece of text before tokenization.

    The text is lower-cased, then every ``@mention``, every ``http:``/``https:``
    link and every run of non-alphanumeric characters is replaced by a space.
    The remaining whitespace-separated tokens are filtered against the
    module-level ``stop_words`` and joined back with single spaces.

    Args:
        text: The raw text (a ``str``).

    Returns:
        The cleaned text; an empty string when no token survives.
    """
    text = text.lower()
    # NOTE(review): the third alternative, `http?:\S`, only consumes a single
    # character after the colon and looks like a typo for `\S+`; it is kept
    # unchanged here so the cleaned output stays identical — confirm intent.
    text = re.sub(r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", " ", text)
    # No explicit strip() is needed: split() already ignores leading and
    # trailing whitespace and collapses repeated spaces.
    return " ".join(
        token for token in text.split() if token not in stop_words)

# Replace the raw text column with its cleaned version, row by row.
df["text"] = df["text"].apply(preprocess)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state pins the split.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
import gensim

W2V_SIZE = 300

# One token list per cleaned training text.
sentences = [tweet.split() for tweet in df_train.text]

# Create an empty Word2Vec model, build its vocabulary from the training
# sentences, then train it on those same sentences.
w2v_model = gensim.models.word2vec.Word2Vec(
    size=W2V_SIZE, window=7, min_count=10, workers=4)
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=len(sentences), epochs=32)

In [None]:
# Sanity check of the trained vectors: nearest neighbours of "happy".
w2v_model.wv.most_similar("happy", topn=10)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word -> integer index from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train["text"])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

SEQUENCE_LENGTH = 300


def _encode_texts(texts):
    """Convert texts to integer sequences padded to SEQUENCE_LENGTH."""
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=SEQUENCE_LENGTH)


x_train = _encode_texts(df_train.text)
x_test = _encode_texts(df_test.text)

In [None]:
# Binarize the target column: 0 stays 0, every other value becomes 1.
# The comparison is vectorized instead of mapping a Python lambda over each
# row, and the result is reshaped into a column vector of shape (n_rows, 1).
y_train = (df_train.target.values != 0).astype(int).reshape(-1, 1)
y_test = (df_test.target.values != 0).astype(int).reshape(-1, 1)

In [None]:
# Report the shapes of the encoded features and labels for both splits.
for _name, _array in (
        ("x_train", x_train),
        ("y_train", y_train),
        ("x_test", x_test),
        ("y_test", y_test)):
    print(_name, _array.shape)

In [None]:
from tensorflow.keras.layers import Embedding

# Row `index` of the matrix holds the Word2Vec vector of the word that the
# tokenizer maps to `index`; words unknown to Word2Vec keep an all-zero row.
# The matrix has one more row than there are indexed words (presumably for
# index 0, which the tokenizer does not assign to any word — confirm).
vocab_size = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros(shape=(vocab_size, W2V_SIZE))

for word, index in tokenizer.word_index.items():
    if word not in w2v_model.wv:
        continue
    embedding_matrix[index] = w2v_model.wv[word]

print(embedding_matrix.shape)

# Embedding layer initialised with the Word2Vec vectors and kept frozen.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

In [None]:
from tensorflow.keras.layers import Dropout, LSTM, Dense, Input
from tensorflow.keras.models import Model

# Network: padded token ids -> frozen embeddings -> dropout -> LSTM ->
# single sigmoid unit.
inputs = Input(shape=(SEQUENCE_LENGTH,))
embedded = embedding_layer(inputs)
dropout = Dropout(rate=0.5)(embedded)
lstm = LSTM(
    units=100,
    dropout=0.2,
    recurrent_dropout=0.2,
)(dropout)
outputs = Dense(units=1, activation="sigmoid")(lstm)

model = Model(inputs=inputs, outputs=outputs)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model

# Render the model graph to an image file.
plot_model(model=model, to_file="sa_model.png")

In [None]:
# Binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# First training run: 3 epochs, with 10% of the training data used for validation.
model.fit(
    x=x_train,
    y=y_train,
    epochs=3,
    batch_size=256,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Persist the model to an HDF5 file.
# NOTE(review): this runs before the later `model.fit(..., callbacks=callbacks)`
# cell, so the saved file does not include that second training run — confirm
# this is intended.
model.save(filepath="sa_model.h5")

In [None]:
from tensorflow.keras.callbacks import (
                                ReduceLROnPlateau,
                                EarlyStopping)

# Learning-rate reduction and early stopping, both driven by validation metrics.
# NOTE(review): both callbacks use patience=5, but the fit call that receives
# them trains for only 3 epochs, so neither can trigger as written — confirm
# the intended patience / epoch count.
lr_on_plateau = ReduceLROnPlateau(monitor="val_loss", patience=5, cooldown=0)
early_stopping = EarlyStopping(monitor="val_acc", min_delta=1e-4, patience=5)

callbacks = [lr_on_plateau, early_stopping]

In [None]:
# Second training run on the same model, this time with the callbacks attached.
model.fit(
    x=x_train,
    y=y_train,
    epochs=3,
    batch_size=256,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
def decode_sentiment(score, negative_max=0.4, positive_min=0.7):
    """Translate a numeric score into a sentiment label.

    Args:
        score: The score to classify.
        negative_max: Scores less than or equal to this are "NEGATIVE".
        positive_min: Scores greater than or equal to this are "POSITIVE".

    Returns:
        "NEGATIVE", "POSITIVE", or "NEUTRAL" for anything in between.
        The negative check runs first, so it wins if the two ranges overlap.
    """
    if score <= negative_max:
        return "NEGATIVE"
    if score >= positive_min:
        return "POSITIVE"
    return "NEUTRAL"

def predict(text):
    """Score a single piece of text with the trained model.

    Args:
        text: The raw text to classify.

    Returns:
        A dict with the sentiment "label" (from `decode_sentiment`) and the
        model "score" as a Python float.
    """
    # Apply the same cleaning that was applied to the training texts, so that
    # mentions, links and punctuation are handled identically at inference.
    cleaned = preprocess(text)
    padded = pad_sequences(
        tokenizer.texts_to_sequences([cleaned]),
        maxlen=SEQUENCE_LENGTH)
    # One input row and a single output unit: take the only value as a scalar.
    score = float(model.predict(padded)[0][0])
    return {"label": decode_sentiment(score), "score": score}

In [None]:
# Try the model on two sample sentences.
for _sample in (
        "We’re going to have Tom’s birthday party today, so join us!",
        "It’s going to be too late to return home."):
    print(predict(_sample))