In [101]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import pickle

In [16]:
import tensorflow as tf
from keras.models import Sequential
from keras import layers
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint

# Load dataset

In [3]:
# Load the raw CSV and keep only the text and label columns.
# .copy() detaches the two-column selection from the original frame so the
# assignment below writes to an independent DataFrame.
train = pd.read_csv("dataset/dataset.csv")
train = train[['selected_text', 'sentiment']].copy()
# Replace missing texts with a placeholder. The result is assigned back rather
# than using fillna(inplace=True) on a column selection: that chained in-place
# form warns on recent pandas and does not modify `train` under copy-on-write.
train["selected_text"] = train["selected_text"].fillna("No content")

## Prepare features

In [4]:
# Patterns are compiled once when the cell runs instead of on every call.
# Raw strings keep `\S` / `\s` as regex escapes rather than (invalid) string
# escapes, which newer Python versions warn about.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_RE = re.compile(r'\S*@\S*\s?')
_WHITESPACE_RE = re.compile(r'\s+')


def depure_data(data):
    """Strip noise from a single text before tokenization.

    Steps, applied in order:
      1. remove URLs (``http://``, ``https://`` or ``www.`` prefixed tokens);
      2. remove e-mail-like tokens (anything containing ``@``) together with
         one trailing whitespace character;
      3. collapse every run of whitespace (spaces, tabs, newlines) into a
         single space;
      4. drop single-quote characters.

    Parameters
    ----------
    data : str
        The raw text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces are not trimmed.
    """
    data = _URL_RE.sub('', data)
    data = _EMAIL_RE.sub('', data)
    # Collapses all whitespace runs, not only newlines.
    data = _WHITESPACE_RE.sub(' ', data)
    return data.replace("'", "")

In [17]:
# Pull the texts out of the DataFrame column as a plain Python list.
data_to_list = train['selected_text'].values.tolist()
# Clean each text with depure_data, keeping the original row order.
data = [depure_data(text) for text in data_to_list]

## Prepare target label

In [43]:
# Integer id for each sentiment string (inverse of `label_mapping` used at
# prediction time).
SENTIMENT_TO_ID = {'neutral': 0, 'negative': 1, 'positive': 2}

# `labels` has to be defined in this cell: with the assignment commented out the
# cell only worked when a stale `labels` was still in the kernel and raised
# NameError on a fresh Restart & Run All.
labels = np.array(train['sentiment'])

# Values that are not one of the three known sentiment strings are skipped,
# so `y` can be shorter than `labels` if the column holds anything else.
y = [SENTIMENT_TO_ID[label] for label in labels if label in SENTIMENT_TO_ID]

# NOTE(review): `y` is not one-hot encoded here, and the split cell reads
# `train.sentiment` directly rather than `y` -- confirm which encoding the
# training cells are meant to consume.

In [50]:
# Preview the first rows. The recorded output shows `sentiment` holding
# integer codes (0/1) rather than the strings handled in the label cell.
train.head()

Unnamed: 0,selected_text,sentiment
0,have responded if were going,0
1,sooo sad,1
2,bullying me,1
3,leave me alone,1
4,sons of,1


# Modelling

In [33]:
# Vocabulary cap handed to Tokenizer(num_words=...); the model cell also uses
# it as the first argument of the Embedding layer.
max_words = 5000
# Length passed to pad_sequences(maxlen=...); also the default `max_len`
# of predict().
max_len = 200

# Fit the tokenizer on the cleaned texts, convert each text to a sequence of
# integer ids, then pad the sequences with maxlen=max_len.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
tweets = pad_sequences(sequences, maxlen=max_len)

In [58]:
# Split the padded sequences and the sentiment column into train/test parts.
# random_state=0 fixes the shuffle; no test_size is given, so scikit-learn's
# default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    train.sentiment.values,
    random_state=0,
)
# Show the size of each part on one line.
print(*map(len, (X_train, X_test, y_train, y_test)))

20610 6871 20610 6871


In [59]:
# Cast both label arrays to float before they are passed to model.fit.
y_train, y_test = (split.astype("float") for split in (y_train, y_test))

## Training

In [49]:
# Embedding -> LSTM -> single sigmoid unit.
# NOTE(review): the head has one sigmoid unit and is compiled with
# categorical_crossentropy, while `label_mapping` defines three classes (0/1/2).
# A one-unit output that is rounded can only ever yield 0 or 1, so class 2 is
# unreachable. A 3-unit softmax head with sparse_categorical_crossentropy looks
# like the intent -- TODO confirm, and verify that predict() decodes
# multi-column outputs before switching.
model = Sequential([
    layers.Embedding(max_words, 20),
    layers.LSTM(15, dropout=0.5),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Model checkpoint that would keep the best model seen during training (currently disabled).
# checkpoint1 = ModelCheckpoint("best_model.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)

In [52]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 20)          100000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 15)                2160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 16        
Total params: 102,176
Trainable params: 102,176
Non-trainable params: 0
_________________________________________________________________


In [61]:
# Train for 5 epochs. The test split is passed as validation data, so the
# validation metrics are computed on the same data referred to as "test".
# The checkpoint callback is disabled (its definition is commented out in the
# model cell).
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
#     callbacks=[checkpoint1]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Save model & tokenizer

In [67]:
# Save model
# `save_path` is reused by later cells, which write and read
# f"{save_path}/tokenizer.pkl" and pass `save_path` to load_model -- so it is
# expected to end up as a directory.
# NOTE(review): the recorded output below mentions "model/sentiment_model",
# which does not match this path; the output looks stale -- re-run to confirm.
save_path = "../model/text_model"
model.save(save_path)
print("successfully saved!")



INFO:tensorflow:Assets written to: model/sentiment_model/assets


INFO:tensorflow:Assets written to: model/sentiment_model/assets


successfully saved!


In [102]:
# Persist the fitted tokenizer next to the saved model so that inference can
# reuse exactly the same vocabulary.
with open(f'{save_path}/tokenizer.pkl', 'wb') as pickle_fh:
    pickle.dump(
        tokenizer,
        pickle_fh,
        protocol=pickle.HIGHEST_PROTOCOL,
    )
print('Tokenizer saved')

Tokenizer saved


## Test prediction

In [76]:
# Encode one sample sentence the same way as the training texts:
# tokenize, then pad with maxlen=max_len.
# Other sample sentences tried: "I like this the most", "Oh God! It's really worst!"
test_text = pad_sequences(
    tokenizer.texts_to_sequences(["nothing to worry about"]),
    maxlen=max_len,
)

In [89]:
# Class id -> display name; mirrors the encoding used in the label cell
# ('neutral' -> 0, 'negative' -> 1, 'positive' -> 2). Default `mapping` of predict().
label_mapping = {0: "Neutral", 1: "Negative", 2: "Positive"}

In [98]:
def predict(list_text, model, tokenizer, max_len=max_len, mapping=label_mapping):
    """Classify raw texts and return ``(class_ids, class_names)``.

    Parameters
    ----------
    list_text : iterable of str, or a single str
        Texts to classify. A bare string is treated as a one-element batch.
    model :
        Object exposing ``predict(sequences)`` that returns per-text scores.
    tokenizer :
        Fitted tokenizer exposing ``texts_to_sequences``.
    max_len : int
        Length passed to ``pad_sequences(maxlen=...)``.
    mapping : dict
        Class id -> display name.

    Returns
    -------
    (list of int, list of str)
        Predicted class ids and their names, one entry per input text.
        Two empty lists are returned for empty input.

    Raises
    ------
    KeyError
        If a predicted class id is missing from ``mapping``.
    """
    # A bare string would otherwise be tokenized character by character.
    if isinstance(list_text, str):
        list_text = [list_text]
    list_text = list(list_text)
    if not list_text:
        return [], []

    sequence = tokenizer.texts_to_sequences(list_text)
    sequence = pad_sequences(sequence, maxlen=max_len)
    scores = np.asarray(model.predict(sequence))

    if scores.ndim > 1 and scores.shape[-1] > 1:
        # One score per class: pick the highest-scoring class.
        class_ids = scores.argmax(axis=-1)
    else:
        # Single score per text: round it to the nearest integer id.
        # reshape(-1) (instead of squeeze) keeps a one-text batch as a
        # 1-element array, so the result is always a list.
        class_ids = np.round(scores, decimals=0).reshape(-1)

    preds = class_ids.astype("int").tolist()
    labels = [mapping[pred] for pred in preds]
    return preds, labels

In [99]:
# Two sample sentences (also reused for the reloaded model further down).
# NOTE(review): the recorded output labels "that was awesome!" as Negative --
# worth re-checking the model head / loss configuration.
tes = ["oh man! I hate it!", "that was awesome!"]
predict(tes, model, tokenizer)

([0, 1], ['Neutral', 'Negative'])

# Load model & tokenizer

In [69]:
# Reload the model from the same path it was saved to above.
model_load = tf.keras.models.load_model(save_path)

In [70]:
# Print the reloaded model's architecture for comparison with the summary of
# the in-memory model shown earlier.
model_load.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 20)          100000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 15)                2160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 16        
Total params: 102,176
Trainable params: 102,176
Non-trainable params: 0
_________________________________________________________________


In [103]:
# Restore the tokenizer that was pickled next to the saved model.
# Caution: pickle.load executes code embedded in the file -- only load
# tokenizer files produced by this notebook or another trusted source.
with open(f"{save_path}/tokenizer.pkl", "rb") as pickle_fh:
    load_tokenizer = pickle.load(pickle_fh)

In [104]:
# Round-trip check: the reloaded model and tokenizer are run on the same
# sample sentences as the in-memory ones above.
predict(tes, model_load, load_tokenizer)

([0, 1], ['Neutral', 'Negative'])