In [2]:
import numpy as np
import pandas as pd 
import tensorflow as tf
import nltk
from nltk.stem.porter import PorterStemmer
import re
import os

from tensorflow.keras.layers import Embedding, Dense, GRU, LSTM, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import time
# Start from a clean slate: delete the checkpoint left behind by a previous run.
# Any OS-level failure (typically "file does not exist") is deliberately ignored.
try:
    os.remove('/content/drive/MyDrive/final_project/weights_LSTM.h5')
except OSError:
    pass

start = time.time()  # reference point for the "Process time" report printed later

# Read the dataset and discard every ROW that contains a missing value.
df = pd.read_csv('/content/drive/MyDrive/final_project/Emotion_project.csv').dropna()
y = df['Emotion']  # output: the emotion label of each row
X = df.drop(columns=['Emotion'])  # input: every remaining column

vocab_size = 10000  # size of the hashing space shared by one_hot and the Embedding layer
messages = X.copy()  # work on a copy of the input features, not on X itself
messages.reset_index(inplace=True)  # renumber rows from 0 (rows were dropped upstream)

nltk.download('stopwords')
# Stopwords are very frequent words ('the', 'and', 'I', ...) that add little meaning.
# The negations 'not', 'no' and 'nor' are kept on purpose: they can flip the emotion
# of a sentence. The list is loaded a single time into a set so that the membership
# test inside the loop is O(1) and the corpus file is not re-read for every token.
stop_words = set(stopwords.words('english')) - {'not', 'no', 'nor'}
ps = PorterStemmer()
corpus = []
for message in messages['Text']:
    review = re.sub('[^a-zA-Z]', ' ', message)  # replace every non-letter with a space
    review = review.lower().split()  # lower-case, then tokenize on whitespace
    review = [ps.stem(word) for word in review if word not in stop_words]  # drop stopwords, stem the rest
    review = ' '.join(review)
    corpus.append(review)

# Hash every token of each cleaned document to an integer index (hashing trick).
# NOTE(review): the Keras docs describe one_hot's default hash as not stable across
# runs -- confirm before reusing saved weights in a fresh Python session.
onehot_repr = [one_hot(document, vocab_size) for document in corpus]

# The longest document, counted in space-separated tokens, fixes the sequence length.
maxlength = max((len(document.split(' ')) for document in corpus), default=0)

# Pad at the front ('pre') so that every index sequence has exactly maxlength entries.
embedded_docs = pad_sequences(onehot_repr, maxlen=maxlength, padding='pre')

# Features: the padded index matrix. Targets: emotion names encoded as integer class ids.
X_final = np.array(embedded_docs)
label_encoder = preprocessing.LabelEncoder()
y_final = np.array(label_encoder.fit_transform(y))

# Hold out 30% of the samples for testing ...
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.3, random_state=42
)
# ... then set aside 10% of the remaining training samples for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=21
)

# Model definition: Embedding -> LSTM -> Dense classifier, with dropout between stages.
embedding_vector_features = 100  # width of each learned word vector
model = Sequential([
    # Dimensionality reduction: turns the "one hot representation" of a word into a
    # "distributed representation" that can express relations between words.
    Embedding(vocab_size, embedding_vector_features, input_length=maxlength),
    Dropout(0.3),  # randomly sets input units to 0 -> helps prevent overfitting
    LSTM(64),  # 64: dimensionality of the output space
    Dropout(0.3),
    # output = activation(dot(input, kernel) + bias), with an L1 penalty on the kernel
    Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l1(0.01)),
    Dropout(0.3),
    Dense(5, activation='softmax'),  # 5 categories, one probability each
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

checkpoint_path = '/content/drive/MyDrive/final_project/weights_LSTM.h5'

# Checkpoint callback: after each epoch the weights are written to disk only when
# val_loss improved ('min' mode), so the file holds the best weights seen so far.
model_save = ModelCheckpoint(checkpoint_path, save_best_only = True, save_weights_only = True, monitor = 'val_loss', mode = 'min', verbose = 1)

# Training process; `history` records epoch, loss, accuracy, ...
# batch_size : number of samples per gradient update (how many samples are fed in at once)
# epochs     : number of iterations (1 iteration = one pass over the whole training data)
# validation : monitored to check whether the model is overfitting
history = model.fit(X_train,y_train, validation_data=(X_val,y_val), epochs = 10, batch_size = 256, callbacks = [model_save])

# Restore the best checkpoint BEFORE saving the full model. Saving first would
# overwrite the best weights with those of the last epoch, and the subsequent
# load would silently bring back the last epoch instead of the best one.
# The existence check covers the case where no checkpoint was ever written.
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)
model.save(checkpoint_path)  # file now holds the full model with the best weights

# Report the wall-clock time elapsed since `start` was taken.
end = time.time()
print("Process time: ",end - start)

# Predicted class of each test sample = index of its largest softmax probability.
test_probabilities = model.predict(X_test)
y_pred = test_probabilities.argmax(axis=-1)
print("LSTM Accuracy: ", accuracy_score(y_test,y_pred))  # fraction of exact matches

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 35, 100)           1000000   
                                                                 
 dropout (Dropout)           (None, 35, 100)           0         
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
           

In [4]:
def predict_emotion(stri):
    """Print the class probabilities and the predicted emotion label for one text.

    The text is cleaned the same way as the training corpus (non-letters replaced
    by spaces, lower-cased, stopwords removed except the negations 'not', 'no'
    and 'nor', Porter stemming), hashed with one_hot, front-padded to maxlength
    and passed to the trained model.

    Relies on notebook globals: ps, vocab_size, maxlength, model, label_encoder.

    Args:
        stri: raw input text.

    Returns:
        None. The probability array and the decoded label are printed.
    """
    # Load the stopword list once per call into a set (minus the negations that
    # must be kept) instead of re-reading it for every single token.
    stop_words = set(stopwords.words('english')) - {'not', 'no', 'nor'}

    review = re.sub('[^a-zA-Z]', ' ', stri)  # replace every non-letter with a space
    tokens = review.lower().split()
    review = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)

    onehot_repr = [one_hot(review, vocab_size)]  # batch containing a single document
    embed = pad_sequences(onehot_repr, padding='pre', maxlen=maxlength)
    predicti = model.predict(embed)
    print(predicti)
    print("Label: ",label_encoder.classes_[np.argmax(predicti)])

# Read one line of text from the user and print the model's predicted emotion for it.
text = input("Input any text: ")
predict_emotion(text)

Input any text: i love this world, wonderful
[[0.10042729 0.07277312 0.65159464 0.15637462 0.01883036]]
Label:  happy
