<a href="https://colab.research.google.com/github/vyankateshgithubber/speech-analyer/blob/main/LSTMmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd

In [3]:
train_df = pd.read_csv("/content/drive/MyDrive/Speech_Analyzer/Datasets/train.txt",sep=';')
train_df.columns = ["Sentance","Emotion"]

In [4]:
test_df = pd.read_csv("/content/drive/MyDrive/Speech_Analyzer/Datasets/test.txt",sep=';')
test_df.columns = ["Sentance","Emotion"]

In [5]:
train_length = train_df.shape[0]
test_length = test_df.shape[0]
train_length, test_length

(15999, 1999)

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
from nltk.corpus import stopwords

In [8]:
stop_words = stopwords.words("english")
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [9]:
# tokenize the sentences
def tokenize(tweets):
    stop_words = stopwords.words("english")
    tokenized_tweets = []
    for tweet in tweets:
        # split all words in the tweet
        words = tweet.split(" ")
        tokenized_string = ""
        for word in words:
            # remove @handles -> useless -> no information
            if word[0] != '@' and word not in stop_words:
                # if a hashtag, remove # -> adds no new information
                if word[0] == "#":
                    word = word[1:]
                tokenized_string += word + " "
        tokenized_tweets.append(tokenized_string)
    return tokenized_tweets

In [10]:
def encod_tweets(tweets):
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=" ", lower=True)
    tokenizer.fit_on_texts(tweets)
    return tokenizer, tokenizer.texts_to_sequences(tweets)


In [11]:
# example_str = tokenize(['This is a good day. @css #mlhlocalhost'])
# encod_str = encod_tweets(example_str)
# print(example_str)
# print(encod_str)

In [12]:
# apply padding to dataset and convert labels to bitmaps
def format_data(encoded_tweets, max_length, labels):
    x = pad_sequences(encoded_tweets, maxlen= max_length, padding='post')
    y = []
    for emoji in labels:
        bit_vec = np.zeros(20)
        bit_vec[emoji] = 1
        y.append(bit_vec)
    y = np.asarray(y)
    return x, y


In [13]:
# create weight matrix from pre trained embeddings
def create_weight_matrix(vocab, raw_embeddings):
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, 300))
    for word, idx in vocab.items():
        if word in raw_embeddings:
            weight_matrix[idx] = raw_embeddings[word]
    return weight_matrix

In [14]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.wrappers import Bidirectional
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [15]:
# final model
def final_model(vocab_size, max_length, x, y, epochs = 5):
    embedding_layer = Embedding(vocab_size, 300, input_length=max_length, trainable=True, mask_zero=True)
    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(128, dropout=0.2, return_sequences=True)))
    model.add(Bidirectional(LSTM(128, dropout=0.2)))
    model.add(Dense(20, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x, y, epochs = epochs, validation_split = 0.25)
    score, acc = model.evaluate(x_test, y_test)
    return model, score, acc

In [16]:
import math


In [17]:
tokenized_tweets = tokenize(train_df['Sentance'])
tokenized_tweets += tokenize(test_df['Sentance'])
max_length = math.ceil(sum([len(s.split(" ")) for s in tokenized_tweets])/len(tokenized_tweets))
tokenizer, encoded_tweets = encod_tweets(tokenized_tweets)
max_length, len(tokenized_tweets)

(11, 17998)

In [18]:
tokenizer_l = Tokenizer()
tokenizer_l.fit_on_texts(train_df['Emotion'])
train_label = tokenizer_l.texts_to_sequences(train_df['Emotion'])
test_label = tokenizer_l.texts_to_sequences(test_df['Emotion'])
tokenizer_l.word_index

{'anger': 3, 'fear': 4, 'joy': 1, 'love': 5, 'sadness': 2, 'surprise': 6}

In [29]:
map = tokenizer_l.word_index
map_emotion = {3:'anger', 4:'fear', 1:'joy', 5:'love', 2:'sadness', 6:'surprise'}

In [30]:
x, y = format_data(encoded_tweets[:train_length], max_length, train_label)
len(x), len(y)
x_test, y_test = format_data(encoded_tweets[train_length:], max_length, test_label)

In [21]:
voc = tokenizer.word_index
len(voc)

16035

In [22]:
model , score, acc = final_model(len(voc)+1,max_length,x,y,epochs=5)
model , score, acc

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


(<tensorflow.python.keras.engine.sequential.Sequential at 0x7fb6230e1a10>,
 0.6522490978240967,
 0.8424212336540222)

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 11, 300)           4810800   
_________________________________________________________________
bidirectional (Bidirectional (None, 11, 256)           439296    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 20)                5140      
Total params: 5,649,476
Trainable params: 5,649,476
Non-trainable params: 0
_________________________________________________________________


In [32]:
y_pred = model.predict(x_test)
y_pred

array([[1.8288337e-07, 6.4711159e-05, 9.9938071e-01, ..., 1.1355400e-07,
        2.3783164e-07, 7.8929176e-08],
       [2.0514122e-07, 3.8135855e-04, 9.9903667e-01, ..., 9.5332950e-08,
        2.1583519e-07, 8.9108553e-08],
       [3.3801484e-07, 9.9756444e-01, 2.1569503e-03, ..., 1.2828789e-07,
        8.4632228e-07, 2.4098131e-07],
       ...,
       [1.1007466e-07, 9.9984848e-01, 1.8300494e-05, ..., 7.2115505e-08,
        1.3519828e-07, 1.6303734e-07],
       [3.4078397e-07, 9.9960643e-01, 1.2279244e-04, ..., 2.3561444e-07,
        4.0966560e-07, 3.7126190e-07],
       [1.0266227e-02, 2.9929774e-02, 9.2612102e-04, ..., 9.4015952e-03,
        1.0751196e-02, 7.6636355e-03]], dtype=float32)

In [33]:
for pred in y_pred:
    print(np.argmax(pred))

2
2
1
2
4
1
1
1
2
3
2
2
1
5
2
1
2
3
1
5
1
5
2
2
4
3
2
4
3
4
3
2
3
2
1
6
2
1
1
2
2
1
3
1
3
1
1
4
4
2
4
1
2
1
2
2
1
2
3
2
2
1
1
2
6
2
2
4
6
1
5
6
1
5
1
1
2
1
5
1
3
2
1
2
2
1
1
1
2
1
4
3
4
3
3
5
1
4
2
4
2
2
4
3
6
1
1
4
2
1
1
4
1
1
1
1
4
2
2
2
3
5
3
2
5
2
4
2
2
1
1
1
3
3
1
4
2
1
1
1
2
4
1
1
2
5
1
4
1
2
5
1
2
2
2
2
1
3
2
3
3
1
4
4
1
5
5
5
2
5
3
1
2
2
3
1
1
2
2
4
1
2
5
4
2
1
1
2
3
1
2
1
2
2
2
4
4
1
2
4
5
5
1
1
1
2
4
4
1
3
2
2
4
1
2
3
2
3
3
1
4
6
1
1
1
3
1
5
4
2
2
1
1
2
1
2
1
2
2
2
5
4
1
2
2
2
3
1
1
2
2
2
1
5
2
1
3
2
3
3
1
1
3
3
1
1
1
3
1
3
3
4
2
3
2
1
4
3
2
1
2
2
1
1
1
5
2
1
1
4
4
1
2
5
4
3
2
2
3
1
3
5
1
1
6
3
5
3
4
1
2
4
1
3
3
3
6
3
6
2
2
2
1
2
2
1
3
2
2
4
4
2
5
2
1
2
1
1
2
3
1
6
2
1
1
1
4
4
1
2
1
2
1
1
5
5
2
2
1
3
2
2
3
1
1
5
2
1
5
2
1
1
3
1
1
5
2
2
2
4
3
6
4
1
3
3
4
1
3
2
5
2
2
4
2
3
5
2
4
4
1
3
5
5
1
1
2
1
2
4
3
1
1
2
6
4
2
2
4
1
4
4
1
2
1
4
1
3
6
3
2
4
2
4
2
1
3
3
2
3
6
2
2
1
1
2
2
1
3
2
3
2
2
4
2
1
6
1
3
1
1
2
2
2
1
1
2
1
6
4
2
1
1
3
1
1
3
1
4
2
2
2
1
1
2
4
3
1
1
2
2
1
6
1
1
5
1
1
4
3


In [34]:
import math
from sklearn.metrics import classification_report, confusion_matrix


In [35]:
y_pred = np.array([np.argmax(pred) for pred in y_pred])
y_true = np.array(test_label)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           1       0.85      0.88      0.87       695
           2       0.88      0.88      0.88       580
           3       0.81      0.81      0.81       275
           4       0.88      0.79      0.84       224
           5       0.73      0.73      0.73       159
           6       0.66      0.68      0.67        66

    accuracy                           0.84      1999
   macro avg       0.80      0.80      0.80      1999
weighted avg       0.84      0.84      0.84      1999



In [37]:
emoji_pred = [map_emotion[pred] for pred in y_pred]
emoji_pred

['sadness',
 'sadness',
 'joy',
 'sadness',
 'fear',
 'joy',
 'joy',
 'joy',
 'sadness',
 'anger',
 'sadness',
 'sadness',
 'joy',
 'love',
 'sadness',
 'joy',
 'sadness',
 'anger',
 'joy',
 'love',
 'joy',
 'love',
 'sadness',
 'sadness',
 'fear',
 'anger',
 'sadness',
 'fear',
 'anger',
 'fear',
 'anger',
 'sadness',
 'anger',
 'sadness',
 'joy',
 'surprise',
 'sadness',
 'joy',
 'joy',
 'sadness',
 'sadness',
 'joy',
 'anger',
 'joy',
 'anger',
 'joy',
 'joy',
 'fear',
 'fear',
 'sadness',
 'fear',
 'joy',
 'sadness',
 'joy',
 'sadness',
 'sadness',
 'joy',
 'sadness',
 'anger',
 'sadness',
 'sadness',
 'joy',
 'joy',
 'sadness',
 'surprise',
 'sadness',
 'sadness',
 'fear',
 'surprise',
 'joy',
 'love',
 'surprise',
 'joy',
 'love',
 'joy',
 'joy',
 'sadness',
 'joy',
 'love',
 'joy',
 'anger',
 'sadness',
 'joy',
 'sadness',
 'sadness',
 'joy',
 'joy',
 'joy',
 'sadness',
 'joy',
 'fear',
 'anger',
 'fear',
 'anger',
 'anger',
 'love',
 'joy',
 'fear',
 'sadness',
 'fear',
 'sadne