<a href="https://colab.research.google.com/github/vyankateshgithubber/speech-analyer/blob/main/LSTMmodel_combineWithMicrophone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [4]:
train_df = pd.read_csv("/content/drive/MyDrive/Speech_Analyzer/Datasets/train.txt",sep=';')
train_df.columns = ["Sentance","Emotion"]

In [5]:
test_df = pd.read_csv("/content/drive/MyDrive/Speech_Analyzer/Datasets/test.txt",sep=';')
test_df.columns = ["Sentance","Emotion"]

In [6]:
train_length = train_df.shape[0]
test_length = test_df.shape[0]
train_length, test_length

(15999, 1999)

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
from nltk.corpus import stopwords

In [9]:
stop_words = stopwords.words("english")
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [10]:
# tokenize the sentences
def tokenize(tweets):
    stop_words = stopwords.words("english")
    tokenized_tweets = []
    for tweet in tweets:
        # split all words in the tweet
        words = tweet.split(" ")
        tokenized_string = ""
        for word in words:
            # remove @handles -> useless -> no information
            if word[0] != '@' and word not in stop_words:
                # if a hashtag, remove # -> adds no new information
                if word[0] == "#":
                    word = word[1:]
                tokenized_string += word + " "
        tokenized_tweets.append(tokenized_string)
    return tokenized_tweets

In [11]:
def encod_tweets(tweets):
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=" ", lower=True)
    tokenizer.fit_on_texts(tweets)
    return tokenizer, tokenizer.texts_to_sequences(tweets)


In [12]:
# example_str = tokenize(['This is a good day. @css #mlhlocalhost'])
# encod_str = encod_tweets(example_str)
# print(example_str)
# print(encod_str)

In [13]:
# apply padding to dataset and convert labels to bitmaps
def format_data(encoded_tweets, max_length, labels):
    x = pad_sequences(encoded_tweets, maxlen= max_length, padding='post')
    y = []
    for emoji in labels:
        bit_vec = np.zeros(20)
        bit_vec[emoji] = 1
        y.append(bit_vec)
    y = np.asarray(y)
    return x, y


In [14]:
# create weight matrix from pre trained embeddings
def create_weight_matrix(vocab, raw_embeddings):
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, 300))
    for word, idx in vocab.items():
        if word in raw_embeddings:
            weight_matrix[idx] = raw_embeddings[word]
    return weight_matrix

In [15]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.wrappers import Bidirectional
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [16]:
# final model
def final_model(vocab_size, max_length, x, y, epochs = 5):
    embedding_layer = Embedding(vocab_size, 300, input_length=max_length, trainable=True, mask_zero=True)
    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(128, dropout=0.2, return_sequences=True)))
    model.add(Bidirectional(LSTM(128, dropout=0.2)))
    model.add(Dense(20, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x, y, epochs = epochs, validation_split = 0.25)
    score, acc = model.evaluate(x_test, y_test)
    return model, score, acc

In [17]:
import math


In [18]:
tokenized_tweets = tokenize(train_df['Sentance'])
tokenized_tweets += tokenize(test_df['Sentance'])
max_length = math.ceil(sum([len(s.split(" ")) for s in tokenized_tweets])/len(tokenized_tweets))
tokenizer, encoded_tweets = encod_tweets(tokenized_tweets)
max_length, len(tokenized_tweets)

(11, 17998)

In [19]:
tokenizer_l = Tokenizer()
tokenizer_l.fit_on_texts(train_df['Emotion'])
train_label = tokenizer_l.texts_to_sequences(train_df['Emotion'])
test_label = tokenizer_l.texts_to_sequences(test_df['Emotion'])
tokenizer_l.word_index

{'anger': 3, 'fear': 4, 'joy': 1, 'love': 5, 'sadness': 2, 'surprise': 6}

In [20]:
map = tokenizer_l.word_index
map_emotion = {3:'anger', 4:'fear', 1:'joy', 5:'love', 2:'sadness', 6:'surprise'}

In [21]:
x, y = format_data(encoded_tweets[:train_length], max_length, train_label)
len(x), len(y)
x_test, y_test = format_data(encoded_tweets[train_length:], max_length, test_label)

In [22]:
voc = tokenizer.word_index
len(voc)

16035

In [23]:
model , score, acc = final_model(len(voc)+1,max_length,x,y,epochs=5)
model , score, acc

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


(<tensorflow.python.keras.engine.sequential.Sequential at 0x7f4a44efc610>,
 0.6234433650970459,
 0.8469234704971313)

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 11, 300)           4810800   
_________________________________________________________________
bidirectional (Bidirectional (None, 11, 256)           439296    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 20)                5140      
Total params: 5,649,476
Trainable params: 5,649,476
Non-trainable params: 0
_________________________________________________________________


In [25]:
y_pred = model.predict(x_test)
y_pred

array([[6.9903606e-08, 1.0389501e-04, 9.9959379e-01, ..., 3.9715776e-08,
        4.5722174e-08, 3.3977184e-08],
       [1.2938317e-06, 1.1111644e-02, 9.8491609e-01, ..., 7.0725275e-07,
        6.8154844e-07, 7.4959826e-07],
       [1.8022118e-07, 9.9987066e-01, 3.4203724e-05, ..., 1.5166553e-07,
        7.0282780e-08, 8.4273573e-08],
       ...,
       [1.5662256e-07, 9.9974436e-01, 3.2229804e-05, ..., 1.6281091e-07,
        1.2873853e-07, 7.7666776e-08],
       [5.3900021e-07, 9.9860102e-01, 2.6792820e-04, ..., 6.7539264e-07,
        3.7585244e-07, 4.7861283e-07],
       [8.2774257e-04, 3.4871742e-01, 5.9399934e-04, ..., 6.9490151e-04,
        5.4764602e-04, 6.0388312e-04]], dtype=float32)

In [26]:
for pred in y_pred:
    print(np.argmax(pred))

2
2
1
2
4
1
1
1
5
3
2
4
1
5
2
1
1
5
1
5
1
5
2
2
4
3
2
4
3
4
3
2
3
2
1
6
2
1
1
4
2
1
3
1
3
1
1
4
4
2
4
1
2
1
2
2
1
2
3
2
2
1
1
2
6
2
2
4
6
1
5
6
1
5
1
1
2
1
5
1
4
2
1
2
4
1
1
1
2
1
4
3
6
5
3
5
1
4
2
4
2
2
3
1
3
1
1
6
2
1
5
4
1
1
1
1
4
2
1
2
3
2
3
2
5
2
4
2
2
1
1
1
3
3
1
4
2
1
1
1
2
4
3
1
2
3
1
4
5
2
5
1
2
2
2
4
1
3
2
3
3
1
4
4
1
5
5
5
2
5
3
1
2
2
3
1
1
2
2
4
1
2
5
4
2
1
1
6
2
1
2
1
2
2
2
4
4
1
2
1
5
5
1
1
1
1
4
4
1
3
2
1
4
1
2
3
2
3
3
1
4
6
1
1
1
3
1
5
4
2
2
1
1
2
1
2
1
2
1
2
5
4
1
2
2
2
3
1
1
2
2
2
3
5
2
1
3
2
3
3
2
1
3
3
1
1
1
3
2
3
3
4
2
3
2
1
4
3
2
1
2
2
1
1
1
5
2
1
1
4
4
1
2
5
4
3
2
2
3
1
3
5
1
1
6
4
5
3
4
1
2
4
1
1
3
2
6
3
4
2
2
2
1
2
2
1
3
2
2
4
2
2
5
2
5
3
1
1
2
3
1
6
1
1
1
1
4
4
1
2
1
2
1
2
5
5
2
2
1
3
2
2
3
1
1
5
2
3
5
2
1
1
3
1
2
1
2
2
2
4
3
6
4
1
3
3
4
1
2
2
5
2
2
4
2
3
1
2
4
4
1
3
1
5
1
1
2
1
2
4
3
1
1
2
6
4
2
2
4
1
4
4
1
2
1
4
1
3
6
3
2
4
2
4
2
1
3
3
3
1
6
2
2
1
1
2
2
1
3
2
3
2
2
4
2
1
4
1
3
4
1
2
2
2
1
1
2
3
6
4
2
1
1
3
1
1
3
1
4
2
2
2
1
1
2
4
3
1
1
1
2
1
6
1
1
1
5
1
4
3


In [27]:
import math
from sklearn.metrics import classification_report, confusion_matrix


In [28]:
y_pred = np.array([np.argmax(pred) for pred in y_pred])
y_true = np.array(test_label)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           1       0.85      0.90      0.87       695
           2       0.92      0.85      0.88       580
           3       0.85      0.79      0.82       275
           4       0.80      0.86      0.83       224
           5       0.76      0.74      0.75       159
           6       0.65      0.71      0.68        66

    accuracy                           0.85      1999
   macro avg       0.80      0.81      0.81      1999
weighted avg       0.85      0.85      0.85      1999



In [29]:
emoji_pred = [map_emotion[pred] for pred in y_pred]
emoji_pred

['sadness',
 'sadness',
 'joy',
 'sadness',
 'fear',
 'joy',
 'joy',
 'joy',
 'love',
 'anger',
 'sadness',
 'fear',
 'joy',
 'love',
 'sadness',
 'joy',
 'joy',
 'love',
 'joy',
 'love',
 'joy',
 'love',
 'sadness',
 'sadness',
 'fear',
 'anger',
 'sadness',
 'fear',
 'anger',
 'fear',
 'anger',
 'sadness',
 'anger',
 'sadness',
 'joy',
 'surprise',
 'sadness',
 'joy',
 'joy',
 'fear',
 'sadness',
 'joy',
 'anger',
 'joy',
 'anger',
 'joy',
 'joy',
 'fear',
 'fear',
 'sadness',
 'fear',
 'joy',
 'sadness',
 'joy',
 'sadness',
 'sadness',
 'joy',
 'sadness',
 'anger',
 'sadness',
 'sadness',
 'joy',
 'joy',
 'sadness',
 'surprise',
 'sadness',
 'sadness',
 'fear',
 'surprise',
 'joy',
 'love',
 'surprise',
 'joy',
 'love',
 'joy',
 'joy',
 'sadness',
 'joy',
 'love',
 'joy',
 'fear',
 'sadness',
 'joy',
 'sadness',
 'fear',
 'joy',
 'joy',
 'joy',
 'sadness',
 'joy',
 'fear',
 'anger',
 'surprise',
 'love',
 'anger',
 'love',
 'joy',
 'fear',
 'sadness',
 'fear',
 'sadness',
 'sadness'

In [30]:
!pip uninstall ffmpeg 
!pip uninstall ffmpeg-python
! pip install SpeechRecognition
!apt install libasound2-dev portaudio19-dev
! pip install PyAudio
! pip install ffmpeg-python
!pip install librosa

Collecting SpeechRecognition
[?25l  Downloading https://files.pythonhosted.org/packages/26/e1/7f5678cd94ec1234269d23756dbdaa4c8cfaed973412f88ae8adf7893a50/SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8MB)
[K     |████████████████████████████████| 32.8MB 83kB/s 
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.8.1
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libasound2-dev is already the newest version (1.1.3-5ubuntu0.5).
The following additional packages will be installed:
  libportaudio2 libportaudiocpp0
Suggested packages:
  portaudio19-doc
The following NEW packages will be installed:
  libportaudio2 libportaudiocpp0 portaudio19-dev
0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.
Need to get 184 kB of archives.
After this operation, 891 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudio2 amd64 19.6.

In [31]:
import speech_recognition as sr
import pyaudio
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
from scipy.io.wavfile import write
import io
import ffmpeg
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };            
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {            
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data); 
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});
      
</script>
"""
def get_audio():
  display(HTML(AUDIO_HTML))
  data = eval_js("data")
  binary = b64decode(data.split(',')[1])
  
  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)
  
  riff_chunk_size = len(output) - 8
  # Break up the chunk size into four bytes, held in b.
  q = riff_chunk_size
  b = []
  for i in range(4):
      q, r = divmod(q, 256)
      b.append(r)

  # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.
  riff = output[:4] + bytes(b) + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))
  write('test.wav', sr, audio)
  return audio, sr

In [33]:
#first run this block so that it is recognized prior to the function call
def next_block(a):
  print(a)
  input = [a]
  input_token = tokenizer.texts_to_sequences(input) # tokenize the input string
  x = pad_sequences(input_token, maxlen= max_length, padding='post')
  print(x)
  output = model.predict(x) # predict the emotion of output
  print("output - ",output)
  Emotion = map_emotion[np.argmax(output)]
  print("Emotion - " , Emotion)

In [56]:
audio , sr = get_audio()

In [57]:
import time
import pyaudio
import speech_recognition as sr

In [58]:
import time
import pyaudio
import speech_recognition as sr
def callback(recognizer , audio):
    try:
        input_string=recognizer.recognize_google(audio,language="en-SG")
        #first run the next_block cell and then run this one 
        next_block(input_string) 
    except:
        print("Opps didn't catch")
r=sr.Recognizer()
m=sr.AudioFile('test.wav')
with m as source:
        r.dynamic_energy_threshold=True
        r.adjust_for_ambient_noise(source,duration=5)
        time.sleep(0.5)
stop_listening=r.listen_in_background(m,callback)
for _ in range(8):time.sleep(0.1) 
stop_listening()
for i in range(5):time.sleep(0.1)

in perfect height adjuster trade pictures images
[[ 242 1706 4894 1002 1536    0    0    0    0    0    0]]
output -  [[3.6273454e-04 9.4914556e-01 3.2390747e-03 9.9972300e-03 7.2085294e-03
  2.2914920e-02 3.0384737e-03 4.0917456e-04 2.2465982e-04 4.0611715e-04
  3.5105777e-04 3.0486353e-04 3.3224805e-04 3.6707617e-04 2.0000646e-04
  3.1290678e-04 3.2314734e-04 3.2564005e-04 2.5090834e-04 2.8575465e-04]]
Emotion -  joy
