# Implementing an LSTM to classify audio

In [1]:
!python --version

Python 3.10.12


In [2]:
!pip install torchtext

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 

In [3]:
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import numpy as np

SEED = 42
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

from google.colab import drive
drive.mount('/content/drive')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.manual_seed(42)

Mounted at /content/drive


<torch._C.Generator at 0x796acc752050>

In [4]:
data = pd.read_csv("/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/emot140.csv")
data = data.drop('Unnamed: 0', axis = 1)
data.head()

Unnamed: 0,emote,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [45]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

texties = []
labels = []
dictionary = {}
size = 0

num_zeros = 0
num_twos = 0
num_fours = 0

def preprocess_text(text: str) -> str:
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove special chars and numbers
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    # 1. tokenize
    tokens = nltk.word_tokenize(text)
    # 2. check if stopword
    tokens = [w.lower() for w in tokens if not w in stopwords.words("english")]
    return tokens

for index, row in data.iterrows():
  # checking if we can append to the end of the list
  coolio = False
  if (num_zeros == 50000 and num_twos == 50000 and num_fours == 50000):
    break
  if ((row['emote'] == 0 and num_zeros < 50000) or (row['emote'] == 2 and num_twos < 50000) or (row['emote'] == 4 and num_fours < 50000)):
    coolio = True
  if not coolio:
    continue

  # processing the damn thing
  texts = preprocess_text(row['text'])
  words = []
  for i in range(len(texts)):
    if texts[i] in dictionary:
      words.append(dictionary[texts[i]])
    else:
      size += 1
      dictionary[texts[i]] = size
      words.append(size)
  if len(words) == 0:
    continue

  # updating the counts
  num_zeros += (row['emote'] == 0)
  num_twos += (row['emote'] == 2)
  num_fours += (row['emote'] == 4)

  # appends
  texties.append(torch.tensor(words, dtype = torch.int32))
  labels.append(torch.tensor([row['emote']], dtype = torch.int32))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Exception ignored in: <function SeekableUnicodeStreamReader.__del__ at 0x7969a35d2b00>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/nltk/data.py", line 1160, in __del__
    if not self.closed:
  File "/usr/local/lib/python3.10/dist-packages/nltk/data.py", line 1180, in closed
    return self.stream.closed
AttributeError: 'SeekableUnicodeStreamReader' object has no attribute 'stream'


In [46]:
text = torch.nn.utils.rnn.pad_sequence(texties, batch_first = True)
label = torch.stack(labels)
tdset = torch.utils.data.TensorDataset(text, label)

In [54]:
MXLEN = 0
for i in range(100000):
  MXLEN = max(MXLEN, texties[i].size(0))
print(MXLEN)

36


In [55]:
import torch
import torch.nn as nn
import torch.optim as optim

class LSTM_Classifier(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
    super(LSTM_Classifier, self).__init__()
    # Embedding layer converts integer sequences to vector sequences
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    # LSTM layer process the vector sequences
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional = bidirectional, dropout = dropout, batch_first = True)
    #                     input           output                             bisexual?                      dropout_prob      whether or not batch_size comes first in the tensor.size()
    # Dense layer to predict
    self.fc = nn.Linear(hidden_dim * 2, output_dim)
    # Prediction activation function
    self.sigmoid = nn.Sigmoid()

  def forward(self, text, text_lengths):
    embedded = self.embedding(text) # embedded version
    # Thanks to packing, LSTM don't see padding tokens
    # and this makes our model better
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted = False)
    packed_output, (hidden_state, cell_state) = self.lstm(packed_embedded)
    # Concatenating the final forward and backward hidden states
    hidden = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)
    dense_outputs = self.fc(hidden)
    #Final activation function
    outputs = self.sigmoid(dense_outputs)
    return outputs


In [56]:
from torch.utils.data import DataLoader
print(len(tdset))
train_len = (int)(0.900 * len(tdset))
train_dataset, test_dataset = torch.utils.data.random_split(tdset, [train_len, len(tdset) - train_len])
print(train_len, len(tdset) - train_len)

b_size = 50

train_loader = DataLoader(train_dataset, batch_size = b_size, shuffle = True, pin_memory = True, num_workers = 2)
test_loader = DataLoader(test_dataset, batch_size = b_size, shuffle = False, pin_memory = True, num_workers = 2)
print(len(train_dataset))
print(len(test_dataset))

100000
90000 10000
90000
10000


# Training Loop

In [57]:
model = LSTM_Classifier(vocab_size = len(dictionary) + 1,
                        embedding_dim = 100,
                        hidden_dim = 64,
                        output_dim = 5,
                        n_layers = 5,
                        bidirectional = True,
                        dropout = 0.5).to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.001)
loss_fn = nn.BCELoss().to(device)
epochs = 15

for i in range(epochs):
  model.train()
  train_loss = 0.00
  correct = 0.00
  total = 0.00
  for tweet, emotion in train_loader:
    tweet = tweet.cuda()
    emotion = emotion.cuda()
    text_lengths = []
    for j in range(b_size):
      goddamn = False
      for k in range(MXLEN):
        if tweet[j][k] == 0:
          text_lengths.append(k)
          goddamn = True
          break
      if goddamn == False:
        text_lengths.append(MXLEN)
    emotion_ = torch.nn.functional.one_hot(emotion.to(torch.int64), 5).to(torch.float)
    emotion_ = emotion_.squeeze(1)
    optimizer.zero_grad()
    y_pred = model(tweet, text_lengths)
    loss = loss_fn(y_pred, emotion_)
    loss.backward()
    optimizer.step()
    train_loss += loss.item()/len(train_loader)
    prediction = y_pred.argmax(dim=1)
    emotion = emotion.squeeze(1)
    correct += prediction.eq(emotion).sum().item()
    total += emotion.size(0)
  print(correct, total)
  print(f"Epoch: {i+1}/{epochs}, Training Loss: {train_loss:.4f}, Training Accuracy: {correct/total:.4f}")

45081.0 90000.0
Epoch: 1/15, Training Loss: 0.2814, Training Accuracy: 0.5009
61317.0 90000.0
Epoch: 2/15, Training Loss: 0.2328, Training Accuracy: 0.6813
70650.0 90000.0
Epoch: 3/15, Training Loss: 0.1843, Training Accuracy: 0.7850
74253.0 90000.0
Epoch: 4/15, Training Loss: 0.1581, Training Accuracy: 0.8250
77581.0 90000.0
Epoch: 5/15, Training Loss: 0.1319, Training Accuracy: 0.8620
80452.0 90000.0
Epoch: 6/15, Training Loss: 0.1057, Training Accuracy: 0.8939
82891.0 90000.0
Epoch: 7/15, Training Loss: 0.0826, Training Accuracy: 0.9210
84664.0 90000.0
Epoch: 8/15, Training Loss: 0.0646, Training Accuracy: 0.9407
85993.0 90000.0
Epoch: 9/15, Training Loss: 0.0503, Training Accuracy: 0.9555
86810.0 90000.0
Epoch: 10/15, Training Loss: 0.0409, Training Accuracy: 0.9646
87407.0 90000.0
Epoch: 11/15, Training Loss: 0.0343, Training Accuracy: 0.9712


KeyboardInterrupt: 

# Validation Loop

In [59]:
model.eval()
val_loss = 0.00
correct = 0.00
total = 0.00

for tweet, emotion in test_loader:
  tweet = tweet.cuda()
  emotion = emotion.cuda()
  text_lengths = []
  for j in range(b_size):
    goddamn = False
    for k in range(MXLEN):
      if tweet[j][k] == 0:
        text_lengths.append(k)
        goddamn = True
        break
    if goddamn == False:
      text_lengths.append(MXLEN)
  emotion_ = torch.nn.functional.one_hot(emotion.to(torch.int64), 5).to(torch.float)
  emotion_ = emotion_.squeeze(1)
  with torch.no_grad():
    y_pred = model(tweet, text_lengths)
  loss = loss_fn(y_pred, emotion_)
  val_loss += loss.item()/len(test_loader)
  emotion = emotion.squeeze(1)
  prediction = y_pred.argmax(dim=1)
  correct += prediction.eq(emotion).sum().item()
  #print(correct, total)
  total += emotion.size(0)

print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {correct/total:.4f}")

Validation Loss: 0.4543
Validation Accuracy: 0.7302
