In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import roc_curve, auc, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import random
from moviepy.editor import *
from pydub import AudioSegment
from pydub.utils import which
import speech_recognition as sr
import string

Preprocessing Data

In [32]:
def clean_text(t):
    for c in string.punctuation:
        t = t.replace(c, " ")
    t = t.lower()
#     t = remove_articles(t)
    t = t.split()
    wordsFiltered = []
    stops = set(stopwords.words('english'))
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    for w in t:
        if w not in stops:
            wordsFiltered.append(lemmatizer.lemmatize(w))
    return ' '.join(wordsFiltered)



def augment_sentence(sentence):
    words = sentence.split()
    if len(words) > 1:
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)


def sentences_to_indices(X, word_to_index, max_len, clean_text_function, augment=False):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    """
    
    m = X.shape[0]  # number of training examples
    X_indices = np.zeros((m, max_len))
    
    for i in range(m):
        # Clean and optionally augment the sentence
        sentence = clean_text_function(X[i])
        if augment:
            sentence = augment_sentence(sentence)

        sentence_words = sentence.lower().split()

        j = 0
        for w in sentence_words:
            if w not in word_to_index or j >= max_len:
                continue
            X_indices[i, j] = word_to_index[w]
            j += 1
    
    return X_indices



In [33]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

In [34]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index, non_trainable=True):
    num_embeddings = len(word_to_index) + 1                   
    embedding_dim = word_to_vec_map["cucumber"].shape[0]  #  dimensionality of GloVe word vectors (= 50)

    # Initialize the embedding matrix as a numpy array of zeros of shape (num_embeddings, embedding_dim)
    weights_matrix = np.zeros((num_embeddings, embedding_dim))

    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        weights_matrix[index, :] = word_to_vec_map[word]

    embed = nn.Embedding.from_pretrained(torch.from_numpy(weights_matrix).type(torch.FloatTensor), freeze=non_trainable)

    return embed, num_embeddings, embedding_dim


In [45]:
class NN(nn.Module):
    def __init__(self, embedding, hidden_dim, output_dim, batch_size):
        super(NN, self).__init__()
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.word_embeddings = embedding  # Pre-trained embeddings
        # Only apply dropout to LSTM layers, not the output of LSTM
        self.lstm = nn.LSTM(self.word_embeddings.embedding_dim, hidden_dim, num_layers=2, dropout=0.5, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, sentence):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        embeds = self.word_embeddings(sentence)
        h0 = torch.zeros(2, sentence.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(2, sentence.size(0), self.hidden_dim).to(device)
        lstm_out, _ = self.lstm(embeds, (h0, c0))
        # Take the output from the last LSTM layer
        lstm_out = lstm_out[:, -1, :]
        out = self.fc(lstm_out)
        return out

In [None]:
AUDIO_FILE = "Sample9.wav"

# use the audio file as the audio source      
text = ""
r = sr.Recognizer()
try:
    with sr.AudioFile(AUDIO_FILE) as source:
            audio = r.record(source)  # read the entire audio file                  
            text =  r.recognize_google(audio)
            print("Transcription: " + text)
except:
    text = ""

In [46]:
model_path = 'overall_best_model.pth'

# Assuming you have the embedding layer ready from your training
embedding, vocab_size, embedding_dim = pretrained_embedding_layer(word_to_vec_map, word_to_index, non_trainable=True)


hidden_dim=128
output_size=2
batch_size = 32
# Create an instance of the model
model1 = NN(embedding, hidden_dim, 2, batch_size)

# Load the saved state dictionary
model1.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


<All keys matched successfully>

In [47]:
def predict(text,model):
    
    list1 = []

    list1.append(text)
    X = np.array(list1)
    
    string_index = sentences_to_indices(X, word_to_index, len(list1[0].split()), clean_text)
    string_index_tensor = torch.tensor(string_index, dtype=torch.long)  # Convert to LongTensor
    output = model1(string_index_tensor)
    output = torch.nn.functional.softmax(output, dim=1)
    print(output)
    

In [54]:

predict(text,model1)

tensor([[0.9760, 0.0240]], grad_fn=<SoftmaxBackward0>)
