# Imports

In [24]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Deep learning with TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.utils import to_categorical

# Deep learning with PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision import models
from torchvision.models import resnet50, ResNet50_Weights
from torchvision.models import resnet18, ResNet18_Weights

# Natural Language Processing (NLP)
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Miscellaneous
import os
import re
import time
import pickle
from PIL import Image

from google.colab import drive
drive.mount('/content/drive')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Import

In [25]:
nltk.download('punkt')
nltk.download('stopwords')

b_size = 10

data = torch.load("/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/ekman.pt")
train_length = 250
train_set, val_set = torch.utils.data.random_split(data, [train_length, len(data) - train_length])

train_loader = DataLoader(train_set, batch_size = b_size, shuffle = True, pin_memory = True, num_workers = 2)
test_loader = DataLoader(val_set, batch_size = b_size, shuffle = False, pin_memory = True, num_workers = 2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
import os.path

frame_process = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
])

def crop(image):
  return image.crop((80, 58, 577, 428))

spect_process = transforms.Compose([
    transforms.Lambda(crop),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

frame_directory = "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/ekman6_split/"
spect_directory = "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/ekman6_spectro/"
folders = ["anger/", "disgust/", "fear/", "joy/", "sadness/", "surprise/"]
df = pd.read_csv("/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/ekman6_texts/en_transcripts.csv")

def preprocess_text(text: str) -> str:
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove special chars and numbers
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    # 1. tokenize
    tokens = nltk.word_tokenize(text)
    # 2. check if stopword
    tokens = [w.lower() for w in tokens if not w in stopwords.words("english")]
    return tokens

def get_frame_tensor(i):
  frames = []
  last_valid_filename = ""
  for j in range(1, 11):
    filename = str(i) + "_" + str(j) + ".jpg"
    if not os.path.isfile(frame_directory + folders[(int)(i/50)] + filename):
      filename = last_valid_filename
    if filename == "":
      print(str(i) + "_" + str(j) + ".jpg")
    file = Image.open(frame_directory + folders[(int)(i/50)] + filename)
    file = frame_process(file)
    frames.append(file)
    last_valid_filename = filename
  frames = torch.stack(frames)
  return frames

def get_spect_tensor(i):
  filename = str(i) + ".jpg"
  file = Image.open(spect_directory + folders[(int)(i/50)] + filename)
  file = spect_process(file)
  return file

dictionary = {
    'EMPTY': 1 # EMPTY --> signal that the text is empty and contains nothing
}

def get_text_tensor(i):
  text = df.columns[i]
  text = preprocess_text(text)
  liszt = [] # the processed version of the text
  for i in range(len(text)):
    if text[i] in dictionary:
      liszt.append((int)(dictionary[text[i]]))
    else:
      size = len(dictionary) + 1
      dictionary[text[i]] = size
      liszt.append((int)(dictionary[text[i]]))
  return torch.Tensor(liszt).to(device).to(torch.int64), len(text)


In [27]:
text_tensors = []
text_lengths = []
MXLEN = 0

num_input_videos = 300

for i in range(num_input_videos):
  tt, text_length = get_text_tensor(i)
  text_tensors.append(tt)
  text_lengths.append(text_length)
  MXLEN = max(MXLEN, (int)(tt.size(0)))

text_tensors = torch.nn.utils.rnn.pad_sequence(text_tensors, batch_first = True)

In [28]:
#frame_tensors = []
spect_tensors = []

for i in range(300):
  #print(i)
  #frame_tensors.append(get_frame_tensor(i))
  spect_tensors.append(get_spect_tensor(i))

# Reading the Frames

In [29]:
ds = torch.load("/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/frame.pt")
print(ds.size())

torch.Size([300, 10, 3, 32, 32])


# Models

LSTM

In [30]:
class LSTM_Classifier(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
    super(LSTM_Classifier, self).__init__()
    # Embedding layer converts integer sequences to vector sequences
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    # LSTM layer process the vector sequences
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional = bidirectional, dropout = dropout, batch_first = True)
    #                     input           output                             bisexual?                      dropout_prob      whether or not batch_size comes first in the tensor.size()
    # Dense layer to predict
    self.fc = nn.Linear(hidden_dim * (2 if bidirectional == True else 1), output_dim)
    # Prediction activation function
    self.sigmoid = nn.Sigmoid()

  def forward(self, text, text_lengths):
    embedded = self.embedding(text) # embedded version
    # Thanks to packing, LSTM don't see padding tokens
    # and this makes our model better
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted = False)
    packed_output, (hidden_state, cell_state) = self.lstm(packed_embedded)
    # Concatenating the final forward and backward hidden states
    hidden = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)
    dense_outputs = self.fc(hidden)
    #Final activation function
    dense_outputs = nn.functional.relu(dense_outputs)
    return dense_outputs

3D-CNN

In [31]:
class CNN3d(nn.Module):
  def __init__(self):
    super(CNN3d, self).__init__()
    self.c1 = nn.Conv3d(3, 6, kernel_size = 4, padding = 1, stride = 2)
    self.c2 = nn.Conv3d(6, 16, kernel_size = 4, padding = 1, stride = 2)
    # 16 x 2 x 8 x 8
    self.flatten = nn.Flatten()
    self.fc = nn.Sequential(
      nn.Linear(16 * 2 * 8 * 8, 128),
      nn.Linear(128, 64),
      nn.Linear(64, 6),
      nn.ReLU()
    )

  def forward(self, x):
    x = self.c1(x)
    x = self.c2(x)
    x = nn.functional.relu(x)
    x = self.flatten(x)
    x = self.fc(x)
    return x

Combiner Model

In [32]:
class Smash(nn.Module):
  def __init__(self):
    super(Smash, self).__init__()
    self.fcnn = nn.Sequential(
      nn.Linear(18, 12),
      nn.ReLU(),
      nn.Linear(12, 6)
    )
    self.bilstm = LSTM_Classifier(vocab_size = len(dictionary) + 1,
                         embedding_dim = 100,
                         hidden_dim = 64,
                         output_dim = 6,
                         n_layers = 5,
                         bidirectional = True,
                         dropout = 0.5).to(device)
    self.resn = resnet18(num_classes = 6)
    self.cnn = CNN3d()

  def forward(self, x1, x2, tlengths, x3):
    x1 = self.resn(x1)
    x2 = self.bilstm(x2, tlengths)
    x3 = self.cnn(x3)
    x1 = torch.cat((x1, x2, x3), dim = -1)
    x1 = self.fcnn(x1)
    return x1

# Training Loop

In [36]:
torch.manual_seed(42)

model = Smash().to(device)

epochs = 25
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
loss_fn = nn.CrossEntropyLoss()

tl = []

for i in range(epochs):
  model.train()
  train_loss = 0.00
  correct = 0.00
  total = 0.00
  for indices, label in train_loader:
    indices = indices.cuda()
    label = label.cuda()

    # getting the text for the index-th video
    text_batch = []
    for index in indices:
      text = text_tensors[index].to(device)
      # if the text ends up being nothing
      if text_lengths[index] == 0:
        text[0] = dictionary['EMPTY']
        # dimensionality voodoo
        text_lengths[index] = 1
      text_batch.append(text)
    text_batch = torch.stack(text_batch)
    text_batch = text_batch.squeeze(1)

    # getting the spectrograms
    spect_batch = []
    for index in indices:
      spect = spect_tensors[index].to(device)
      spect.requires_grad = True
      spect_batch.append(spect)
    spect_batch = torch.stack(spect_batch)

    # getting all of the frames
    frame_batch = []
    for index in indices:
      frames = ds[index.cpu()].to(device)
      frames = frames.squeeze(0)
      #print(frames.size())
      frame_batch.append(frames)
    frame_batch = torch.stack(frame_batch)
    frame_batch = torch.transpose(frame_batch, 1, 2)
    #print(frame_batch.size())

    # getting the text lengths
    textl = []
    for index in indices:
      textl.append(text_lengths[index])

    # zero-ing gradients
    optimizer.zero_grad()

    # output of the final NN on the super-tensor
    y_pred = model(spect_batch, text_batch, textl, frame_batch).to(device)

    # creating one-hot vector for the label for the index-th video
    labels = torch.full((b_size, 6,), 0.00).cuda()
    for j in range(b_size):
      labels[j][label[j]] = 1.00

    # computing the Cross Entropy Loss and backpropagating
    loss = loss_fn(y_pred, labels)
    loss.backward()

    # updating gradients
    optimizer.step()

    # statistics
    train_loss += loss.item()/len(train_loader)
    prediction = y_pred.argmax(dim=1)
    label = label.squeeze(1)
    correct += (prediction.eq(label).sum()).item()
    total += label.size(0)

  print(correct, total)
  # more statistics
  tl.append(train_loss)
  print(f"Epoch: {i+1}/{epochs}, Training Loss: {train_loss:.4f}, Training Accuracy: {correct/total:.4f}")

57.0 250.0
Epoch: 1/25, Training Loss: 1.7649, Training Accuracy: 0.2280
77.0 250.0
Epoch: 2/25, Training Loss: 1.6566, Training Accuracy: 0.3080
89.0 250.0
Epoch: 3/25, Training Loss: 1.5383, Training Accuracy: 0.3560
107.0 250.0
Epoch: 4/25, Training Loss: 1.3859, Training Accuracy: 0.4280
143.0 250.0
Epoch: 5/25, Training Loss: 1.2595, Training Accuracy: 0.5720
165.0 250.0
Epoch: 6/25, Training Loss: 1.1244, Training Accuracy: 0.6600
173.0 250.0
Epoch: 7/25, Training Loss: 1.0402, Training Accuracy: 0.6920
188.0 250.0
Epoch: 8/25, Training Loss: 0.9371, Training Accuracy: 0.7520
215.0 250.0
Epoch: 9/25, Training Loss: 0.7605, Training Accuracy: 0.8600
223.0 250.0
Epoch: 10/25, Training Loss: 0.6706, Training Accuracy: 0.8920
228.0 250.0
Epoch: 11/25, Training Loss: 0.5962, Training Accuracy: 0.9120
239.0 250.0
Epoch: 12/25, Training Loss: 0.4892, Training Accuracy: 0.9560
241.0 250.0
Epoch: 13/25, Training Loss: 0.4199, Training Accuracy: 0.9640
245.0 250.0
Epoch: 14/25, Training Lo

# Validation Loop

In [37]:
model.eval()

for indices, label in train_loader:
    indices = indices.cuda()
    label = label.cuda()

    # getting the text for the index-th video
    text_batch = []
    for index in indices:
      text = text_tensors[index].to(device)
      # if the text ends up being nothing
      if text_lengths[index] == 0:
        text[0] = dictionary['EMPTY']
        # dimensionality voodoo
        text_lengths[index] = 1
      text_batch.append(text)
    text_batch = torch.stack(text_batch)
    text_batch = text_batch.squeeze(1)

    # getting the spectrograms
    spect_batch = []
    for index in indices:
      spect = spect_tensors[index].to(device)
      spect.requires_grad = True
      spect_batch.append(spect)
    spect_batch = torch.stack(spect_batch)

    # getting all of the frames
    frame_batch = []
    for index in indices:
      frames = ds[index.cpu()].to(device)
      frames = frames.squeeze(0)
      #print(frames.size())
      frame_batch.append(frames)
    frame_batch = torch.stack(frame_batch)
    frame_batch = torch.transpose(frame_batch, 1, 2)
    #print(frame_batch.size())

    # getting the text lengths
    textl = []
    for index in indices:
      textl.append(text_lengths[index])

    # output of the final NN on the super-tensor
    with torch.no_grad():
      y_pred = model(spect_batch, text_batch, textl, frame_batch).to(device)

    # creating one-hot vector for the label for the index-th video
    labels = torch.full((b_size, 6,), 0.00).cuda()
    for j in range(b_size):
      labels[j][label[j]] = 1.00

    train_loss += loss.item()/len(train_loader)
    prediction = y_pred.argmax(dim=1)
    label = label.squeeze(1)
    correct += (prediction.eq(label).sum()).item()
    total += label.size(0)

print(f"Validation Accuracy: {correct/total:.4f}")

Validation Accuracy: 0.9720


In [38]:
torch.save(model, "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/vmodel1.pt")

# Save the Tensor (I do not want to run that cell ever again)

In [35]:
frame_tensors = torch.stack(frame_tensors)
torch.save(frame_tensors, "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/frame.pt")

NameError: name 'frame_tensors' is not defined