# Imports

In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Deep learning with TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.utils import to_categorical

# Deep learning with PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision import models

# Natural Language Processing (NLP)
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Miscellaneous
import os
import re
import time
import pickle
import cv2

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Defining Files

In [6]:
# Folder holding the CREMA-D video clips. It is defined in this cell as well so
# the listing below works on a fresh kernel instead of relying on a later cell.
directory = "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CREMA-D-Videos/"

edict = { # this matches up with the indices for Ekman-6
    'ANG': 0,
    'DIS': 1,
    'FEA': 2,
    'HAP': 3,
    'SAD': 4,
}

# files[k] collects the filenames whose emotion code maps to class index k.
files = [[] for _ in range(5)]

# os.listdir returns entries in arbitrary order; sorting keeps the
# "k-th file of a category" stable between runs, which the id numbering
# used by the extraction cells depends on.
for file in sorted(os.listdir(directory)):
  filename = os.fsdecode(file)
  # Characters 9..11 of the filename are read as the emotion code
  # (presumably names look like <actor>_<sentence>_<EMO>_<level>.<ext> - verify).
  emote = filename[9:12]
  if emote in edict: # clips with any other code are ignored
    files[edict[emote]].append(filename)

# Getting Audio Clips of CREMA-D

In [4]:
!pip install moviepy
from moviepy.editor import VideoFileClip

[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
# Input folder with the CREMA-D videos and output folder for the extracted audio.
directory = "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CREMA-D-Videos/"
to_directory = "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-Audios/"

# Emotion code -> class index; the order matches the indices used for Ekman-6.
edict = {code: index for index, code in enumerate(('ANG', 'DIS', 'FEA', 'HAP', 'SAD'))}

# First id handed out to a CREMA-D clip.
# NOTE(review): ids below 250 are presumably taken by the Ekman-6 clips - confirm.
id = 250

def write(read_path, write_path):
  """Extract the audio track of a video file and save it as an audio file.

  Args:
    read_path: path of the video to read.
    write_path: path of the audio file to create.

  Raises:
    ValueError: if the video has no audio track.
  """
  clip = VideoFileClip(read_path)
  audio = None
  try:
    audio = clip.audio
    if audio is None:
      raise ValueError("No audio track in " + read_path)
    audio.write_audiofile(write_path)
  finally:
    # Release the underlying readers even when the export fails, so an
    # interrupted batch does not leave files open.
    if audio is not None:
      audio.close()
    clip.close()

# Export the audio of the first 50 clips of each of the 5 emotion categories.
# Ids are handed out consecutively, so category i occupies a block of 50 ids.
for i in range(5):
  category_files = files[i]
  for j in range(50):
    write(directory + category_files[j], to_directory + str(id) + ".mp3")
    id += 1

In [None]:
# Resume the audio export for ids 325..499 (e.g. after an interrupted run).
# The main export loop assigns id = 250 + 50 * category + position, so the
# category and the position inside it are recovered from the id itself. This
# keeps every id paired with the same video that the frame extraction uses,
# and it no longer depends on whatever value `id` was left at.
for name in range(325, 500):
  emotion = name // 50 - 5
  position = name % 50
  read_path = directory + files[emotion][position]
  write_path = to_directory + str(name) + ".mp3"
  print(read_path)
  write(read_path, write_path)
  id = name + 1 # keep the running counter in step with the last exported clip

# Speech to Text

In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.37.1-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.37.1-py3-none-any.whl (337 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.0/337.0 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-a

In [None]:
from openai import OpenAI
import getpass

# Never store the API key in the notebook. Take it from the OPENAI_API_KEY
# environment variable, and ask for it interactively when it is not set.
key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
client = OpenAI(api_key = key)

def translated_text(filename):
  """Send an audio file to the `whisper-1` translation endpoint and return its text.

  Args:
    filename: path of the audio file to upload.

  Returns:
    The `text` field of the translation response.
  """
  # The context manager closes the file handle after the request; this function
  # is called once per clip, so unclosed handles would pile up quickly.
  with open(filename, "rb") as audio_file:
    translation = client.audio.translations.create(
      model="whisper-1",
      file=audio_file,
      # NOTE(review): a temperature this high presumably makes repeated runs
      # return different text for the same clip - confirm this is intended.
      temperature = 0.90
    )
  return translation.text

# One transcript per clip id 250..499, in id order.
texts = []

for clip_id in range(250, 500):
  clip_path = f"{to_directory}{clip_id}.mp3"
  texts.append(translated_text(clip_path))

In [None]:
import csv

# All transcripts are written as ONE quoted row; the text-tensor cell reads that
# row back positionally. newline="" is what the csv module requires for files it
# writes to, and the explicit encoding keeps non-ASCII characters portable.
with open("/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-texts/transcripts.csv", 'w', newline="", encoding="utf-8") as myfile:
    wr = csv.writer(myfile, quoting = csv.QUOTE_ALL)
    wr.writerow(texts)

# Audio to Spectrogram

In [4]:
import librosa
import librosa.core

def save_spectrogram(read_path, write_path, max_samples=100000000, window_size=1024, hop_length=512):
  """Render the log-frequency spectrogram of an audio file and save it as an image.

  Args:
    read_path: path of the audio file to load.
    write_path: path of the image file to create.
    max_samples: keep at most this many leading samples. The default is so
      large that short clips are not truncated at all; lower it to trade
      coverage for speed.
    window_size: STFT window length (also used as n_fft).
    hop_length: number of samples between successive STFT frames.
  """
  # librosa.display is a submodule; older librosa releases do not load it with a
  # plain `import librosa`, so import it explicitly before using specshow.
  import librosa.display
  # Agg canvas so the figure is rendered off-screen (headless).
  from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

  y, sr = librosa.load(read_path)
  y = y[:max_samples]

  window = np.hanning(window_size)
  stft = librosa.stft(y, n_fft=window_size, hop_length=hop_length, window=window)
  out = 2 * np.abs(stft) / np.sum(window) # magnitude normalised by the window sum

  # The figure keeps matplotlib's default size on purpose: the crop box applied
  # to these images later in the notebook is expressed in pixels of that size.
  fig = plt.Figure()
  FigureCanvas(fig)
  ax = fig.add_subplot(111)
  librosa.display.specshow(
      librosa.amplitude_to_db(out, ref=np.max),
      sr=sr, hop_length=hop_length,
      ax=ax, y_axis='log', x_axis='time')
  fig.savefig(write_path)

In [None]:
spect_directory = "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-spect/"

# Render one spectrogram image per extracted audio clip; the image keeps the
# clip's name, with the extension replaced by ".jpg".
for file in sorted(os.listdir(to_directory)):
  filename = os.fsdecode(file)
  # Use the real stem instead of the first three characters, so names that are
  # not exactly three characters long cannot overwrite each other's output.
  stem, extension = os.path.splitext(filename)
  if extension.lower() != ".mp3":
    continue # skip anything in the folder that is not an extracted clip
  rp = to_directory + filename
  wp = spect_directory + stem + ".jpg"
  print(rp)
  print(wp)
  save_spectrogram(rp, wp)

/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-Audios/250.mp3
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-spect/250.jpg
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-Audios/251.mp3
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-spect/251.jpg
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-Audios/252.mp3
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-spect/252.jpg
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-Audios/253.mp3
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-spect/253.jpg
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-Audios/254.mp3
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-spect/254.jpg
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-Audios/255.mp3
/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-spect/255.jp

# Frame Cutting

In [7]:
# the following was stolen from StackOverflow
def get_frames(readpath, writepath, filename, frame_skip=6):
    """Save evenly spaced frames of a video as numbered JPEG files.

    After every `frame_skip` frames that are read and discarded, the next frame
    is written to `writepath + filename + "_" + <n> + ".jpg"`, with n counting
    the saved frames from 1.

    Args:
        readpath: path of the video to read.
        writepath: output folder prefix (concatenated as-is, so it must end
            with a path separator).
        filename: stem used for the output images.
        frame_skip: number of frames dropped before each saved frame.
            NOTE(review): the original note equates the default of 6 with
            roughly 0.25 s of video; that depends on the frame rate - confirm.

    Raises:
        Exception: if an image cannot be written.
    """
    cap = cv2.VideoCapture(readpath)
    skipped = 0      # frames dropped since the last saved one
    frame_count = 0  # number of frames saved so far
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if skipped < frame_skip:
                skipped += 1
                continue
            frame_count += 1
            if not cv2.imwrite(writepath + filename + "_" + str(frame_count) + ".jpg", frame):
                raise Exception("Could not write image")
            skipped = 0
    finally:
        # Release the capture even when writing fails, so the video file is not
        # left open for the rest of the session.
        cap.release()
        cv2.destroyAllWindows()

In [8]:
# Input folder with the CREMA-D videos and output folder for the frame images.
directory = "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CREMA-D-Videos/"
frame_directory = "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-frames/"

# First id handed out; it grows by one per processed video.
id = 250

# Cut frames for the first 50 clips of each of the 5 emotion categories.
for i in range(5):
  category_files = files[i]
  for j in range(50):
    get_frames(directory + category_files[j], frame_directory, str(id))
    id += 1

KeyboardInterrupt: 

# Making the Tensors

Transforms

In [9]:
# Preprocessing for video frames: resize to 256x256, take the central 224x224
# crop, convert to a tensor and normalise each channel.
frame_process = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.CenterCrop(224),  # an int requests a square crop
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # (mean, std)
])

def crop(image):
  """Return the fixed region (left=80, upper=58, right=577, lower=428) of `image`."""
  box = (80, 58, 577, 428)
  return image.crop(box)

# Preprocessing for spectrogram images: cut out the fixed crop box, resize to
# 224x224, convert to a tensor and normalise each channel.
spect_process = transforms.Compose([
    transforms.Lambda(crop),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # (mean, std)
])

Audio

In [None]:
from PIL import Image

images = []

# Load the spectrograms in id order (250..499) so that row k of the saved tensor
# belongs to clip 250 + k, exactly like the text and frame tensors built in this
# notebook. Iterating os.listdir gives an arbitrary order and would misalign them.
for index in range(250, 500):
  # The context manager closes the image file once it has been converted.
  with Image.open(spect_directory + str(index) + ".jpg") as spect_image:
    images.append(spect_process(spect_image))

images = torch.stack(images)
torch.save(images, "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/cd-spect.pt")

Text

In [None]:
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer data that newer NLTK releases look for instead of 'punkt'
nltk.download('stopwords')

# transcripts.csv stores every transcript in a single row. Reading that row as a
# header lets pandas rewrite it: repeated sentences get ".1", ".2", ... suffixes
# and empty transcripts turn into "Unnamed: N". Read it as data instead, then
# expose the untouched strings as the column labels that `df.columns[k]` serves.
_transcript_row = pd.read_csv(
    "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-texts/transcripts.csv",
    header=None,
    dtype=str,
    keep_default_na=False,  # keep "" (and words such as "NA") as plain strings
).iloc[0].tolist()
df = pd.DataFrame(columns=_transcript_row)

def preprocess_text(text: str) -> list:
    """Turn raw text into a list of lowercase, non-stopword tokens.

    Links are removed, every run of non-letter characters becomes a single
    space, the result is tokenized with NLTK, lowercased, and English
    stopwords are dropped.

    Args:
        text: the raw text.

    Returns:
        The remaining tokens, in their original order.
    """
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove special chars and numbers
    text = re.sub("[^A-Za-z]+", " ", text)
    # Build the stopword set once per call instead of once per token.
    stop_words = set(stopwords.words("english"))
    # Lowercase BEFORE the stopword test: the NLTK list is lowercase, so
    # capitalised words such as "The" would otherwise slip through.
    tokens = [w.lower() for w in nltk.word_tokenize(text)]
    return [w for w in tokens if w not in stop_words]

# Token -> integer id. Id 1 is reserved for the 'EMPTY' entry.
dictionary = {
    'EMPTY': 1
}

# NOTE(review): ek-text.pt is presumably the token-id tensor built for the
# Ekman-6 transcripts - confirm. Its largest value decides where new ids start,
# so ids assigned here cannot collide with ids already stored in that tensor.
thingy = torch.load("/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/ek-text.pt", map_location = torch.device('cpu'))

# One tensor reduction instead of a Python loop over hard-coded dimensions;
# it also covers the whole tensor whatever its shape is.
mx = max(0, int(thingy.max().item())) if thingy.numel() > 0 else 0

# Next free id; never below 2 because id 1 is taken by 'EMPTY'.
size = max(2, mx + 1)

def get_text_tensor(i):
  """Encode the transcript of clip `i` as a tensor of token ids.

  The transcript is read from `df.columns[i - 250]`, preprocessed into tokens,
  and every token is mapped through the module-level `dictionary`. Tokens that
  are not in it yet receive the next free id (`size`, which is then advanced).

  Args:
    i: clip id; 250 maps to the first transcript.

  Returns:
    A pair (ids, count): an int64 tensor with one id per token, and the number
    of tokens.
  """
  global size
  tokens = preprocess_text(df.columns[i - 250])
  ids = [] # the processed version of the text
  for token in tokens:
    if token not in dictionary:
      dictionary[token] = size
      size += 1
    ids.append(int(dictionary[token]))
  # Build the tensor as int64 directly; going through a float tensor first
  # would round large ids.
  # NOTE(review): a transcript with no tokens yields an empty tensor and a
  # length of 0 - check whether the 'EMPTY' id was meant to be used here.
  return torch.tensor(ids, dtype=torch.int64), len(tokens)

# Encode the transcripts of clips 250..499 in id order (the order matters,
# because ids for unseen tokens are handed out while encoding).
encoded = [get_text_tensor(video) for video in range(250, 500)]

# Pad every id sequence to the longest one -> one row per clip.
texties = torch.nn.utils.rnn.pad_sequence([ids for ids, _ in encoded], batch_first = True)
# Token counts, stacked as one single-element row per clip.
text_lengths = torch.stack([torch.tensor([count]) for _, count in encoded])

torch.save(texties, "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/cd-text.pt")
torch.save(text_lengths, "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/cd-textl.pt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# `size` is the next id that would be assigned to an unseen token, i.e. one past
# the largest token id in use once the transcripts have been encoded.
print(size)

Frames

In [12]:
from PIL import Image

frame_directory = "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/CD-frames/"

def get_frame_tensor(i, num_frames=10):
  """Stack the first `num_frames` preprocessed frames of video `i`.

  Frames are read from `<frame_directory><i>_<j>.jpg` for j = 1..num_frames.
  When a frame file is missing, the most recent available frame is repeated,
  so the result always holds `num_frames` entries.

  Args:
    i: id of the video.
    num_frames: number of frames in the returned stack.

  Returns:
    The frames, each passed through `frame_process`, stacked along a new
    first dimension.

  Raises:
    FileNotFoundError: if a frame is missing and no earlier frame exists to
      stand in for it.
  """
  frames = []
  last_frame = None # most recent successfully loaded frame
  for j in range(1, num_frames + 1):
    filename = str(i) + "_" + str(j) + ".jpg"
    path = frame_directory + filename
    if os.path.isfile(path):
      # The context manager closes the image file after conversion.
      with Image.open(path) as image:
        last_frame = frame_process(image)
    elif last_frame is None:
      # Nothing to fall back on: fail with the name of the missing file
      # instead of trying to open the directory itself.
      raise FileNotFoundError("No frame available for " + filename)
    # For a missing file this re-appends the previous frame, reusing the tensor
    # instead of decoding the same image again.
    frames.append(last_frame)
  return torch.stack(frames)

# One frame stack per clip id 250..499, combined into a single tensor whose
# first dimension follows the id order.
tsr_list = torch.stack([get_frame_tensor(video) for video in range(250, 500)])
torch.save(tsr_list, "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/cd-frame.pt")

# Creating the Dataset

In [None]:
# Ids 0..499 are grouped in blocks of 50 per emotion class. The class cycle
# 0..4 restarts at id 250, so the label of an id is (id // 50) % 5.
# NOTE(review): ids 0..249 are presumably the Ekman-6 clips and 250..499 the
# CREMA-D clips - confirm.
indices = torch.stack([torch.Tensor([index]) for index in range(500)])
labels = torch.stack([torch.Tensor([(index // 50) % 5]) for index in range(500)])

# Pair every id with its label and store the dataset.
dset = torch.utils.data.TensorDataset(indices, labels)
torch.save(dset, "/content/drive/My Drive/Machine Learning/COSMOS/FINAL_PROJECT/DER/cd-data.pt")