In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
VISUAL=pd.read_csv('/content/drive/MyDrive/CS535/HW2/visual_x.csv')
AUDIO=pd.read_csv('/content/drive/MyDrive/CS535/HW2/audio_x.csv')
TEXT=pd.read_csv('/content/drive/MyDrive/CS535/HW2/text_x.csv')
yy=pd.read_csv('/content/drive/MyDrive/CS535/HW2/text_y.csv')

VISUAL_X = VISUAL.values
AUDIO_X = AUDIO.values
TEXT_X = TEXT.values

X = np.concatenate([VISUAL_X, AUDIO_X, TEXT_X], axis=1)
y = yy.values.reshape(-1)

In [3]:
from sklearn.utils.class_weight import compute_class_weight
class_labels = np.array([0,1,2,3])
class_weights = compute_class_weight('balanced', classes=class_labels, y=y)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

In [5]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(x)
        x = torch.relu(self.fc2(x))
        x = torch.relu(x)
        x = self.fc3(x)
        return F.sigmoid(x)

input_size = X.shape[1]
output_size = 4

# Early Fusion

In [6]:
from sklearn.metrics import f1_score
model = MLP(input_size, 512, 128, output_size)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

predicted_labels_all = []
true_labels_all = []

num_epochs = 100
for epoch in range(num_epochs):
    for feature, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(feature)
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

with torch.no_grad():
    for feature, labels in test_loader: #each batch
        outputs = model(feature)
        _, predicted = torch.max(outputs, 1)
        predicted_labels_all.append(predicted)
        true_labels_all.append(labels)
#all batch
true_labels_all = torch.cat(true_labels_all, dim=0)
predicted_labels_all = torch.cat(predicted_labels_all, dim=0)
f1 = f1_score(true_labels_all, predicted_labels_all, average='micro')
cf_matrix = confusion_matrix(predicted_labels_all, true_labels_all, labels = class_labels)
print(cf_matrix)
print(f'F1: {f1}')

Epoch 10/100, Loss: 0.7907912731170654
Epoch 20/100, Loss: 0.7681044340133667
Epoch 30/100, Loss: 0.7649135589599609
Epoch 40/100, Loss: 0.7436974048614502
Epoch 50/100, Loss: 0.7596962451934814
Epoch 60/100, Loss: 0.7597900032997131
Epoch 70/100, Loss: 0.7832394242286682
Epoch 80/100, Loss: 0.7436739802360535
Epoch 90/100, Loss: 0.7436731457710266
Epoch 100/100, Loss: 0.7436716556549072
[[62  5  2  9]
 [ 1 36  3  7]
 [ 3  4 17  5]
 [12 11  6 85]]
F1: 0.746268656716418


# Late Fusion

In [7]:
VISUAL_X_train = X_train[: , : VISUAL_X.shape[1]]
VISUAL_X_test = X_test[: , : VISUAL_X.shape[1]]

AUDIO_X_train = X_train[: , VISUAL_X.shape[1] : AUDIO_X.shape[1] + VISUAL_X.shape[1]]
AUDIO_X_test = X_test[: , VISUAL_X.shape[1] : AUDIO_X.shape[1] + VISUAL_X.shape[1]]

TEXT_X_train = X_train[: , AUDIO_X.shape[1] + VISUAL_X.shape[1] : ]
TEXT_X_test = X_test[: , AUDIO_X.shape[1] + VISUAL_X.shape[1] : ]

VISUAL_X_train.shape, AUDIO_X_train.shape, TEXT_X_train.shape

(torch.Size([1068, 2048]), torch.Size([1068, 128]), torch.Size([1068, 768]))

In [8]:
VISUAL_test_dataset = torch.utils.data.TensorDataset(VISUAL_X_test, y_test)
VISUAL_test_loader = torch.utils.data.DataLoader(VISUAL_test_dataset, batch_size=64, shuffle=False)

AUDIO_test_dataset = torch.utils.data.TensorDataset(AUDIO_X_test, y_test)
AUDIO_test_loader = torch.utils.data.DataLoader(AUDIO_test_dataset, batch_size=64, shuffle=False)

TEXT_test_dataset = torch.utils.data.TensorDataset(TEXT_X_test, y_test)
TEXT_test_loader = torch.utils.data.DataLoader(TEXT_test_dataset, batch_size=64, shuffle=False)

In [9]:
import torch.nn.functional as F

audio_model = torch.load('/content/drive/MyDrive/CS535/HW2/audio.pth')
visual_model = torch.load('/content/drive/MyDrive/CS535/HW2/visual.pth')
text_model = torch.load('/content/drive/MyDrive/CS535/HW2/text.pth')

audio_model.eval()
visual_model.eval()
text_model.eval()

MLP(
  (fc1): Linear(in_features=768, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=4, bias=True)
)

In [None]:
true_labels_all = []
visual_probs = []

with torch.no_grad():
    for feature, labels in VISUAL_test_loader: #each batch
        outputs = visual_model(feature)

        output_probs = F.softmax(outputs, dim=1)
        _, predicted = torch.max(outputs, 1)

        true_labels_all.append(labels)
        visual_probs.append(output_probs)

visual_probs

In [None]:
audio_probs = []

with torch.no_grad():
    for feature, labels in AUDIO_test_loader: #each batch
        outputs = audio_model(feature)

        output_probs = F.softmax(outputs, dim=1)
        _, predicted = torch.max(outputs, 1)

        audio_probs.append(output_probs)

audio_probs

In [None]:
text_probs = []

with torch.no_grad():
    for feature, labels in TEXT_test_loader: #each batch
        outputs = text_model(feature)

        output_probs = F.softmax(outputs, dim=1)
        _, predicted = torch.max(outputs, 1)

        text_probs.append(output_probs)

text_probs

In [13]:
final_probs = []
for (a, b, c) in zip(visual_probs, audio_probs, text_probs):
  tens = torch.add(a, b)
  tens = torch.add(tens, c)
  final_probs.append(tens)

In [14]:
predicted = []
for i, x in enumerate(final_probs):
  for j in x:
    predicted.append((j.argmax(0)).tolist())
len(predicted)

268

In [15]:
true_labels = []
for t in true_labels_all:
  for j in t.tolist():
    true_labels.append(j)
len(true_labels)

268

In [16]:
f1 = f1_score(true_labels, predicted, average='micro')
print(f'F1: {f1}')
cf_matrix = confusion_matrix(predicted, true_labels, labels = class_labels)
print(cf_matrix)

F1: 0.9776119402985075
[[ 77   0   0   1]
 [  0  55   0   1]
 [  0   1  26   0]
 [  1   0   2 104]]
