In [2]:
import torch
from torchtext.legacy import data
import torch.nn as nn
from torchtext.legacy.data import Field, LabelField, TabularDataset, Dataset

import torch.optim as optim

import time
import pandas as pd
import re
import spacy
from nltk.stem import PorterStemmer

from spacy.lang.en.stop_words import STOP_WORDS

In [3]:
import os
os.getcwd()

'C:\\Users\\Sunny\\Desktop\\Master\\Sem3\\5212\\A1\\Dataset_Assignment1'

In [4]:
# Set up random seed

SEED = 1

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
# Set up field & loading datasets

spacy_en = spacy.load("en_core_web_sm")

def custom_tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text) if tok.text not in STOP_WORDS]

TEXT = data.Field(sequential=True, tokenize=custom_tokenizer, lower=False)
LABEL = LabelField(dtype = torch.float)
 
train_datafield = [("title", TEXT), 
                   ("abstract", None),
                   ("InformationTheory", LABEL), 
                   ("ComputationalLinguistics", LABEL),
                   ("ComputerVision", LABEL)
                   ]

train_data, test_data = TabularDataset.splits(
    path = "./",
    train = "train.csv", test = "test.csv", format = "csv",
    skip_header = True, fields = train_datafield)

from torchtext.legacy.data import Dataset

def split_dataset(dataset, split_index):
    fields = dataset.fields
    examples = dataset.examples
    top_examples = examples[:split_index]
    remaining_examples = examples[split_index:]
    
    top_dataset = Dataset(top_examples, fields)
    remaining_dataset = Dataset(remaining_examples, fields)
    
    return top_dataset, remaining_dataset

train_data_1000, valid_data = split_dataset(train_data, 1000)

In [6]:
# Building vocab
MAX_VOCAB_SIZE = 1000

TEXT.build_vocab(train_data_1000, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data_1000)

In [7]:
# Create iterator #1 for small train_data

BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
def preprocess_target_label(target_field):
    for example in train_data:
        setattr(example, f"label_{target_field}", getattr(example, target_field))
        
    for example in test_data:
        setattr(example, f"label_{target_field}", getattr(example, target_field))

def generate_label_iterator(dataset, target, validation = True):
    preprocess_target_label(target)
    
    label_attr = f"label_{target}"
    if validation:
        iterators = data.BucketIterator.splits(
            (dataset, valid_data, test_data),
            batch_size = BATCH_SIZE,
            device = device,
            sort_key = lambda x: len(getattr(x, label_attr)),
            sort_within_batch = False)
        return iterators[0], iterators[1], iterators[2]
    else:
        iterators = data.BucketIterator.splits(
            (dataset, test_data),
            batch_size = BATCH_SIZE,
            device = device,
            sort_key = lambda x: len(getattr(x, label_attr)),

            sort_within_batch = False)
        return iterators[0], iterators[1]

In [8]:
# Run generate_label_iterator functions

train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data_1000, "InformationTheory", validation = True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data_1000, "ComputationalLinguistics", validation = True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data_1000, "ComputerVision", validation = True)

In [9]:
# Define RNN

import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))


In [10]:
# Model initialization & optimizer
def generate_model_and_optimizer(embedding_dim=100, hidden_dim=256, output_dim=1, lr=1e-3):
    INPUT_DIM = len(TEXT.vocab)

    model = RNN(INPUT_DIM, embedding_dim, hidden_dim, output_dim)

    optimizer = optim.SGD(model.parameters(), lr=lr)

    model = model.to(device)
    
    return model, optimizer

model_IT, optimizer_IT = generate_model_and_optimizer()
model_CL, optimizer_CL = generate_model_and_optimizer()
model_CV, optimizer_CV = generate_model_and_optimizer()

criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [11]:
# Evaluation functions

def binary_accuracy(preds, y):

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion, label_field, target_field):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:

        optimizer.zero_grad()
                
        predictions = model(getattr(batch, target_field)).squeeze(1)
        
        # Use the specific label field
        loss = criterion(predictions, getattr(batch, label_field))
        
        acc = binary_accuracy(predictions, getattr(batch, label_field))
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion, label_field, target_field):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(getattr(batch, target_field)).squeeze(1)
            
            # Use the specific label field
            loss = criterion(predictions, getattr(batch, label_field))
            
            acc = binary_accuracy(predictions, getattr(batch, label_field))

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
# # Training Loop

# N_EPOCHS = 5
# label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
# models = [model_IT, model_CL, model_CV]
# optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
# iterators = [(train_iterator_IT, test_iterator_IT), (train_iterator_CL, test_iterator_CL), (train_iterator_CV, test_iterator_CV)]


# for idx, (label_name, model, optimizer, (train_iterator, test_iterator)) in enumerate(zip(label_names, models, optimizers, iterators)):
#     print(f"Training model for {label_name}...")
#     best_valid_loss_IT = float("inf")
#     best_valid_loss_CL = float("inf")
#     best_valid_loss_CV = float("inf")
    
#     for epoch in range(N_EPOCHS):
#         start_time = time.time()
        
        
#         if label_name == label_names[0]:
#             train_loss_IT, train_acc_IT = train(model_IT, train_iterator_IT, optimizers[idx], criterion, label_name, "title")
#             test_loss_IT, test_acc_IT = evaluate(model_IT, test_iterator_IT, criterion, label_name, "title")
#             end_time = time.time()
#             epoch_mins, epoch_secs = epoch_time(start_time, end_time)
#             if test_loss_IT < best_valid_loss_IT:
#                 best_valid_loss_IT = test_loss_IT
#                 torch.save(models[0].state_dict(), "RNN_model_IT.pt")
#             print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
#             print(f'InformationTheory Test Loss: {test_loss_IT:.3f} | Test Acc: {test_acc_IT*100:.2f}%')
#             model_IT.eval()
#             y_predict = []
#             y_test = []
#             with torch.no_grad():
#                 for batch in test_iterator_IT:
#                     predictions = model_IT(batch.title).squeeze(1)
#                     rounded_preds = torch.round(torch.sigmoid(predictions))
#                     y_predict += rounded_preds.tolist()
#                     y_test += batch.InformationTheory.tolist()
                       
#         elif label_name == label_names[1]:
#             train_loss_CL, train_acc_CL = train(model_CL, train_iterator_CL, optimizers[idx], criterion, label_name, "title")
#             test_loss_CL, test_acc_CL = evaluate(model_CL, test_iterator_CL, criterion, label_name, "title")
#             end_time = time.time()
#             epoch_mins, epoch_secs = epoch_time(start_time, end_time)
#             if test_loss_CL < best_valid_loss_CL:
#                 best_valid_loss_CL = test_loss_CL
#                 torch.save(models[1].state_dict(), "RNN_model_CL.pt")
#             print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
#             print(f'ComputationalLinguistics Test Loss: {test_loss_CL:.3f} | Test Acc: {test_acc_CL*100:.2f}%')

           
#         elif label_name == label_names[2]:
#             train_loss_CV, train_acc_CV = train(model_CV, train_iterator_CV, optimizers[2], criterion, label_name, "title")
#             test_loss_CV, test_acc_CV = evaluate(model_CV, test_iterator_CV, criterion, label_name, "title")
#             end_time = time.time()
#             epoch_mins, epoch_secs = epoch_time(start_time, end_time)
#             if test_loss_CV < best_valid_loss_CV:
#                 best_valid_loss_CV = test_loss_CV
#                 torch.save(models[2].state_dict(), "RNN_model_CV.pt")
#             print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
#             print(f'ComputerVision Test Loss: {test_loss_CV:.3f} | Test Acc: {test_acc_CV*100:.2f}%')


In [None]:
# # Training Loop

# N_EPOCHS = 5
# label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
# models = [model_IT, model_CL, model_CV]
# optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
# iterators = [(train_iterator_IT, test_iterator_IT), (train_iterator_CL, test_iterator_CL), (train_iterator_CV, test_iterator_CV)]

# best_valid_losses = [float("inf"), float("inf"), float("inf")]
# model_file_names = ["RNN_model_IT.pt", "RNN_model_CL.pt", "RNN_model_CV.pt"]
# target_field = "title"

# for idx, (label_name, model, optimizer, (train_iterator, test_iterator)) in enumerate(zip(label_names, models, optimizers, iterators)):
#     print(f"Training model for {label_name}...")
    
#     for epoch in range(N_EPOCHS):
#         start_time = time.time()

#         train_loss, train_acc = train(model, train_iterator, optimizer, criterion, label_name, target_field)
#         test_loss, test_acc = evaluate(model, test_iterator, criterion, label_name, target_field)
        
#         end_time = time.time()
#         epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
#         if test_loss < best_valid_losses[idx]:
#             best_valid_losses[idx] = test_loss
#             torch.save(model.state_dict(), model_file_names[idx])

#         print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
#         print(f'{label_name} Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
        
#         # For predictions and ground truth collection
#         model.eval()
#         y_predict = []
#         y_test = []
#         with torch.no_grad():
#             for batch in test_iterator:
#                 predictions = model(getattr(batch, target_field)).squeeze(1)
#                 rounded_preds = torch.round(torch.sigmoid(predictions))
#                 y_predict += rounded_preds.tolist()
#                 y_test += getattr(batch, label_name).tolist()


In [13]:
def train_loop(ModelsList, OptimizersList, IteratorsList, N_EPOCHS, label_names, targe_field, modelFileNames, bestLossesList, preprocess, train_size):

    for idx, (label_name, model, optimizer, (train_iterator, test_iterator)) in enumerate(zip(label_names, ModelsList, OptimizersList, IteratorsList)):
        print(f"Training model for {preprocess}_{label_name}, using {train_size} data of {target_field}...")

        for epoch in range(N_EPOCHS):
            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer, criterion, label_name, target_field)
            test_loss, test_acc = evaluate(model, test_iterator, criterion, label_name, target_field)

            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if test_loss < best_valid_losses[idx]:
                bestLossesList[idx] = test_loss
                torch.save(model.state_dict(), modelFileNames[idx])

            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'{label_name} Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

            # For predictions and ground truth collection
            model.eval()
            y_predict = []
            y_test = []
            with torch.no_grad():
                for batch in test_iterator:
                    predictions = model(getattr(batch, target_field)).squeeze(1)
                    rounded_preds = torch.round(torch.sigmoid(predictions))
                    y_predict += rounded_preds.tolist()
                    y_test += getattr(batch, label_name).tolist()
#     return y_predict, y_test

N_EPOCHS = 5
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, test_iterator_IT), (train_iterator_CL, test_iterator_CL), (train_iterator_CV, test_iterator_CV)]

best_valid_losses = [float("inf"), float("inf"), float("inf")]
model_file_names = ["RNN_model_IT.pt", "RNN_model_CL.pt", "RNN_model_CV.pt"]
target_field = "title"

# train_loop for "title", using top 1000 records of train data and preprocessing method 1
train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", "1000")




Training model for P1_InformationTheory, using 1000 data of title...
Epoch: 01 | Epoch Time: 0m 1s
InformationTheory Test Loss: 0.709 | Test Acc: 53.10%
Epoch: 02 | Epoch Time: 0m 1s
InformationTheory Test Loss: 0.727 | Test Acc: 53.11%
Epoch: 03 | Epoch Time: 0m 1s
InformationTheory Test Loss: 0.744 | Test Acc: 53.12%
Epoch: 04 | Epoch Time: 0m 1s
InformationTheory Test Loss: 0.757 | Test Acc: 53.13%
Epoch: 05 | Epoch Time: 0m 1s
InformationTheory Test Loss: 0.768 | Test Acc: 53.14%
Training model for P1_ComputationalLinguistics, using 1000 data of title...
Epoch: 01 | Epoch Time: 0m 1s
ComputationalLinguistics Test Loss: 0.576 | Test Acc: 80.35%
Epoch: 02 | Epoch Time: 0m 1s
ComputationalLinguistics Test Loss: 0.546 | Test Acc: 80.43%
Epoch: 03 | Epoch Time: 0m 1s
ComputationalLinguistics Test Loss: 0.528 | Test Acc: 80.50%
Epoch: 04 | Epoch Time: 0m 1s
ComputationalLinguistics Test Loss: 0.517 | Test Acc: 80.62%
Epoch: 05 | Epoch Time: 0m 1s
ComputationalLinguistics Test Loss: 0.510

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
import numpy as np

def evaluate_model(model, iterator, label_field):
    y_predict = []
    y_test = []
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.title).squeeze(1)
            rounded_preds = torch.round(torch.sigmoid(predictions))
            y_predict += rounded_preds.tolist()
            y_test += getattr(batch, label_field).tolist()

    y_predict = np.asarray(y_predict)
    y_test = np.asarray(y_test)

    # Compute metrics
    recall = recall_score(y_test, y_predict, average='macro')
    precision = precision_score(y_test, y_predict, average='macro')
    f1score = f1_score(y_test, y_predict, average='macro')
    accuracy = accuracy_score(y_test, y_predict)
    matthews = matthews_corrcoef(y_test, y_predict)

    # Print metrics
    print(f"{label_field}:")
    print(confusion_matrix(y_test, y_predict))
    print('Accuracy:', accuracy)
    print('Macro Precision:', precision)
    print('Macro Recall:', recall)
    print('Macro F1 score:', f1score)
    print('MCC:', matthews)
    print("\n")

# Evaluate models
evaluate_model(model_IT, test_iterator_IT, "InformationTheory")
evaluate_model(model_CL, test_iterator_CL, "ComputationalLinguistics")
evaluate_model(model_CV, test_iterator_CV, "ComputerVision")

In [15]:
TEXT_P2 = Field(sequential=True, tokenize="spacy", lower=True)
LABEL_P2 = LabelField(dtype=torch.float)

# Field - P2
train_datafield_P2 = [("title", TEXT_P2),
                      ("abstract", None),
                      ("InformationTheory", LABEL_P2),
                      ("ComputationalLinguistics", LABEL_P2),
                      ("ComputerVision", LABEL_P2)
                      ]

train_data_P2, test_data_P2 = TabularDataset.splits(
    path="./",
    train="train.csv", test="test.csv", format="csv",
    skip_header=True, fields=train_datafield_P2)

train_data_1000_P2, valid_data_P2 = split_dataset(train_data_P2, 1000)

# Building vocab - P2
MAX_VOCAB_SIZE = 1000

TEXT_P2.build_vocab(train_data_1000_P2, max_size=MAX_VOCAB_SIZE)
LABEL_P2.build_vocab(train_data_1000_P2)

# Create iterator #2 for P2 train_data

train_iterator_IT_P2, validation_IT_P2, test_iterator_IT_P2 = generate_label_iterator(train_data_1000_P2, "InformationTheory", validation=True)
train_iterator_CL_P2, validation_CL_P2, test_iterator_CL_P2 = generate_label_iterator(train_data_1000_P2, "ComputationalLinguistics", validation=True)
train_iterator_CV_P2, validation_CV_P2, test_iterator_CV_P2 = generate_label_iterator(train_data_1000_P2, "ComputerVision", validation=True)

model_IT_P2, optimizer_IT_P2 = generate_model_and_optimizer()
model_CL_P2, optimizer_CL_P2 = generate_model_and_optimizer()
model_CV_P2, optimizer_CV_P2 = generate_model_and_optimizer()

# Training Loop - P2

models_P2 = [model_IT_P2, model_CL_P2, model_CV_P2]
optimizers_P2 = [optimizer_IT_P2, optimizer_CL_P2, optimizer_CV_P2]
iterators_P2 = [(train_iterator_IT_P2, test_iterator_IT_P2), (train_iterator_CL_P2, test_iterator_CL_P2), (train_iterator_CV_P2, test_iterator_CV_P2)]

best_valid_losses_P2 = [float("inf"), float("inf"), float("inf")]
model_file_names_P2 = ["RNN_model_IT_P2.pt", "RNN_model_CL_P2.pt", "RNN_model_CV_P2.pt"]

train_loop(models_P2, optimizers_P2, iterators_P2, N_EPOCHS, label_names, target_field, model_file_names_P2, best_valid_losses_P2, "P2", "1000")


Training model for P2_InformationTheory, using 1000 data of title...
Epoch: 01 | Epoch Time: 0m 1s
InformationTheory Test Loss: 0.697 | Test Acc: 53.11%
Epoch: 02 | Epoch Time: 0m 1s
InformationTheory Test Loss: 0.710 | Test Acc: 53.06%
Epoch: 03 | Epoch Time: 0m 1s
InformationTheory Test Loss: 0.725 | Test Acc: 53.06%
Epoch: 04 | Epoch Time: 0m 1s
InformationTheory Test Loss: 0.740 | Test Acc: 53.05%
Epoch: 05 | Epoch Time: 0m 1s
InformationTheory Test Loss: 0.751 | Test Acc: 53.02%
Training model for P2_ComputationalLinguistics, using 1000 data of title...
Epoch: 01 | Epoch Time: 0m 1s
ComputationalLinguistics Test Loss: 0.591 | Test Acc: 80.95%
Epoch: 02 | Epoch Time: 0m 1s
ComputationalLinguistics Test Loss: 0.555 | Test Acc: 81.00%
Epoch: 03 | Epoch Time: 0m 1s
ComputationalLinguistics Test Loss: 0.534 | Test Acc: 81.01%
Epoch: 04 | Epoch Time: 0m 1s
ComputationalLinguistics Test Loss: 0.521 | Test Acc: 81.00%
Epoch: 05 | Epoch Time: 0m 1s
ComputationalLinguistics Test Loss: 0.512

In [None]:

# Evaluate models
evaluate_model(model_IT_P2, test_iterator_IT_P2, "InformationTheory")
evaluate_model(model_CL_P2, test_iterator_CL_P2, "ComputationalLinguistics")
evaluate_model(model_CV_P2, test_iterator_CV_P2, "ComputerVision")

In [None]:
# Using all whole dataset

In [22]:
TEXT = data.Field(sequential=True, tokenize=custom_tokenizer, lower=False)
# TEXT = Field(sequential = True, tokenize = "spacy", lower = True)
LABEL = LabelField(dtype = torch.float)
 
train_datafield = [("title", TEXT), 
                   ("abstract", None),
                   ("InformationTheory", LABEL), 
                   ("ComputationalLinguistics", LABEL),
                   ("ComputerVision", LABEL)
                   ]

train_data, test_data = TabularDataset.splits(
    path = "./",
    train = "train.csv", test = "test.csv", format = "csv",
    skip_header = True, fields = train_datafield)

MAX_VOCAB_SIZE = 1000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

# Run generate_label_iterator functions

train_iterator_IT, test_iterator_IT = generate_label_iterator(train_data, "InformationTheory", validation = False)
train_iterator_CL, test_iterator_CL = generate_label_iterator(train_data, "ComputationalLinguistics", validation = False)
train_iterator_CV, test_iterator_CV = generate_label_iterator(train_data, "ComputerVision", validation = False)

model_IT, optimizer_IT = generate_model_and_optimizer()
model_CL, optimizer_CL = generate_model_and_optimizer()
model_CV, optimizer_CV = generate_model_and_optimizer()

criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [23]:
# Training Loop

models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, test_iterator_IT), (train_iterator_CL, test_iterator_CL), (train_iterator_CV, test_iterator_CV)]


best_valid_losses = [float("inf"), float("inf"), float("inf")]
model_file_names = ["RNN_model_IT.pt", "RNN_model_CL.pt", "RNN_model_CV.pt"]

train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", "ALL")

Training model for P1_InformationTheory, using ALL data of title...
Epoch: 01 | Epoch Time: 0m 25s
InformationTheory Test Loss: 0.806 | Test Acc: 53.06%
Epoch: 02 | Epoch Time: 0m 23s
InformationTheory Test Loss: 0.814 | Test Acc: 53.04%
Epoch: 03 | Epoch Time: 0m 23s
InformationTheory Test Loss: 0.817 | Test Acc: 53.06%
Epoch: 04 | Epoch Time: 0m 23s
InformationTheory Test Loss: 0.814 | Test Acc: 53.08%
Epoch: 05 | Epoch Time: 0m 24s
InformationTheory Test Loss: 0.817 | Test Acc: 53.08%
Training model for P1_ComputationalLinguistics, using ALL data of title...
Epoch: 01 | Epoch Time: 0m 23s
ComputationalLinguistics Test Loss: 0.492 | Test Acc: 81.07%
Epoch: 02 | Epoch Time: 0m 23s
ComputationalLinguistics Test Loss: 0.492 | Test Acc: 81.15%
Epoch: 03 | Epoch Time: 0m 23s
ComputationalLinguistics Test Loss: 0.492 | Test Acc: 81.23%
Epoch: 04 | Epoch Time: 0m 23s
ComputationalLinguistics Test Loss: 0.492 | Test Acc: 81.25%
Epoch: 05 | Epoch Time: 0m 23s
ComputationalLinguistics Test Los

In [21]:
# # Training Loop

# N_EPOCHS = 5
# label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
# models = [model_IT, model_CL, model_CV]
# optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
# iterators = [(train_iterator_IT, test_iterator_IT), (train_iterator_CL, test_iterator_CL), (train_iterator_CV, test_iterator_CV)]


# for idx, (label_name, model, optimizer, (train_iterator, test_iterator)) in enumerate(zip(label_names, models, optimizers, iterators)):
#     print(f"Training model for {label_name}...")
#     best_valid_loss_IT = float("inf")
#     best_valid_loss_CL = float("inf")
#     best_valid_loss_CV = float("inf")
    
#     for epoch in range(N_EPOCHS):
#         start_time = time.time()
        
        
#         if label_name == label_names[0]:
#             train_loss_IT, train_acc_IT = train(model_IT, train_iterator_IT, optimizers[idx], criterion, label_name, "title")
#             test_loss_IT, test_acc_IT = evaluate(model_IT, test_iterator_IT, criterion, label_name, "title")
#             end_time = time.time()
#             epoch_mins, epoch_secs = epoch_time(start_time, end_time)
#             if test_loss_IT < best_valid_loss_IT:
#                 best_valid_loss_IT = test_loss_IT
#                 torch.save(models[0].state_dict(), "RNN_model_IT.pt")
#             print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
#             print(f'InformationTheory Test Loss: {test_loss_IT:.3f} | Test Acc: {test_acc_IT*100:.2f}%')
#             model_IT.eval()
#             y_predict = []
#             y_test = []
#             with torch.no_grad():
#                 for batch in test_iterator_IT:
#                     predictions = model_IT(batch.title).squeeze(1)
#                     rounded_preds = torch.round(torch.sigmoid(predictions))
#                     y_predict += rounded_preds.tolist()
#                     y_test += batch.InformationTheory.tolist()
                       
#         elif label_name == label_names[1]:
#             train_loss_CL, train_acc_CL = train(model_CL, train_iterator_CL, optimizers[idx], criterion, label_name, "title")
#             test_loss_CL, test_acc_CL = evaluate(model_CL, test_iterator_CL, criterion, label_name, "title")
#             end_time = time.time()
#             epoch_mins, epoch_secs = epoch_time(start_time, end_time)
#             if test_loss_CL < best_valid_loss_CL:
#                 best_valid_loss_CL = test_loss_CL
#                 torch.save(models[1].state_dict(), "RNN_model_CL.pt")
#             print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
#             print(f'ComputationalLinguistics Test Loss: {test_loss_CL:.3f} | Test Acc: {test_acc_CL*100:.2f}%')

           
#         elif label_name == label_names[2]:
#             train_loss_CV, train_acc_CV = train(model_CV, train_iterator_CV, optimizers[2], criterion, label_name, "title")
#             test_loss_CV, test_acc_CV = evaluate(model_CV, test_iterator_CV, criterion, label_name, "title")
#             end_time = time.time()
#             epoch_mins, epoch_secs = epoch_time(start_time, end_time)
#             if test_loss_CV < best_valid_loss_CV:
#                 best_valid_loss_CV = test_loss_CV
#                 torch.save(models[2].state_dict(), "RNN_model_CV.pt")
#             print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
#             print(f'ComputerVision Test Loss: {test_loss_CV:.3f} | Test Acc: {test_acc_CV*100:.2f}%')


Training model for InformationTheory...
Epoch: 01 | Epoch Time: 0m 23s
InformationTheory Test Loss: 0.803 | Test Acc: 53.09%
Epoch: 02 | Epoch Time: 0m 26s
InformationTheory Test Loss: 0.812 | Test Acc: 53.08%
Epoch: 03 | Epoch Time: 0m 23s
InformationTheory Test Loss: 0.817 | Test Acc: 53.07%
Epoch: 04 | Epoch Time: 0m 24s
InformationTheory Test Loss: 0.812 | Test Acc: 53.05%
Epoch: 05 | Epoch Time: 0m 23s
InformationTheory Test Loss: 0.815 | Test Acc: 53.06%
Training model for ComputationalLinguistics...
Epoch: 01 | Epoch Time: 0m 24s
ComputationalLinguistics Test Loss: 0.493 | Test Acc: 80.97%
Epoch: 02 | Epoch Time: 0m 23s
ComputationalLinguistics Test Loss: 0.492 | Test Acc: 81.14%
Epoch: 03 | Epoch Time: 0m 26s
ComputationalLinguistics Test Loss: 0.492 | Test Acc: 81.22%
Epoch: 04 | Epoch Time: 0m 26s
ComputationalLinguistics Test Loss: 0.492 | Test Acc: 81.26%
Epoch: 05 | Epoch Time: 0m 25s
ComputationalLinguistics Test Loss: 0.493 | Test Acc: 81.29%
Training model for ComputerV

In [None]:
TEXT_P2 = Field(sequential=True, tokenize="spacy", lower=True)
LABEL_P2 = LabelField(dtype=torch.float)

# Field - P2
train_datafield_P2 = [("title", TEXT_P2),
                      ("abstract", None),
                      ("InformationTheory", LABEL_P2),
                      ("ComputationalLinguistics", LABEL_P2),
                      ("ComputerVision", LABEL_P2)
                      ]

train_data_P2, test_data_P2 = TabularDataset.splits(
    path="./",
    train="train.csv", test="test.csv", format="csv",
    skip_header=True, fields=train_datafield_P2)


# Building vocab - P2
MAX_VOCAB_SIZE = 1000

TEXT_P2.build_vocab(train_data_P2, max_size=MAX_VOCAB_SIZE)
LABEL_P2.build_vocab(train_data_P2)

# Create iterator #2 for P2 train_data

train_iterator_IT_P2, test_iterator_IT_P2 = generate_label_iterator(train_data_1000_P2, "InformationTheory", validation=False)
train_iterator_CL_P2, test_iterator_CL_P2 = generate_label_iterator(train_data_1000_P2, "ComputationalLinguistics", validation=False)
train_iterator_CV_P2, test_iterator_CV_P2 = generate_label_iterator(train_data_1000_P2, "ComputerVision", validation=False)

model_IT_P2, optimizer_IT_P2 = generate_model_and_optimizer()
model_CL_P2, optimizer_CL_P2 = generate_model_and_optimizer()
model_CV_P2, optimizer_CV_P2 = generate_model_and_optimizer()

# Training Loop - P2

models_P2 = [model_IT_P2, model_CL_P2, model_CV_P2]
optimizers_P2 = [optimizer_IT_P2, optimizer_CL_P2, optimizer_CV_P2]
iterators_P2 = [(train_iterator_IT_P2, test_iterator_IT_P2), (train_iterator_CL_P2, test_iterator_CL_P2), (train_iterator_CV_P2, test_iterator_CV_P2)]

best_valid_losses_P2 = [float("inf"), float("inf"), float("inf")]
model_file_names_P2 = ["RNN_model_IT_P2.pt", "RNN_model_CL_P2.pt", "RNN_model_CV_P2.pt"]

for idx, (label_name, model, optimizer, (train_iterator, test_iterator)) in enumerate(zip(label_names, models_P2, optimizers_P2, iterators_P2)):
    print(f"Training model for P2_{label_name}...")
    
    for epoch in range(N_EPOCHS):
        start_time = time.time()

        train_loss, train_acc = train(model, train_iterator, optimizer, criterion, label_name, target_field)
        test_loss, test_acc = evaluate(model, test_iterator, criterion, label_name, target_field)
        
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
        if test_loss < best_valid_losses_P2[idx]:
            best_valid_losses_P2[idx] = test_loss
            torch.save(model.state_dict(), model_file_names_P2[idx])

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'{label_name} Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
            # For predictions and ground truth collection
        model.eval()
        y_predict_P2 = []
        y_test_P2 = []
        with torch.no_grad():
            for batch in test_iterator:
                predictions = model(getattr(batch, target_field)).squeeze(1)
                rounded_preds = torch.round(torch.sigmoid(predictions))
                y_predict_P2 += rounded_preds.tolist()
                y_test_P2 += getattr(batch, label_name).tolist()


In [None]:
TEXT = Field(sequential = True, tokenize = "spacy", lower = True)
LABEL = LabelField(dtype = torch.float)
 
train_datafield = [("title", None), 
                   ("abstract", TEXT),
                   ("InformationTheory", LABEL), 
                   ("ComputationalLinguistics", LABEL),
                   ("ComputerVision", LABEL)
                   ]

train_data, test_data = TabularDataset.splits(
    path = "./",
    train = "train.csv", test = "test.csv", format = "csv",
    skip_header = True, fields = train_datafield)

train_data_1000, valid_data = split_dataset(train_data, 1000)
MAX_VOCAB_SIZE = 1000

TEXT.build_vocab(train_data_1000, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data_1000)

# Run generate_label_iterator functions

train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data_1000, "InformationTheory", validation = True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data_1000, "ComputationalLinguistics", validation = True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data_1000, "ComputerVision", validation = True)

model_IT, optimizer_IT = generate_model_and_optimizer()
model_CL, optimizer_CL = generate_model_and_optimizer()
model_CV, optimizer_CV = generate_model_and_optimizer()

criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [None]:
# Training Loop

N_EPOCHS = 5
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, test_iterator_IT), (train_iterator_CL, test_iterator_CL), (train_iterator_CV, test_iterator_CV)]


for idx, (label_name, model, optimizer, (train_iterator, test_iterator)) in enumerate(zip(label_names, models, optimizers, iterators)):
    print(f"Training model for {label_name}...")
    best_valid_loss_IT = float("inf")
    best_valid_loss_CL = float("inf")
    best_valid_loss_CV = float("inf")
    
    target_field = "abstract"
    for epoch in range(N_EPOCHS):
        start_time = time.time()
        
        
        if label_name == label_names[0]:
            train_loss_IT, train_acc_IT = train(model_IT, train_iterator_IT, optimizers[idx], criterion, label_name, target_field)
            test_loss_IT, test_acc_IT = evaluate(model_IT, test_iterator_IT, criterion, label_name, "abstract")
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            if test_loss_IT < best_valid_loss_IT:
                best_valid_loss_IT = test_loss_IT
                torch.save(models[0].state_dict(), "RNN_model_IT.pt")
            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'InformationTheory Test Loss: {test_loss_IT:.3f} | Test Acc: {test_acc_IT*100:.2f}%')
            model_IT.eval()
            y_predict = []
            y_test = []
            with torch.no_grad():
                for batch in test_iterator_IT:
                    predictions = model_IT(batch.abstract).squeeze(1)
                    rounded_preds = torch.round(torch.sigmoid(predictions))
                    y_predict += rounded_preds.tolist()
                    y_test += batch.InformationTheory.tolist()
                       
        elif label_name == label_names[1]:
            train_loss_CL, train_acc_CL = train(model_CL, train_iterator_CL, optimizers[idx], criterion, label_name, "abstract")
            test_loss_CL, test_acc_CL = evaluate(model_CL, test_iterator_CL, criterion, label_name, "abstract")
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            if test_loss_CL < best_valid_loss_CL:
                best_valid_loss_CL = test_loss_CL
                torch.save(models[1].state_dict(), "RNN_model_CL.pt")
            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'ComputationalLinguistics Test Loss: {test_loss_CL:.3f} | Test Acc: {test_acc_CL*100:.2f}%')

           
        elif label_name == label_names[2]:
            train_loss_CV, train_acc_CV = train(model_CV, train_iterator_CV, optimizers[2], criterion, label_name, "abstract")
            test_loss_CV, test_acc_CV = evaluate(model_CV, test_iterator_CV, criterion, label_name, "abstract")
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            if test_loss_CV < best_valid_loss_CV:
                best_valid_loss_CV = test_loss_CV
                torch.save(models[2].state_dict(), "RNN_model_CV.pt")
            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'ComputerVision Test Loss: {test_loss_CV:.3f} | Test Acc: {test_acc_CV*100:.2f}%')


In [None]:
# Training Loop

N_EPOCHS = 5
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, test_iterator_IT), (train_iterator_CL, test_iterator_CL), (train_iterator_CV, test_iterator_CV)]

best_valid_losses = [float("inf"), float("inf"), float("inf")]
model_file_names = ["RNN_model_IT.pt", "RNN_model_CL.pt", "RNN_model_CV.pt"]

for idx, (label_name, model, optimizer, (train_iterator, test_iterator)) in enumerate(zip(label_names, models, optimizers, iterators)):
    print(f"Training model for {label_name}...")
    target_field = "abstract"
    for epoch in range(N_EPOCHS):
        start_time = time.time()

        train_loss, train_acc = train(model, train_iterator, optimizer, criterion, label_name, target_field)
        test_loss, test_acc = evaluate(model, test_iterator, criterion, label_name, target_field)
        
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
        if test_loss < best_valid_losses[idx]:
            best_valid_losses[idx] = test_loss
            torch.save(model.state_dict(), model_file_names[idx])

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'{label_name} Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')


In [None]:
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [None]:
# Run generate_label_iterator functions

train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, "InformationTheory", validation = True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, "ComputationalLinguistics", validation = True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, "ComputerVision", validation = True)

In [None]:
model_IT, optimizer_IT = generate_model_and_optimizer()
model_CL, optimizer_CL = generate_model_and_optimizer()
model_CV, optimizer_CV = generate_model_and_optimizer()

criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [None]:
# Training Loop

N_EPOCHS = 5
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, test_iterator_IT), (train_iterator_CL, test_iterator_CL), (train_iterator_CV, test_iterator_CV)]


for idx, (label_name, model, optimizer, (train_iterator, test_iterator)) in enumerate(zip(label_names, models, optimizers, iterators)):
    print(f"Training model for {label_name}...")
    best_valid_loss_IT = float("inf")
    best_valid_loss_CL = float("inf")
    best_valid_loss_CV = float("inf")
    
    target_field = "abstract"
    for epoch in range(N_EPOCHS):
        start_time = time.time()
        
        
        if label_name == label_names[0]:
            train_loss_IT, train_acc_IT = train(model_IT, train_iterator_IT, optimizers[idx], criterion, label_name, target_field)
            test_loss_IT, test_acc_IT = evaluate(model_IT, test_iterator_IT, criterion, label_name, "abstract")
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            if test_loss_IT < best_valid_loss_IT:
                best_valid_loss_IT = test_loss_IT
                torch.save(models[0].state_dict(), "RNN_model_IT.pt")
            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'InformationTheory Test Loss: {test_loss_IT:.3f} | Test Acc: {test_acc_IT*100:.2f}%')
            model_IT.eval()
            y_predict = []
            y_test = []
            with torch.no_grad():
                for batch in test_iterator_IT:
                    predictions = model_IT(batch.abstract).squeeze(1)
                    rounded_preds = torch.round(torch.sigmoid(predictions))
                    y_predict += rounded_preds.tolist()
                    y_test += batch.InformationTheory.tolist()
                       
        elif label_name == label_names[1]:
            train_loss_CL, train_acc_CL = train(model_CL, train_iterator_CL, optimizers[idx], criterion, label_name, "abstract")
            test_loss_CL, test_acc_CL = evaluate(model_CL, test_iterator_CL, criterion, label_name, "abstract")
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            if test_loss_CL < best_valid_loss_CL:
                best_valid_loss_CL = test_loss_CL
                torch.save(models[1].state_dict(), "RNN_model_CL.pt")
            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'ComputationalLinguistics Test Loss: {test_loss_CL:.3f} | Test Acc: {test_acc_CL*100:.2f}%')

           
        elif label_name == label_names[2]:
            train_loss_CV, train_acc_CV = train(model_CV, train_iterator_CV, optimizers[2], criterion, label_name, "abstract")
            test_loss_CV, test_acc_CV = evaluate(model_CV, test_iterator_CV, criterion, label_name, "abstract")
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            if test_loss_CV < best_valid_loss_CV:
                best_valid_loss_CV = test_loss_CV
                torch.save(models[2].state_dict(), "RNN_model_CV.pt")
            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'ComputerVision Test Loss: {test_loss_CV:.3f} | Test Acc: {test_acc_CV*100:.2f}%')
