In [1]:
# !pip3 install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
# !pip3 install torchtext==0.10.0

In [3]:
# Import Libraries
import torch
import torchtext

from torchtext.legacy import data
import torch.nn as nn
from torchtext.legacy.data import Field, LabelField, TabularDataset, Dataset
import copy

import torch.optim as optim
import collections
import random
import time
import pandas as pd
import re
import spacy
from nltk.stem import PorterStemmer

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# check for device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Current GPU: {torch.cuda.current_device()} - {torch.cuda.get_device_name(torch.cuda.current_device())}, device type: {device}")


Current GPU: 0 - NVIDIA GeForce RTX 3070 Laptop GPU, device type: cuda


In [5]:
import os
import random
os.getcwd()

'C:\\Users\\Sunny\\Desktop\\Master\\Sem3\\5212\\A1\\Dataset_Assignment1'

In [6]:
# Set up random seed

SEED = 1

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [7]:
# Start with working on "title" as TEXT, using top 1000 records of train_data as training data

# Set up field & loading datasets

# Import STOP_WORDS from spaCy
spacy_en = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS

# Create first pre-processing method P1
# P1 aimms to remove all stop_words in TEXT Field
def custom_tokenizer(text):
  
    return [tok.text for tok in spacy_en.tokenizer(text) if tok.text.lower() not in STOP_WORDS]

ps = PorterStemmer()

def custom_tokenizer_P2(text):
    
    return [ps.stem(tok.text) for tok in spacy_en.tokenizer(text)]
# Create Field
TEXT = data.Field(sequential=True, tokenize=custom_tokenizer, lower=False)
LABEL = data.LabelField(dtype=torch.float, use_vocab = False, preprocessing = int)
 
    
train_datafield = [("title", TEXT), 
                   ("abstract", None),
                   ("InformationTheory", LABEL), 
                   ("ComputationalLinguistics", LABEL),
                   ("ComputerVision", LABEL)
                   ]



# Dataset - P1
train_data_whole, test_data = TabularDataset.splits(
    path = "./",
    train = "train.csv", test = "test.csv", format = "csv",
    skip_header = True, fields = train_datafield)

In [8]:
train_data, valid_data = train_data_whole.split(split_ratio = 0.9, random_state = random.getstate())
train_1000, remaining = train_data_whole.split(split_ratio = 1000 / len(train_data_whole), random_state = random.getstate())
train_1000, valid_1000 = train_1000.split(split_ratio = 0.9, random_state = random.getstate())



In [9]:
# Building vocab
MAX_VOCAB_SIZE = 5000

TEXT.build_vocab(train_1000, max_size = MAX_VOCAB_SIZE)

In [10]:
# Create iterator #1 for small train_data

BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
def preprocess_target_label(target_field, validation):
    for example in train_data:
        setattr(example, f"label_{target_field}", getattr(example, target_field))
        
    for example in test_data:
        setattr(example, f"label_{target_field}", getattr(example, target_field))

    if validation:
        for example in valid_data:
            setattr(example, f"label_{target_field}", getattr(example, target_field))

def generate_label_iterator(dataset, target, field_name, validation = True):
    preprocess_target_label(target, validation)
    
    label_attr = f"label_{target}"
    if validation:
        iterators = data.BucketIterator.splits(
            (dataset, valid_data, test_data),
            batch_size = BATCH_SIZE,
            device = device,
            sort_key = lambda x: len(getattr(x, field_name)),
            sort_within_batch = False)
        return iterators[0], iterators[1], iterators[2]
    else:
        iterators = data.BucketIterator.splits(
            (dataset, test_data),
            batch_size = BATCH_SIZE,
            device = device,
            sort_key = lambda x: len(getattr(x, field_name)),
            sort_within_batch = False)
        return iterators[0], iterators[1]

cuda


In [11]:
# Run generate_label_iterator functions

target_field = "title"
train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_1000, "InformationTheory", target_field, validation = True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_1000, "ComputationalLinguistics", target_field, validation = True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_1000, "ComputerVision", target_field, validation = True)

In [12]:
# Define RNN


class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pos_weight = None):
        
        super(RNN, self).__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.pos_weight = pos_weight

        if self.pos_weight is not None:
            self.criterion = nn.BCEWithLogitsLoss(pos_weight=self.pos_weight)
        else:
            self.criterion = nn.BCEWithLogitsLoss()

    def forward(self, text):

        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)

        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))


In [13]:
# Initialize counts
negative_IT_count, positive_IT_count = 0, 0
negative_CL_count, positive_CL_count = 0, 0
negative_CV_count, positive_CV_count = 0, 0

# Iterate through the dataset
for example in train_1000.examples:
    if getattr(example, "InformationTheory") == 0:
        negative_IT_count += 1
    else:
        positive_IT_count += 1

    if getattr(example, "ComputationalLinguistics") == 0:
        negative_CL_count += 1
    else:
        positive_CL_count += 1

    if getattr(example, "ComputerVision") == 0:
        negative_CV_count += 1
    else:
        positive_CV_count += 1

In [14]:
print(negative_CV_count, positive_CV_count)
print(negative_IT_count, positive_IT_count)
print(negative_CL_count, positive_CL_count)

439 461
682 218
679 221


In [15]:
pos_weight_IT = torch.tensor([negative_IT_count / positive_IT_count])
pos_weight_CL = torch.tensor([negative_CL_count / positive_CL_count])
pos_weight_CV = torch.tensor([negative_CV_count / positive_CV_count])

In [16]:
# Model initialization & optimizer
def generate_model_and_optimizer(pos_weight, embedding_dim=100, hidden_dim=256, output_dim=1, lr=1e-2):
    INPUT_DIM = len(TEXT.vocab)

    model = RNN(INPUT_DIM, embedding_dim, hidden_dim, output_dim, pos_weight)

    optimizer = optim.SGD(model.parameters(), lr=lr)

    model = model.to(device)
    
    return model, optimizer

model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight = pos_weight_IT)
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight = pos_weight_CL)
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight = pos_weight_CV)



In [17]:
# Evaluation functions

def binary_accuracy(preds, y):

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion, label_field, target_field):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:

        optimizer.zero_grad()
                
        predictions = model(getattr(batch, target_field)).squeeze(1)
        
        # Use the specific label field
        loss = criterion(predictions, getattr(batch, label_field))
        
        acc = binary_accuracy(predictions, getattr(batch, label_field))
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion, label_field, target_field):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(getattr(batch, target_field)).squeeze(1)
            
            # Use the specific label field
            loss = criterion(predictions, getattr(batch, label_field))
            
            acc = binary_accuracy(predictions, getattr(batch, label_field))

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [18]:
def train_loop(ModelsList, OptimizersList, IteratorsList, N_EPOCHS, label_names, target_field, modelFileNames, bestLossesList, preprocess, train_size, patience=5):
    result = []
    best_model_states = []
    best_epochs = [0, 0, 0]
    for idx, (label_name, model, optimizer, iterators_) in enumerate(zip(label_names, ModelsList, OptimizersList, IteratorsList)):
        print(f"Training model for {preprocess}_{label_name}, using {train_size} data of {target_field}...")

        bad_epochs = 0
        best_model_state = None
        best_epoch = 0

        for epoch in range(N_EPOCHS):
            start_time = time.time()

            train_iterator = iterators_[0]
            test_iterator = iterators_[2] if len(iterators_) > 2 else iterators_[1]
            valid_iterator = iterators_[1] if len(iterators_) > 2 else None

            train_loss, train_acc = train(model, train_iterator, optimizer, model.criterion, label_name, target_field)

            if valid_iterator:
                valid_loss, valid_acc = evaluate(model, valid_iterator, model.criterion, label_name, target_field)

            test_loss, test_acc = evaluate(model, test_iterator, model.criterion, label_name, target_field)

            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if test_loss < bestLossesList[idx]:
                bestLossesList[idx] = test_loss
                best_model_state = copy.deepcopy(model.state_dict())
#                 best_model_state = model.state_dict()
                bad_epochs = 0
                best_epochs[idx] = epoch + 1

            else:
                bad_epochs += 1

            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'{label_name} Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

            if valid_iterator:
                print(f'{label_name} Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')

            print(f'{label_name} Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

            if bad_epochs > patience:
                print(f"Early stopping at epoch {epoch+1} for {label_name} model.")
#                 model.load_state_dict(best_model_state)
                print(f"Best performing epoch for {label_name} model: {best_epochs[idx]}")

                break

            # For predictions and ground truth collection
            model.eval()
            y_predict, y_test = [], []
            with torch.no_grad():
                for batch in test_iterator:
                    predictions = model(getattr(batch, target_field)).squeeze(1)
                    rounded_preds = torch.round(torch.sigmoid(predictions))
                    y_predict += rounded_preds.tolist()
                    y_test += getattr(batch, label_name).tolist()
        
        best_model_states.append(best_model_state)

        result.append((y_predict, y_test))
        
    return result, best_model_states



In [19]:
# def train_loop(ModelsList, OptimizersList, IteratorsList, N_EPOCHS, label_names, target_field, modelFileNames, bestLossesList, preprocess, train_size, patience=5):
#     result = []
#     for idx, (label_name, model, optimizer, iterators_) in enumerate(zip(label_names, ModelsList, OptimizersList, IteratorsList)):
#         print(f"Training model for {preprocess}_{label_name}, using {train_size} data of {target_field}...")

#         bad_epochs = 0
#         best_model_state = None
#         best_epoch = 1

#         for epoch in range(N_EPOCHS):
#             start_time = time.time()

#             train_iterator = iterators_[0]
#             test_iterator = iterators_[2] if len(iterators_) > 2 else iterators_[1]
#             valid_iterator = iterators_[1] if len(iterators_) > 2 else None

#             train_loss, train_acc = train(model, train_iterator, optimizer, model.criterion, label_name, target_field)

#             if valid_iterator:
#                 valid_loss, valid_acc = evaluate(model, valid_iterator, model.criterion, label_name, target_field)

#             test_loss, test_acc = evaluate(model, test_iterator, model.criterion, label_name, target_field)

#             end_time = time.time()
#             epoch_mins, epoch_secs = epoch_time(start_time, end_time)

#             if test_loss < bestLossesList[idx]:
#                 bestLossesList[idx] = test_loss
#                 best_model_state = model.state_dict()
#                 best_epoch = epoch + 1
#                 bad_epochs = 0
#             else:
#                 bad_epochs += 1

#             print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
#             print(f'{label_name} Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

#             if valid_iterator:
#                 print(f'{label_name} Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')

#             print(f'{label_name} Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

#             if bad_epochs > patience:
#                 print(f"Early stopping at epoch {epoch+1} for {label_name} model.")
                
#                 model = model.load_state_dict(best_model_state)
                
#                 print(f"Best performing epoch for {label_name} model: {best_epoch}")

#                 break

#             # For predictions and ground truth collection
#             model.eval()
#             y_predict, y_test = [], []
#             with torch.no_grad():
#                 for batch in test_iterator:
#                     predictions = model(getattr(batch, target_field)).squeeze(1)
#                     rounded_preds = torch.round(torch.sigmoid(predictions))
#                     y_predict += rounded_preds.tolist()
#                     y_test += getattr(batch, label_name).tolist()
                    
#         result.append((y_predict, y_test))
#     return result




In [20]:
N_EPOCHS = 50
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT_T_P1_1000", "RNN_model_CL_T_P1_1000.pt", "RNN_model_CV_T_P1_1000.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]


In [21]:
results = []

results, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", "1000", 10)

Training model for P1_InformationTheory, using 1000 data of title...
Epoch: 01 | Epoch Time: 0m 2s
InformationTheory Train Loss: 1.059 | Train Acc: 31.35%
InformationTheory Valid Loss: 1.052 | Valid Acc: 58.21%
InformationTheory Test Loss: 1.426 | Test Acc: 52.49%
Epoch: 02 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.029 | Train Acc: 54.17%
InformationTheory Valid Loss: 1.051 | Valid Acc: 59.28%
InformationTheory Test Loss: 1.427 | Test Acc: 52.69%
Epoch: 03 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.055 | Train Acc: 38.12%
InformationTheory Valid Loss: 1.051 | Valid Acc: 62.11%
InformationTheory Test Loss: 1.434 | Test Acc: 52.94%
Epoch: 04 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.039 | Train Acc: 63.65%
InformationTheory Valid Loss: 1.051 | Valid Acc: 61.02%
InformationTheory Test Loss: 1.434 | Test Acc: 53.10%
Epoch: 05 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.035 | Train Acc: 58.65%
InformationTheory Valid Loss: 1.051 | Valid Acc: 61.91%
Infor

In [22]:
# def evaluate_model(model, iterator, label_field, target_field, best_epoch):
def evaluate_model(model, iterator, label_field, target_field):
    
#     print(f"Evaluating {label_field} model at epoch {best_epoch}")

    y_predict = []
    y_test = []
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(getattr(batch, target_field)).squeeze(1)
            rounded_preds = torch.round(torch.sigmoid(predictions))
            y_predict += rounded_preds.tolist()
            y_test += getattr(batch, label_field).tolist()

    y_predict = np.asarray(y_predict)
    y_test = np.asarray(y_test)

    # Compute metrics
    recall = recall_score(y_test, y_predict, average='macro')
    precision = precision_score(y_test, y_predict, average='macro')
    f1score = f1_score(y_test, y_predict, average='macro')
    accuracy = accuracy_score(y_test, y_predict)
    matthews = matthews_corrcoef(y_test, y_predict)

    # Print metrics
    print(f"{label_field}:")
    print(confusion_matrix(y_test, y_predict))
    print('Accuracy:', accuracy)
    print('Macro Precision:', precision)
    print('Macro Recall:', recall)
    print('Macro F1 score:', f1score)
    print('MCC:', matthews)
    print("\n")


In [23]:
model_IT.load_state_dict(best_model_states[0])
model_CL.load_state_dict(best_model_states[1])
model_CV.load_state_dict(best_model_states[2])

# Evaluate models
evaluate_model(model_IT, test_iterator_IT, "InformationTheory", target_field)
evaluate_model(model_CL, test_iterator_CL, "ComputationalLinguistics", target_field)
evaluate_model(model_CV, test_iterator_CV, "ComputerVision", target_field)

InformationTheory:
[[6376 3211]
 [4590 3889]]
Accuracy: 0.5681943983172811
Macro Precision: 0.5645900003339387
Macro Recall: 0.5618649283703301
Macro F1 score: 0.5598532653771587
MCC: 0.12642556298626814


ComputationalLinguistics:
[[12488  2210]
 [ 2855   513]]
Accuracy: 0.7196391010738403
Macro Precision: 0.5011584052452778
Macro Recall: 0.5009776606056571
Macro F1 score: 0.49992116886432636
MCC: 0.0021284051998569716


ComputerVision:
[[8184 3663]
 [3845 2374]]
Accuracy: 0.58441270895605
Macro Precision: 0.5367987415647677
Macro Recall: 0.5362705985476264
Macro F1 score: 0.5364720278007302
MCC: 0.07306743138645395




In [24]:
# P2

target_field = "title"
TEXT = Field(sequential=True, tokenize=custom_tokenizer_P2, lower=True)
LABEL = data.LabelField(dtype=torch.float, use_vocab = False, preprocessing = int)

# Field - P2
train_datafield_P2 = [("title", TEXT),
                      ("abstract", None),
                      ("InformationTheory", LABEL),
                      ("ComputationalLinguistics", LABEL),
                      ("ComputerVision", LABEL)
                      ]

train_data_whole, test_data = TabularDataset.splits(
    path="./",
    train="train.csv", test="test.csv", format="csv",
    skip_header=True, fields=train_datafield_P2)

train_data, valid_data = train_data_whole.split(split_ratio = 0.9, random_state = random.getstate())
train_1000, remaining = train_data_whole.split(split_ratio = 1000 / len(train_data_whole), random_state = random.getstate())
train_1000, valid_1000 = train_1000.split(split_ratio = 0.9, random_state = random.getstate())


# Building vocab - P2
MAX_VOCAB_SIZE = 5000

TEXT.build_vocab(train_1000, max_size=MAX_VOCAB_SIZE)

# Create iterator #2 for P2 train_data

train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_1000, "InformationTheory", target_field, validation=True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_1000, "ComputationalLinguistics", target_field, validation=True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_1000, "ComputerVision", target_field, validation=True)

In [25]:
len(TEXT.vocab)

2055

In [26]:
model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight = pos_weight_IT)
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight = pos_weight_CL)
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight = pos_weight_CV)

# Training Loop - P2

models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]

best_valid_losses = [float("inf"), float("inf"), float("inf")]
model_file_names = ["RNN_model_IT_T_P2_1000.pt", "RNN_model_CL_T_P2_1000.pt", "RNN_model_T_P2_1000.pt"]

In [27]:
results_P2_1000_title, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P2", "1000", 10)

Training model for P2_InformationTheory, using 1000 data of title...
Epoch: 01 | Epoch Time: 0m 1s
InformationTheory Train Loss: 1.052 | Train Acc: 52.50%
InformationTheory Valid Loss: 1.045 | Valid Acc: 51.87%
InformationTheory Test Loss: 1.372 | Test Acc: 53.19%
Epoch: 02 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.026 | Train Acc: 54.48%
InformationTheory Valid Loss: 1.045 | Valid Acc: 52.13%
InformationTheory Test Loss: 1.374 | Test Acc: 53.12%
Epoch: 03 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.056 | Train Acc: 37.08%
InformationTheory Valid Loss: 1.044 | Valid Acc: 53.73%
InformationTheory Test Loss: 1.379 | Test Acc: 53.57%
Epoch: 04 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.033 | Train Acc: 64.27%
InformationTheory Valid Loss: 1.044 | Valid Acc: 53.39%
InformationTheory Test Loss: 1.380 | Test Acc: 53.39%
Epoch: 05 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.025 | Train Acc: 61.77%
InformationTheory Valid Loss: 1.044 | Valid Acc: 54.36%
Infor

In [28]:
print(TEXT.vocab.freqs.most_common(50))


[('-', 590), ('\n  ', 444), ('for', 345), ('and', 237), (':', 223), ('of', 222), ('in', 158), ('a', 147), ('with', 139), ('network', 128), ('imag', 123), ('the', 119), ('learn', 117), ('on', 83), ('use', 83), ('to', 79), ('neural', 73), ('deep', 70), ('model', 70), ('base', 63), ('detect', 59), ('gener', 58), ('multi', 57), ('code', 57), ('from', 52), ('classif', 49), ('segment', 48), ('via', 48), ('recognit', 46), ('data', 46), ('channel', 41), ('object', 39), ('featur', 39), ('3d', 36), ('analysi', 36), ('visual', 34), ('convolut', 33), ('video', 32), ('text', 31), ('languag', 30), ('supervis', 30), ('inform', 30), ('semant', 28), ('an', 28), ('improv', 27), ('system', 27), ('optim', 26), ('attent', 26), ('adversari', 26), ('approach', 25)]


In [29]:

model_IT.load_state_dict(best_model_states[0])
model_CL.load_state_dict(best_model_states[1])
model_CV.load_state_dict(best_model_states[2])

evaluate_model(model_IT, test_iterator_IT, "InformationTheory", target_field)
evaluate_model(model_CL, test_iterator_CL, "ComputationalLinguistics", target_field)
evaluate_model(model_CV, test_iterator_CV, "ComputerVision", target_field)

InformationTheory:
[[4549 5038]
 [3401 5078]]
Accuracy: 0.5328794420458319
Macro Precision: 0.5370891619478204
Macro Recall: 0.5366940465004669
Macro F1 score: 0.5324785856287286
MCC: 0.07378215050204674


ComputationalLinguistics:
[[10251  4447]
 [ 2243  1125]]
Accuracy: 0.6296911325141149
Macro Precision: 0.511188098212483
Macro Recall: 0.5157339785431404
Macro F1 score: 0.502824804373967
MCC: 0.02653550807606708


ComputerVision:
[[7326 4521]
 [3559 2660]]
Accuracy: 0.5527510240230267
Macro Precision: 0.5217291176372192
Macro Recall: 0.5230529498737134
Macro F1 score: 0.5207844290775578
MCC: 0.04476249589515029




In [30]:
# Using all data to train models

In [31]:
# P1
target_field = "title"

TEXT = data.Field(sequential=True, tokenize=custom_tokenizer, lower=True)
LABEL = data.LabelField(dtype=torch.float, use_vocab = False, preprocessing = int)

train_datafield = [("title", TEXT), 
                   ("abstract", None),
                   ("InformationTheory", LABEL), 
                   ("ComputationalLinguistics", LABEL),
                   ("ComputerVision", LABEL)
                   ]

# Dataset - P1
train_data_whole, test_data = TabularDataset.splits(
    path = "./",
    train = "train.csv", test = "test.csv", format = "csv",
    skip_header = True, fields = train_datafield)


train_data, valid_data = train_data_whole.split(split_ratio = 0.9, random_state = random.getstate())
train_1000, remaining = train_data_whole.split(split_ratio = 1000 / len(train_data_whole), random_state = random.getstate())
train_1000, valid_1000 = train_1000.split(split_ratio = 0.9, random_state = random.getstate())

In [32]:
# Building vocab
MAX_VOCAB_SIZE = 5000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
# LABEL.build_vocab(train_data)

In [33]:
# Run generate_label_iterator functions

train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, "InformationTheory", target_field, validation = True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, "ComputationalLinguistics", target_field, validation = True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, "ComputerVision", target_field, validation = True)

In [34]:
# Initialize counts
negative_IT_count, positive_IT_count = 0, 0
negative_CL_count, positive_CL_count = 0, 0
negative_CV_count, positive_CV_count = 0, 0

# Iterate through the dataset
for example in train_data.examples:
    if getattr(example, "InformationTheory") == 0:
        negative_IT_count += 1
    else:
        positive_IT_count += 1

    if getattr(example, "ComputationalLinguistics") == 0:
        negative_CL_count += 1
    else:
        positive_CL_count += 1

    if getattr(example, "ComputerVision") == 0:
        negative_CV_count += 1
    else:
        positive_CV_count += 1
        
print(negative_CV_count, positive_CV_count)
print(negative_IT_count, positive_IT_count)
print(negative_CL_count, positive_CL_count)

53998 58502
85503 26997
85499 27001


In [35]:
pos_weight_IT = torch.tensor([negative_IT_count / positive_IT_count])
pos_weight_CL = torch.tensor([negative_CL_count / positive_CL_count])
pos_weight_CV = torch.tensor([negative_CV_count / positive_CV_count])


model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight = pos_weight_IT)
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight = pos_weight_CL)
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight = pos_weight_CV)

In [36]:
N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT_A_P1_ALL", "RNN_model_CL_A_P1_ALL.pt", "RNN_model_CV_A_P1_ALL.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]


In [37]:
print(TEXT.vocab.freqs.most_common(50))
print(len(TEXT.vocab))

[('-', 75579), ('\n  ', 55115), (':', 27824), ('learning', 15365), ('based', 10318), ('networks', 9616), ('deep', 8914), ('image', 8753), ('neural', 8506), ('detection', 7289), ('network', 6760), ('multi', 6631), ('segmentation', 5242), ('recognition', 5184), ('data', 4231), ('analysis', 4159), ('language', 4123), ('classification', 4113), ('object', 3703), ('convolutional', 3649), ('codes', 3637), ('images', 3597), ('model', 3515), ('estimation', 3473), ('models', 3444), ('visual', 3408), ('3d', 3369), ('information', 3242), ('semantic', 3144), ('text', 3098), ('video', 3058), ('adversarial', 2929), ('supervised', 2925), ('systems', 2806), ('domain', 2747), ('channel', 2676), ('approach', 2664), ('attention', 2633), ('generation', 2509), ('efficient', 2463), ('end', 2456), ('machine', 2363), ('time', 2295), ('unsupervised', 2289), ('self', 2254), ('training', 2219), ('graph', 2184), ('mimo', 2163), ('translation', 2130), ('robust', 2077)]
5002


In [38]:
RESULTS_T_P1_ALL, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", "ALL", 5)

Training model for P1_InformationTheory, using ALL data of title...
Epoch: 01 | Epoch Time: 0m 7s
InformationTheory Train Loss: 1.055 | Train Acc: 48.71%
InformationTheory Valid Loss: 1.040 | Valid Acc: 59.54%
InformationTheory Test Loss: 1.401 | Test Acc: 55.49%
Epoch: 02 | Epoch Time: 0m 7s
InformationTheory Train Loss: 1.054 | Train Acc: 48.95%
InformationTheory Valid Loss: 1.025 | Valid Acc: 69.20%
InformationTheory Test Loss: 1.413 | Test Acc: 59.14%
Epoch: 03 | Epoch Time: 0m 7s
InformationTheory Train Loss: 0.969 | Train Acc: 61.23%
InformationTheory Valid Loss: 0.811 | Valid Acc: 78.21%
InformationTheory Test Loss: 1.185 | Test Acc: 72.98%
Epoch: 04 | Epoch Time: 0m 7s
InformationTheory Train Loss: 0.742 | Train Acc: 77.84%
InformationTheory Valid Loss: 0.701 | Valid Acc: 80.53%
InformationTheory Test Loss: 1.056 | Test Acc: 76.62%
Epoch: 05 | Epoch Time: 0m 7s
InformationTheory Train Loss: 0.622 | Train Acc: 82.10%
InformationTheory Valid Loss: 0.590 | Valid Acc: 86.90%
Inform

In [39]:

model_IT.load_state_dict(best_model_states[0])
model_CL.load_state_dict(best_model_states[1])
model_CV.load_state_dict(best_model_states[2])

evaluate_model(model_IT, test_iterator_IT, "InformationTheory", target_field)
evaluate_model(model_CL, test_iterator_CL, "ComputationalLinguistics", target_field)
evaluate_model(model_CV, test_iterator_CV, "ComputerVision", target_field)

InformationTheory:
[[8133 1454]
 [ 629 7850]]
Accuracy: 0.8847005424554412
Macro Precision: 0.8859679333273538
Macro Recall: 0.8870765061972792
Macro F1 score: 0.8846722426984712
MCC: 0.773043644658014


ComputationalLinguistics:
[[13334  1364]
 [  616  2752]]
Accuracy: 0.8904018598472269
Macro Precision: 0.8122262975850862
Macro Recall: 0.8621501980168258
Macro F1 score: 0.8331604117640299
MCC: 0.6725260307125648


ComputerVision:
[[10423  1424]
 [  877  5342]]
Accuracy: 0.8726336765194288
Macro Precision: 0.8559626476997166
Macro Recall: 0.8693906684727787
Macro F1 score: 0.8616937013321913
MCC: 0.725229013167947




In [40]:
# P2
def custom_tokenizer_P2(text):
    return [ps.stem(tok.text) for tok in spacy_en.tokenizer(text)]

target_field = "title"
TEXT = data.Field(sequential=True, tokenize=custom_tokenizer_P2, lower=False)
LABEL = data.LabelField(dtype=torch.float, use_vocab = False, preprocessing = int)

train_datafield = [("title", TEXT), 
                   ("abstract", None),
                   ("InformationTheory", LABEL), 
                   ("ComputationalLinguistics", LABEL),
                   ("ComputerVision", LABEL)
                   ]

train_data_whole, test_data = TabularDataset.splits(
    path = "./",
    train = "train.csv", test = "test.csv", format = "csv",
    skip_header = True, fields = train_datafield)

train_data, valid_data = train_data_whole.split(split_ratio = 0.9, random_state = random.getstate())
train_1000, remaining = train_data_whole.split(split_ratio = 1000 / len(train_data_whole), random_state = random.getstate())
train_1000, valid_1000 = train_1000.split(split_ratio = 0.9, random_state = random.getstate())

# Building vocab
MAX_VOCAB_SIZE = 5000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
# LABEL.build_vocab(train_data)

train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, "InformationTheory", target_field, validation = True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, "ComputationalLinguistics", target_field, validation = True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, "ComputerVision", target_field, validation = True)


model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight = pos_weight_IT)
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight = pos_weight_CL)
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight = pos_weight_CV)


In [41]:
N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT", "RNN_model_CL.pt", "RNN_model_CV.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]

In [42]:
RESULTS_T_P2_ALL, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P2", "ALL", 5)

Training model for P2_InformationTheory, using ALL data of title...
Epoch: 01 | Epoch Time: 0m 8s
InformationTheory Train Loss: 1.056 | Train Acc: 49.32%
InformationTheory Valid Loss: 1.035 | Valid Acc: 64.34%
InformationTheory Test Loss: 1.421 | Test Acc: 54.86%
Epoch: 02 | Epoch Time: 0m 8s
InformationTheory Train Loss: 1.055 | Train Acc: 49.15%
InformationTheory Valid Loss: 1.025 | Valid Acc: 69.75%
InformationTheory Test Loss: 1.423 | Test Acc: 57.06%
Epoch: 03 | Epoch Time: 0m 8s
InformationTheory Train Loss: 1.054 | Train Acc: 50.17%
InformationTheory Valid Loss: 1.016 | Valid Acc: 70.48%
InformationTheory Test Loss: 1.414 | Test Acc: 58.57%
Epoch: 04 | Epoch Time: 0m 8s
InformationTheory Train Loss: 1.053 | Train Acc: 49.85%
InformationTheory Valid Loss: 1.008 | Valid Acc: 70.15%
InformationTheory Test Loss: 1.410 | Test Acc: 59.97%
Epoch: 05 | Epoch Time: 0m 8s
InformationTheory Train Loss: 1.053 | Train Acc: 49.39%
InformationTheory Valid Loss: 1.002 | Valid Acc: 72.70%
Inform

In [43]:

model_IT.load_state_dict(best_model_states[0])
model_CL.load_state_dict(best_model_states[1])
model_CV.load_state_dict(best_model_states[2])

evaluate_model(model_IT, test_iterator_IT, "InformationTheory", target_field)
evaluate_model(model_CL, test_iterator_CL, "ComputationalLinguistics", target_field)
evaluate_model(model_CV, test_iterator_CV, "ComputerVision", target_field)

InformationTheory:
[[4770 4817]
 [2235 6244]]
Accuracy: 0.6096534927488099
Macro Precision: 0.6227240529305369
Macro Recall: 0.6169781795932355
Macro F1 score: 0.6070375877585626
MCC: 0.2396333558093955


ComputationalLinguistics:
[[12539  2159]
 [ 1005  2363]]
Accuracy: 0.8248643861397099
Macro Precision: 0.7241768960203225
Macro Recall: 0.7773562959912783
Macro F1 score: 0.7434771662515958
MCC: 0.4987058192041421


ComputerVision:
[[10816  1031]
 [  973  5246]]
Accuracy: 0.8890733975423447
Macro Precision: 0.8766074978858318
Macro Recall: 0.8782588633799385
Macro F1 score: 0.8774213806972979
MCC: 0.7548645549783282




In [44]:
# "Abstract" models 

In [45]:
# P1
target_field = "abstract"

TEXT = data.Field(sequential=True, tokenize=custom_tokenizer, lower=True)
LABEL = data.LabelField(dtype=torch.float, use_vocab = False, preprocessing = int)

train_datafield = [("title", None), 
                   ("abstract", TEXT),
                   ("InformationTheory", LABEL), 
                   ("ComputationalLinguistics", LABEL),
                   ("ComputerVision", LABEL)
                   ]

# Dataset - P1
train_data_whole, test_data = TabularDataset.splits(
    path = "./",
    train = "train.csv", test = "test.csv", format = "csv",
    skip_header = True, fields = train_datafield)

train_data, valid_data = train_data_whole.split(split_ratio = 0.9, random_state = random.getstate())
train_1000, remaining = train_data_whole.split(split_ratio = 1000 / len(train_data_whole), random_state = random.getstate())
train_1000, valid_1000 = train_1000.split(split_ratio = 0.9, random_state = random.getstate())


# Building vocab
MAX_VOCAB_SIZE = 5000

TEXT.build_vocab(train_1000, max_size = MAX_VOCAB_SIZE)

# Run generate_label_iterator functions


train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_1000, "InformationTheory", target_field, validation = True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_1000, "ComputationalLinguistics", target_field, validation = True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_1000, "ComputerVision", target_field, validation = True)

In [46]:
# Initialize counts
negative_IT_count, positive_IT_count = 0, 0
negative_CL_count, positive_CL_count = 0, 0
negative_CV_count, positive_CV_count = 0, 0

# Iterate through the dataset
for example in train_1000.examples:
    if getattr(example, "InformationTheory") == 0:
        negative_IT_count += 1
    else:
        positive_IT_count += 1

    if getattr(example, "ComputationalLinguistics") == 0:
        negative_CL_count += 1
    else:
        positive_CL_count += 1

    if getattr(example, "ComputerVision") == 0:
        negative_CV_count += 1
    else:
        positive_CV_count += 1
        
print(negative_CV_count, positive_CV_count)
print(negative_IT_count, positive_IT_count)
print(negative_CL_count, positive_CL_count)

439 461
682 218
679 221


In [47]:
pos_weight_IT = torch.tensor([negative_IT_count / positive_IT_count])
pos_weight_CL = torch.tensor([negative_CL_count / positive_CL_count])
pos_weight_CV = torch.tensor([negative_CV_count / positive_CV_count])


model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight = pos_weight_IT)
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight = pos_weight_CL)
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight = pos_weight_CV)

In [48]:

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT", "RNN_model_CL.pt", "RNN_model_CV.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]


In [49]:
print(TEXT.vocab.freqs.most_common(200))

[('\n', 12511), ('.', 6260), ('-', 4841), (')', 1470), ('(', 1412), ('  ', 900), ('model', 640), ('data', 578), ('based', 544), ('image', 542), ('learning', 492), ('performance', 466), ('proposed', 455), ('method', 436), ('paper', 434), ('$', 432), ('results', 429), ('models', 418), ('network', 418), ('propose', 404), ('images', 393), ('information', 363), ('methods', 341), ('approach', 329), ('state', 324), ('neural', 299), ('dataset', 296), ('new', 294), ('task', 294), ('problem', 289), ('networks', 284), ('training', 274), ('features', 274), ('different', 266), ('art', 265), ('deep', 261), ('work', 255), ('classification', 254), ('"', 252), ('large', 231), ('time', 224), ('system', 222), ('multi', 220), ('novel', 216), ('framework', 214), ('object', 214), ('use', 207), ('high', 203), ('detection', 201), ('%', 199), ('datasets', 197), ('multiple', 197), ('accuracy', 194), ('channel', 192), ('language', 191), ('tasks', 186), ('recognition', 185), ('segmentation', 184), ('codes', 180),

In [50]:
RESULTS_A_P1_1000, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", "1000", 5)

Training model for P1_InformationTheory, using 1000 data of abstract...
Epoch: 01 | Epoch Time: 0m 3s
InformationTheory Train Loss: 1.054 | Train Acc: 39.90%
InformationTheory Valid Loss: 1.048 | Valid Acc: 65.81%
InformationTheory Test Loss: 1.429 | Test Acc: 52.15%
Epoch: 02 | Epoch Time: 0m 3s
InformationTheory Train Loss: 1.032 | Train Acc: 51.98%
InformationTheory Valid Loss: 1.050 | Valid Acc: 70.74%
InformationTheory Test Loss: 1.453 | Test Acc: 52.63%
Epoch: 03 | Epoch Time: 0m 3s
InformationTheory Train Loss: 1.055 | Train Acc: 52.40%
InformationTheory Valid Loss: 1.054 | Valid Acc: 73.15%
InformationTheory Test Loss: 1.478 | Test Acc: 52.52%
Epoch: 04 | Epoch Time: 0m 3s
InformationTheory Train Loss: 1.034 | Train Acc: 65.31%
InformationTheory Valid Loss: 1.049 | Valid Acc: 72.80%
InformationTheory Test Loss: 1.480 | Test Acc: 53.03%
Epoch: 05 | Epoch Time: 0m 3s
InformationTheory Train Loss: 1.032 | Train Acc: 59.79%
InformationTheory Valid Loss: 1.050 | Valid Acc: 73.22%
In

In [51]:
model_IT.load_state_dict(best_model_states[0])
model_CL.load_state_dict(best_model_states[1])
model_CV.load_state_dict(best_model_states[2])

# Evaluate models
evaluate_model(model_IT, test_iterator_IT, "InformationTheory", target_field)
evaluate_model(model_CL, test_iterator_CL, "ComputationalLinguistics", target_field)
evaluate_model(model_CV, test_iterator_CV, "ComputerVision", target_field)

InformationTheory:
[[ 250 9337]
 [ 130 8349]]
Accuracy: 0.47597697332004874
Macro Precision: 0.5649815197271705
Macro Recall: 0.5053724912725004
Macro F1 score: 0.34417316107419726
MCC: 0.03736911278625885


ComputationalLinguistics:
[[14290   408]
 [ 3309    59]]
Accuracy: 0.794254400531385
Macro Precision: 0.46915814152862734
Macro Recall: 0.494879467983913
Macro F1 score: 0.4578405710461319
MCC: -0.0251337799575223


ComputerVision:
[[8050 3797]
 [4367 1852]]
Accuracy: 0.5481014059559394
Macro Precision: 0.4880751899463206
Macro Recall: 0.4886469962678598
Macro F1 score: 0.48781710920354693
MCC: -0.02327078967671577




In [52]:
#P2

target_field = "abstract"

TEXT = data.Field(sequential=True, tokenize=custom_tokenizer_P2, lower=False)
LABEL = data.LabelField(dtype=torch.float, use_vocab = False, preprocessing = int)

train_datafield = [("title", None), 
                   ("abstract", TEXT),
                   ("InformationTheory", LABEL), 
                   ("ComputationalLinguistics", LABEL),
                   ("ComputerVision", LABEL)
                   ]

# Dataset
train_data_whole, test_data = TabularDataset.splits(
    path = "./",
    train = "train.csv", test = "test.csv", format = "csv",
    skip_header = True, fields = train_datafield)

train_data, valid_data = train_data_whole.split(split_ratio = 0.9, random_state = random.getstate())
train_1000, remaining = train_data_whole.split(split_ratio = 1000 / len(train_data_whole), random_state = random.getstate())
train_1000, valid_1000 = train_1000.split(split_ratio = 0.9, random_state = random.getstate())
# Building vocab
MAX_VOCAB_SIZE = 5000

TEXT.build_vocab(train_1000, max_size = MAX_VOCAB_SIZE)

# Run generate_label_iterator functions
train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_1000, "InformationTheory", target_field, validation = True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_1000, "ComputationalLinguistics", target_field, validation = True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_1000, "ComputerVision", target_field, validation = True)

In [53]:

model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight = pos_weight_IT)
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight = pos_weight_CL)
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight = pos_weight_CV)

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT", "RNN_model_CL.pt", "RNN_model_CV.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]


In [54]:
RESULTS_A_P2_1000, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P2", "1000", 5)

Training model for P2_InformationTheory, using 1000 data of abstract...
Epoch: 01 | Epoch Time: 0m 5s
InformationTheory Train Loss: 1.056 | Train Acc: 52.50%
InformationTheory Valid Loss: 1.053 | Valid Acc: 68.65%
InformationTheory Test Loss: 1.458 | Test Acc: 53.02%
Epoch: 02 | Epoch Time: 0m 5s
InformationTheory Train Loss: 1.031 | Train Acc: 53.75%
InformationTheory Valid Loss: 1.053 | Valid Acc: 72.33%
InformationTheory Test Loss: 1.473 | Test Acc: 52.45%
Epoch: 03 | Epoch Time: 0m 5s
InformationTheory Train Loss: 1.056 | Train Acc: 47.08%
InformationTheory Valid Loss: 1.060 | Valid Acc: 73.42%
InformationTheory Test Loss: 1.501 | Test Acc: 52.27%
Epoch: 04 | Epoch Time: 0m 5s
InformationTheory Train Loss: 1.034 | Train Acc: 65.21%
InformationTheory Valid Loss: 1.053 | Valid Acc: 72.90%
InformationTheory Test Loss: 1.477 | Test Acc: 52.35%
Epoch: 05 | Epoch Time: 0m 5s
InformationTheory Train Loss: 1.034 | Train Acc: 68.23%
InformationTheory Valid Loss: 1.052 | Valid Acc: 72.93%
In

In [55]:
model_IT.load_state_dict(best_model_states[0])
model_CL.load_state_dict(best_model_states[1])
model_CV.load_state_dict(best_model_states[2])

# Evaluate models
evaluate_model(model_IT, test_iterator_IT, "InformationTheory", target_field)
evaluate_model(model_CL, test_iterator_CL, "ComputationalLinguistics", target_field)
evaluate_model(model_CV, test_iterator_CV, "ComputerVision", target_field)

InformationTheory:
[[ 287 9300]
 [ 214 8265]]
Accuracy: 0.4733754013063213
Macro Precision: 0.5216961465625538
Macro Recall: 0.5023487734187358
Macro F1 score: 0.3457972087984818
MCC: 0.014277161109285449


ComputationalLinguistics:
[[10404  4294]
 [ 2465   903]]
Accuracy: 0.6258718033875789
Macro Precision: 0.49110425713035843
Macro Recall: 0.4879815236548738
Macro F1 score: 0.4828368419638268
MCC: -0.020679775168131014


ComputerVision:
[[11735   112]
 [ 6156    63]]
Accuracy: 0.6530499280416252
Macro Precision: 0.5079581912693534
Macro Recall: 0.5003381879210782
Macro F1 score: 0.4044659402994262
MCC: 0.0032810755315444964




In [56]:
model_IT.load_state_dict(best_model_states[0])
model_CL.load_state_dict(best_model_states[1])
model_CV.load_state_dict(best_model_states[2])

# Evaluate models
evaluate_model(model_IT, test_iterator_IT, "InformationTheory", target_field)
evaluate_model(model_CL, test_iterator_CL, "ComputationalLinguistics", target_field)
evaluate_model(model_CV, test_iterator_CV, "ComputerVision", target_field)

InformationTheory:
[[ 287 9300]
 [ 214 8265]]
Accuracy: 0.4733754013063213
Macro Precision: 0.5216961465625538
Macro Recall: 0.5023487734187358
Macro F1 score: 0.3457972087984818
MCC: 0.014277161109285449


ComputationalLinguistics:
[[10404  4294]
 [ 2465   903]]
Accuracy: 0.6258718033875789
Macro Precision: 0.49110425713035843
Macro Recall: 0.4879815236548738
Macro F1 score: 0.4828368419638268
MCC: -0.020679775168131014


ComputerVision:
[[11735   112]
 [ 6156    63]]
Accuracy: 0.6530499280416252
Macro Precision: 0.5079581912693534
Macro Recall: 0.5003381879210782
Macro F1 score: 0.4044659402994262
MCC: 0.0032810755315444964




In [62]:
# "Abstract", using all data
# P1
target_field = "abstract"

TEXT = data.Field(sequential=True, tokenize=custom_tokenizer, lower=True)
LABEL = data.LabelField(dtype=torch.float, use_vocab = False, preprocessing = int)

train_datafield = [("title", None), 
                   ("abstract", TEXT),
                   ("InformationTheory", LABEL), 
                   ("ComputationalLinguistics", LABEL),
                   ("ComputerVision", LABEL)
                   ]

# Dataset - P1
train_data_whole, test_data = TabularDataset.splits(
    path = "./",
    train = "train.csv", test = "test.csv", format = "csv",
    skip_header = True, fields = train_datafield)


# Building vocab
MAX_VOCAB_SIZE = 5000
train_data, valid_data = train_data_whole.split(split_ratio = 0.9, random_state = random.getstate())
train_1000, remaining = train_data_whole.split(split_ratio = 1000 / len(train_data_whole), random_state = random.getstate())
train_1000, valid_1000 = train_1000.split(split_ratio = 0.9, random_state = random.getstate())
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)

# Run generate_label_iterator functions


train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, "InformationTheory", target_field, validation = True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, "ComputationalLinguistics", target_field, validation = True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, "ComputerVision", target_field, validation = True)

In [63]:
# Initialize counts
negative_IT_count, positive_IT_count = 0, 0
negative_CL_count, positive_CL_count = 0, 0
negative_CV_count, positive_CV_count = 0, 0

# Iterate through the dataset
for example in train_data.examples:
    if getattr(example, "InformationTheory") == 0:
        negative_IT_count += 1
    else:
        positive_IT_count += 1

    if getattr(example, "ComputationalLinguistics") == 0:
        negative_CL_count += 1
    else:
        positive_CL_count += 1

    if getattr(example, "ComputerVision") == 0:
        negative_CV_count += 1
    else:
        positive_CV_count += 1
        
print(negative_CV_count, positive_CV_count)
print(negative_IT_count, positive_IT_count)
print(negative_CL_count, positive_CL_count)

pos_weight_IT = torch.tensor([negative_IT_count / positive_IT_count])
pos_weight_CL = torch.tensor([negative_CL_count / positive_CL_count])
pos_weight_CV = torch.tensor([negative_CV_count / positive_CV_count])

53998 58502
85503 26997
85499 27001


In [64]:
model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight = pos_weight_IT)
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight = pos_weight_CL)
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight = pos_weight_CV)

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT", "RNN_model_CL.pt", "RNN_model_CV.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
RESULTS_A_P1_ALL, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", "ALL", 5)

In [None]:
model_IT.load_state_dict(best_model_states[0])
model_CL.load_state_dict(best_model_states[1])
model_CV.load_state_dict(best_model_states[2])

# Evaluate models
evaluate_model(model_IT, test_iterator_IT, "InformationTheory", target_field)
evaluate_model(model_CL, test_iterator_CL, "ComputationalLinguistics", target_field)
evaluate_model(model_CV, test_iterator_CV, "ComputerVision", target_field)

In [None]:
# P2

target_field = "abstract"

TEXT = data.Field(sequential=True, tokenize=custom_tokenizer_P2, lower=True)
LABEL = data.LabelField(dtype=torch.float, use_vocab = False, preprocessing = int)

train_datafield = [("title", None), 
                   ("abstract", TEXT),
                   ("InformationTheory", LABEL), 
                   ("ComputationalLinguistics", LABEL),
                   ("ComputerVision", LABEL)
                   ]

# Dataset
train_data, test_data = TabularDataset.splits(
    path = "./",
    train = "train.csv", test = "test.csv", format = "csv",
    skip_header = True, fields = train_datafield)

# Building vocab
MAX_VOCAB_SIZE = 5000

train_data, valid_data = train_data_whole.split(split_ratio = 0.9, random_state = random.getstate())

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
# LABEL.build_vocab(train_data)

# Run generate_label_iterator functions
train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, "InformationTheory", target_field, validation = True)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, "ComputationalLinguistics", target_field, validation = True)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, "ComputerVision", target_field, validation = True)

In [None]:
model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight = pos_weight_IT)
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight = pos_weight_CL)
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight = pos_weight_CV)

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT", "RNN_model_CL.pt", "RNN_model_CV.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]


In [None]:
RESULTS_A_P2_ALL, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P2", "ALL", 5)

In [None]:
model_IT.load_state_dict(best_model_states[0])
model_CL.load_state_dict(best_model_states[1])
model_CV.load_state_dict(best_model_states[2])

# Evaluate models
evaluate_model(model_IT, test_iterator_IT, "InformationTheory", target_field)
evaluate_model(model_CL, test_iterator_CL, "ComputationalLinguistics", target_field)
evaluate_model(model_CV, test_iterator_CV, "ComputerVision", target_field)