In [1]:
# Import Libraries
import torch
import torchtext

from torchtext.legacy import data
import torch.nn as nn
from torchtext.legacy.data import Field, LabelField, TabularDataset, Dataset
import copy

import torch.optim as optim
import collections
import random
import time
import pandas as pd
import re
import spacy
from nltk.stem import PorterStemmer

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
import numpy as np
spacy_en = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy import data
from torchtext.legacy.data import Field, LabelField, TabularDataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import numpy as np
import random
import time
import copy
import collections

from nltk.stem import PorterStemmer


In [3]:
spacy_en = spacy.load("en_core_web_sm")
SEED = 1
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
def load_data(path, train_file, test_file, target_field, tokenizer, MAX_VOCAB_SIZE):
    TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)
    LABEL = data.LabelField(dtype=torch.float, use_vocab=False, preprocessing=int)
    if target_field == "title":
        train_datafield = [
            (target_field, TEXT),
            ("abstract", None), 
            ("InformationTheory", LABEL),
            ("ComputationalLinguistics", LABEL),
            ("ComputerVision", LABEL)
        ]
    else:
        train_datafield = [
            ("title", None),
            (target_field, TEXT), 
            ("InformationTheory", LABEL),
            ("ComputationalLinguistics", LABEL),
            ("ComputerVision", LABEL)
        ]
        
    train_data, test_data = TabularDataset.splits(
        path=path,
        train=train_file, test=test_file, format="csv",
        skip_header=True, fields=train_datafield
    )
    train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.getstate())

    MAX_VOCAB_SIZE = MAX_VOCAB_SIZE

    TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
    
    return TEXT, LABEL, train_data, valid_data, test_data


In [5]:
def preprocess_target_label(train_data, target_field):
    for example in train_data:
        setattr(example, f"label_{target_field}", getattr(example, target_field))
        
    for example in test_data:
        setattr(example, f"label_{target_field}", getattr(example, target_field))

    for example in valid_data:
        setattr(example, f"label_{target_field}", getattr(example, target_field))

def generate_label_iterator(train_data, valid_data, test_data, label, target_field):
#     preprocess_target_label(train_data, target_field)
    
    label_attr = f"label_{label}"
    BATCH_SIZE = 64
    iterators = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size = BATCH_SIZE,
        device = device,
        sort_key = lambda x: len(getattr(x, target_field)),
        sort_within_batch = False)
    return iterators[0], iterators[1], iterators[2]

In [6]:

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pos_weight = None):
        
        super(RNN, self).__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.pos_weight = pos_weight

        if self.pos_weight is not None:
            self.criterion = nn.BCEWithLogitsLoss(pos_weight=self.pos_weight)
        else:
            self.criterion = nn.BCEWithLogitsLoss()

    def forward(self, text):

        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)

        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [7]:
def get_pos_weight(train_data):
    pos_weights = []
    for field in ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]:
        negative_count, positive_count = 0, 0
        for example in train_data.examples:
            if getattr(example, field) == 0:
                negative_count += 1
            else:
                positive_count += 1
        
        pos_weights.append(torch.tensor([negative_count / positive_count]).to(device))
    return pos_weights


In [8]:
def generate_model_and_optimizer(pos_weight, embedding_dim=100, hidden_dim=256, output_dim=1, lr=1e-2):
    INPUT_DIM = len(TEXT.vocab)

    model = RNN(INPUT_DIM, embedding_dim, hidden_dim, output_dim, pos_weight)

    optimizer = optim.SGD(model.parameters(), lr=lr)

    model = model.to(device)
    
    return model, optimizer

In [9]:
# Evaluation functions

def binary_accuracy(preds, y):

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion, label_field, target_field):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:

        optimizer.zero_grad()
                
        predictions = model(getattr(batch, target_field)).squeeze(1)
        
        # Use the specific label field
        loss = criterion(predictions, getattr(batch, label_field))
        
        acc = binary_accuracy(predictions, getattr(batch, label_field))
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion, label_field, target_field):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(getattr(batch, target_field)).squeeze(1)
            
            # Use the specific label field
            loss = criterion(predictions, getattr(batch, label_field))
            
            acc = binary_accuracy(predictions, getattr(batch, label_field))

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [27]:
rnn_results_df = pd.DataFrame()

def train_loop(ModelsList, OptimizersList, IteratorsList, N_EPOCHS, label_names, target_field, modelFileNames, bestLossesList, preprocess, train_size, patience=5, rnn_metric_results=[]):
    result = []
    best_model_states = []
    best_epochs = [0, 0, 0]
    for idx, (label_name, model, optimizer, iterators_) in enumerate(zip(label_names, ModelsList, OptimizersList, IteratorsList)):
        print(f"Training model for {preprocess}_{label_name}, using {train_size} data of {target_field}...")

        bad_epochs = 0
        best_model_state = None
        best_epoch = 0

        for epoch in range(N_EPOCHS):
            start_time = time.time()

            train_iterator = iterators_[0]
            test_iterator = iterators_[2] if len(iterators_) > 2 else iterators_[1]
            valid_iterator = iterators_[1] if len(iterators_) > 2 else None

            train_loss, train_acc = train(model, train_iterator, optimizer, model.criterion, label_name, target_field)

            if valid_iterator:
                valid_loss, valid_acc = evaluate(model, valid_iterator, model.criterion, label_name, target_field)

            test_loss, test_acc = evaluate(model, test_iterator, model.criterion, label_name, target_field)

            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if test_loss < bestLossesList[idx]:
                bestLossesList[idx] = test_loss
                best_model_state = copy.deepcopy(model.state_dict())
#                 best_model_state = model.state_dict()
                bad_epochs = 0
                best_epochs[idx] = epoch + 1
        
                print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
                print(f'{label_name} Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
                print(f'{label_name} Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')
                print(f'{label_name} Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
            else:
                bad_epochs += 1
        
            if bad_epochs > patience:
                print(f"Early stopping at epoch {epoch+1} for {label_name} model.")
#                 model.load_state_dict(best_model_state)
                break

            # For predictions and ground truth collection
            model.eval()
            y_predict, y_test = [], []
            with torch.no_grad():
                for batch in test_iterator:
                    predictions = model(getattr(batch, target_field)).squeeze(1)
                    rounded_preds = torch.round(torch.sigmoid(predictions))
                    y_predict += rounded_preds.tolist()
                    y_test += getattr(batch, label_name).tolist()
        print(f"Best performing epoch for {label_name} model: {best_epochs[idx]}")
        best_model_states.append(best_model_state)
        
        model.load_state_dict(best_model_state)
        # Call 'evaluate_model' and store its return value
        rnn_eval_metrics = evaluate_model(model, test_iterator, label_name, target_field)
        
        # Append the evaluation metrics to the 'rnn_metric_results' list
        rnn_metric_results.append({
            'label_name': label_name,
            'model_name': model.__class__.__name__,
            'target_field': target_field,
            'train_size': train_size,
            'tokenizer': preprocess,
            **rnn_eval_metrics
        })
#         result.append((y_predict, y_test))
        
    return rnn_metric_results, best_model_states



In [26]:

def evaluate_model(model, iterator, label_field, target_field):
    

    y_probs = []
    y_predict = []
    y_test = []
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(getattr(batch, target_field)).squeeze(1)
            probs = torch.sigmoid(predictions)
            rounded_preds = torch.round(torch.sigmoid(predictions))
            
            y_predict += rounded_preds.tolist()
            y_test += getattr(batch, label_field).tolist()
            y_probs += probs.tolist()
    
    y_predict = np.asarray(y_predict)
    y_test = np.asarray(y_test)
    y_probs = np.asarray(y_probs)
    # Compute metrics
    recall_macro = recall_score(y_test, y_predict, average='macro')
    precision_macro = precision_score(y_test, y_predict, average='macro')
    recall = recall_score(y_test, y_predict, average=None)
    precision = precision_score(y_test, y_predict, average=None)
    f1score = f1_score(y_test, y_predict, average='macro')
    accuracy = accuracy_score(y_test, y_predict)
    matthews = matthews_corrcoef(y_test, y_predict)
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_probs)

    return {
        'Recall': recall,
        'Precision': precision,
        'Macro Recall': recall_macro,
        'Macro Precision': precision_macro,
        'Macro F1 score': f1score,
        'MCC': matthews,
        'Recall_curve': recall_curve,  # Add this line to store the recall curve values
        'Precision_curve': precision_curve  # Add this line to store the precision curve values
    }
#     # Print metrics
#     print(f"{label_field}:")
#     print(confusion_matrix(y_test, y_predict))
#     print('Accuracy:', accuracy)
#     print('Macro Precision:', precision)
#     print('Macro Recall:', recall)
#     print('Macro F1 score:', f1score)
#     print('MCC:', matthews)
#     print("\n")


<h2>P1 Title 1000</h2>

In [28]:
def custom_tokenizer(text):   
    return [tok.text for tok in spacy_en.tokenizer(text) if tok.text.lower() not in STOP_WORDS]

path = './'
train_file = "train_1000.csv"
test_file = "test.csv"
target_field = "title"
tokenizer = custom_tokenizer
MAX_VOCAB_SIZE = 5000

TEXT, LABEL, train_data, valid_data, test_data = load_data(path, train_file, test_file, target_field, custom_tokenizer, MAX_VOCAB_SIZE)


train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, valid_data, test_data, "InformationTheory", target_field)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, valid_data, test_data, "ComputationalLinguistics", target_field)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, valid_data, test_data, "ComputerVision", target_field)

pos_weights = get_pos_weight(train_data)

model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight=pos_weights[0])
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight=pos_weights[1])
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight=pos_weights[2])

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT_T_P1_1000", "RNN_model_CL_T_P1_1000.pt", "RNN_model_CV_T_P1_1000.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]

In [29]:
# print(TEXT.vocab.freqs.most_common(200))
len(train_data), len(valid_data), len(test_data), len(TEXT.vocab)



[('-', 607), ('\n  ', 454), (':', 210), ('learning', 133), ('based', 77), ('neural', 70), ('networks', 69), ('deep', 65), ('image', 60), ('network', 56), ('detection', 55), ('segmentation', 44), ('codes', 40), ('multi', 38), ('recognition', 38), ('classification', 33), ('data', 33), ('model', 31), ('analysis', 31), ('language', 31), ('images', 30), ('estimation', 29), ('convolutional', 28), ('models', 27), ('systems', 27), ('video', 26), ('text', 26), ('supervised', 25), ('robust', 25), ('time', 24), ('channel', 24), ('semantic', 23), ('information', 23), ('generation', 22), ('end', 22), ('face', 22), ('training', 22), ('visual', 21), ('attention', 21), ('joint', 21), ('3d', 21), ('(', 21), ('channels', 20), ('resolution', 20), ('object', 20), ('efficient', 20), ('speech', 20), ('graph', 19), ('system', 19), ('shot', 19), ('adversarial', 18), ('non', 18), ('low', 18), ('aware', 17), ('distributed', 17), ('modeling', 17), ('feature', 17), ('self', 17), ('real', 17), ('unsupervised', 16)

(900, 100, 18066, 2514)

In [None]:
rnn_results, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", len(train_data), 10, rnn_metric_results=[])
rnn_results_df = rnn_results_df.append(pd.DataFrame(rnn_results), ignore_index = True)


<h2>P2 Title 1000</h2>

In [15]:
def custom_tokenizer_P2(text):
    ps = PorterStemmer()
    return [ps.stem(tok.text) for tok in spacy_en.tokenizer(text)]

path = './'
train_file = "train_1000.csv"
test_file = "test.csv"
target_field = "title"
tokenizer = custom_tokenizer_P2
MAX_VOCAB_SIZE = 5000

TEXT, LABEL, train_data, valid_data, test_data = load_data(path, train_file, test_file, target_field, tokenizer, MAX_VOCAB_SIZE)


train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, valid_data, test_data, "InformationTheory", target_field)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, valid_data, test_data, "ComputationalLinguistics", target_field)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, valid_data, test_data, "ComputerVision", target_field)

pos_weights = get_pos_weight(train_data)

model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight=pos_weights[0])
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight=pos_weights[1])
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight=pos_weights[2])

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT_T_P2_1000", "RNN_model_CL_T_P2_1000.pt", "RNN_model_CV_T_P2_1000.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]

In [16]:
print(TEXT.vocab.freqs.most_common(200))
len(train_data), len(valid_data), len(test_data), len(TEXT.vocab)



[('-', 607), ('\n  ', 454), ('for', 354), ('of', 241), (':', 210), ('and', 206), ('a', 154), ('in', 151), ('learn', 136), ('with', 133), ('the', 126), ('network', 125), ('on', 95), ('imag', 93), ('base', 78), ('model', 76), ('use', 76), ('to', 73), ('neural', 70), ('code', 68), ('deep', 65), ('detect', 62), ('segment', 47), ('system', 46), ('channel', 45), ('gener', 44), ('from', 39), ('multi', 38), ('recognit', 38), ('video', 36), ('convolut', 36), ('via', 35), ('estim', 34), ('classif', 33), ('data', 33), ('languag', 32), ('by', 32), ('analysi', 31), ('object', 30), ('robust', 28), ('distribut', 28), ('supervis', 27), ('text', 27), ('featur', 27), ('train', 27), ('predict', 27), ('attent', 25), ('time', 25), ('optim', 25), ('face', 24), ('semant', 24), ('visual', 23), ('end', 23), ('inform', 23), ('graph', 22), ('adapt', 22), ('improv', 22), ('transform', 22), ('joint', 21), ('over', 21), ('an', 21), ('3d', 21), ('effici', 21), ('(', 21), ('sequenc', 21), ('local', 21), ('commun', 20

(900, 100, 18066, 2083)

In [17]:
rnn_results, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", len(train_data), 10, rnn_metric_results=[])
rnn_results_df = rnn_results_df.append(pd.DataFrame(rnn_results), ignore_index = True)


Training model for P1_InformationTheory, using 900 data of title...
Epoch: 01 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.032 | Train Acc: 59.90%
InformationTheory Valid Loss: 1.038 | Valid Acc: 30.73%
InformationTheory Test Loss: 1.315 | Test Acc: 50.51%
Epoch: 02 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.043 | Train Acc: 55.73%
InformationTheory Valid Loss: 1.036 | Valid Acc: 31.51%
InformationTheory Test Loss: 1.312 | Test Acc: 51.06%
Epoch: 12 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.059 | Train Acc: 59.06%
InformationTheory Valid Loss: 1.036 | Valid Acc: 30.90%
InformationTheory Test Loss: 1.309 | Test Acc: 52.62%
Epoch: 13 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.054 | Train Acc: 35.31%
InformationTheory Valid Loss: 1.037 | Valid Acc: 69.88%
InformationTheory Test Loss: 1.308 | Test Acc: 52.61%
Epoch: 20 | Epoch Time: 0m 0s
InformationTheory Train Loss: 1.051 | Train Acc: 49.27%
InformationTheory Valid Loss: 1.038 | Valid Acc: 69.88%
Inform

  rnn_results_df = rnn_results_df.append(pd.DataFrame(rnn_results), ignore_index = True)


<h2> P1 Title ALL</h2>

In [None]:
path = './'
train_file = "train.csv"
test_file = "test.csv"
target_field = "title"
tokenizer = custom_tokenizer
MAX_VOCAB_SIZE = 5000

TEXT, LABEL, train_data, valid_data, test_data = load_data(path, train_file, test_file, target_field, tokenizer, MAX_VOCAB_SIZE)


train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, valid_data, test_data, "InformationTheory", target_field)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, valid_data, test_data, "ComputationalLinguistics", target_field)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, valid_data, test_data, "ComputerVision", target_field)

pos_weights = get_pos_weight(train_data)

model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight=pos_weights[0])
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight=pos_weights[1])
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight=pos_weights[2])

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT_T_P1_ALL", "RNN_model_CL_T_P1_ALL.pt", "RNN_model_CV_T_P1_ALL.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]

In [None]:
print(TEXT.vocab.freqs.most_common(50))
len(train_data), len(valid_data), len(test_data), len(TEXT.vocab)

In [None]:
rnn_results, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", len(train_data), 10, rnn_metric_results=[])
rnn_results_df = rnn_results_df.append(pd.DataFrame(rnn_results), ignore_index = True)


In [18]:
rnn_results_df.to_csv("RNN_Checking.csv", index = False) 

In [25]:
rnn_results_df

Unnamed: 0,label_name,model_name,target_field,train_size,tokenizer,Recall,Precision,Macro Recall,Macro Precision,Macro F1 score,MCC
0,InformationTheory,RNN,title,900,P1,"[0.5751538541775321, 0.44616110390376224]","[0.5400587659157688, 0.48154276985743383]",0.510657,0.510801,0.510116,0.021458
1,ComputationalLinguistics,RNN,title,900,P1,"[0.7620084365219758, 0.252375296912114]","[0.8164455459979589, 0.19549218031278748]",0.507192,0.505969,0.504305,0.013104
2,ComputerVision,RNN,title,900,P1,"[0.5254494808812358, 0.5237176394918798]","[0.6775878959399151, 0.36682058790404326]",0.524584,0.522204,0.511673,0.046727
3,InformationTheory,RNN,title,900,P1,"[0.4614582246792532, 0.6056138695600897]","[0.5695159629248198, 0.4986405127209167]",0.533536,0.534078,0.528385,0.067612
4,ComputationalLinguistics,RNN,title,900,P1,"[0.6822696965573547, 0.37885985748218526]","[0.8273927392739274, 0.21459804910864447]",0.530565,0.520995,0.510926,0.050664
5,ComputerVision,RNN,title,900,P1,"[0.4571621507554655, 0.43174143753014954]","[0.6051396648044692, 0.2945370776656428]",0.444452,0.449838,0.435512,-0.105573


In [24]:
import matplotlib.pyplot as plt
from sklearn.metrics import auc


def plot_precision_recall_curve(df):
    plt.figure(figsize=(10, 6))
    
    for index, row in df.iterrows():
        target_field = 'T' if row['target_field'] == 'title' else 'A'
        size = str(1000) if row['train_size'] == 900 else 'All'
        tokenizer = 'P1' if row['tokenizer'] == 'P1' else 'P2'
        label = f"{tokenizer}_{size}_{target_field}_{row['label_name']}"
        plt.plot(row['Recall'], row['Precision'], label=f"{label} (AUC: {auc(row['Recall'], row['Precision']):.2f})")
    
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision-Recall Curve")
    plt.legend(loc='best')
    plt.show()
    
plot_precision_recall_curve(rnn_results_df)

<h2> P2 Title ALL</h2>

In [None]:
path = './'
train_file = "train.csv"
test_file = "test.csv"
target_field = "title"
tokenizer = custom_tokenizer_P2
MAX_VOCAB_SIZE = 5000

TEXT, LABEL, train_data, valid_data, test_data = load_data(path, train_file, test_file, target_field, tokenizer, MAX_VOCAB_SIZE)


train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, valid_data, test_data, "InformationTheory", target_field)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, valid_data, test_data, "ComputationalLinguistics", target_field)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, valid_data, test_data, "ComputerVision", target_field)

pos_weights = get_pos_weight(train_data)

model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight=pos_weights[0])
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight=pos_weights[1])
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight=pos_weights[2])

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT_T_P2_ALL", "RNN_model_CL_T_P2_ALL.pt", "RNN_model_CV_T_P2_ALL.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]

In [None]:
print(TEXT.vocab.freqs.most_common(50))
len(train_data), len(valid_data), len(test_data), len(TEXT.vocab)

In [None]:
rnn_results, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", len(train_data), 10, rnn_metric_results=[])
rnn_results_df = rnn_results_df.append(pd.DataFrame(rnn_results), ignore_index = True)


<h2> P1 Abstract 1000</h2>

In [None]:
path = './'
train_file = "train_1000.csv"
test_file = "test.csv"
target_field = "abstract"
tokenizer = custom_tokenizer
MAX_VOCAB_SIZE = 10000

TEXT, LABEL, train_data, valid_data, test_data = load_data(path, train_file, test_file, target_field, tokenizer, MAX_VOCAB_SIZE)


train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, valid_data, test_data, "InformationTheory", target_field)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, valid_data, test_data, "ComputationalLinguistics", target_field)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, valid_data, test_data, "ComputerVision", target_field)

pos_weights = get_pos_weight(train_data)

model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight=pos_weights[0])
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight=pos_weights[1])
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight=pos_weights[2])

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT_A_P1_1000", "RNN_model_CL_A_P1_1000.pt", "RNN_model_CV_A_P1_1000.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]

In [None]:
print(TEXT.vocab.freqs.most_common(50))
len(train_data), len(valid_data), len(test_data), len(TEXT.vocab)

In [None]:
rnn_results, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", len(train_data), 10, rnn_metric_results=[])
rnn_results_df = rnn_results_df.append(pd.DataFrame(rnn_results), ignore_index = True)


<h2> P2 Abstract 1000</h2>

In [None]:
path = './'
train_file = "train_1000.csv"
test_file = "test.csv"
target_field = "abstract"
tokenizer = custom_tokenizer_P2
MAX_VOCAB_SIZE = 10000

TEXT, LABEL, train_data, valid_data, test_data = load_data(path, train_file, test_file, target_field, tokenizer, MAX_VOCAB_SIZE)


train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, valid_data, test_data, "InformationTheory", target_field)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, valid_data, test_data, "ComputationalLinguistics", target_field)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, valid_data, test_data, "ComputerVision", target_field)

pos_weights = get_pos_weight(train_data)

model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight=pos_weights[0])
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight=pos_weights[1])
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight=pos_weights[2])

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT_A_P2_1000", "RNN_model_CL_A_P2_1000.pt", "RNN_model_CV_A_P2_1000.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]

In [None]:
print(TEXT.vocab.freqs.most_common(50))
len(train_data), len(valid_data), len(test_data), len(TEXT.vocab)

In [None]:
rnn_results, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", len(train_data), 10, rnn_metric_results=[])
rnn_results_df = rnn_results_df.append(pd.DataFrame(rnn_results), ignore_index = True)


<h2> P1 Abstract ALL</h2>

In [None]:
path = './'
train_file = "train.csv"
test_file = "test.csv"
target_field = "abstract"
tokenizer = custom_tokenizer
MAX_VOCAB_SIZE = 10000

TEXT, LABEL, train_data, valid_data, test_data = load_data(path, train_file, test_file, target_field, tokenizer, MAX_VOCAB_SIZE)


train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, valid_data, test_data, "InformationTheory", target_field)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, valid_data, test_data, "ComputationalLinguistics", target_field)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, valid_data, test_data, "ComputerVision", target_field)

pos_weights = get_pos_weight(train_data)

model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight=pos_weights[0])
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight=pos_weights[1])
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight=pos_weights[2])

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT_A_P2_1000", "RNN_model_CL_A_P2_1000.pt", "RNN_model_CV_A_P2_1000.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]

In [None]:
print(TEXT.vocab.freqs.most_common(50))
len(train_data), len(valid_data), len(test_data), len(TEXT.vocab)

In [None]:
rnn_results, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", len(train_data), 10, rnn_metric_results=[])
rnn_results_df = rnn_results_df.append(pd.DataFrame(rnn_results), ignore_index = True)


In [None]:
rnn_results_df

In [None]:
rnn_results_df.to_csv("RNN_results.csv", index = False)

<h2> P2 Abstract ALL</h2>

In [None]:
path = './'
train_file = "train.csv"
test_file = "test.csv"
target_field = "abstract"
tokenizer = custom_tokenizer_P2
MAX_VOCAB_SIZE = 10000

TEXT, LABEL, train_data, valid_data, test_data = load_data(path, train_file, test_file, target_field, tokenizer, MAX_VOCAB_SIZE)


train_iterator_IT, validation_IT, test_iterator_IT = generate_label_iterator(train_data, valid_data, test_data, "InformationTheory", target_field)
train_iterator_CL, validation_CL, test_iterator_CL = generate_label_iterator(train_data, valid_data, test_data, "ComputationalLinguistics", target_field)
train_iterator_CV, validation_CV, test_iterator_CV = generate_label_iterator(train_data, valid_data, test_data, "ComputerVision", target_field)

pos_weights = get_pos_weight(train_data)

model_IT, optimizer_IT = generate_model_and_optimizer(pos_weight=pos_weights[0])
model_CL, optimizer_CL = generate_model_and_optimizer(pos_weight=pos_weights[1])
model_CV, optimizer_CV = generate_model_and_optimizer(pos_weight=pos_weights[2])

N_EPOCHS = 30
label_names = ["InformationTheory", "ComputationalLinguistics", "ComputerVision"]
models = [model_IT, model_CL, model_CV]
optimizers = [optimizer_IT, optimizer_CL, optimizer_CV]
iterators = [(train_iterator_IT, validation_IT, test_iterator_IT), (train_iterator_CL, validation_CL, test_iterator_CL), (train_iterator_CV, validation_CV, test_iterator_CV)]
model_file_names = ["RNN_model_IT_A_P2_ALL", "RNN_model_CL_A_P2_ALL.pt", "RNN_model_CV_A_P2_ALL.pt"]
best_valid_losses = [float("inf"), float("inf"), float("inf")]

In [None]:
print(TEXT.vocab.freqs.most_common(50))
len(train_data), len(valid_data), len(test_data), len(TEXT.vocab)

In [None]:
rnn_results, best_model_states = train_loop(models, optimizers, iterators, N_EPOCHS, label_names, target_field, model_file_names, best_valid_losses, "P1", len(train_data), 10, rnn_metric_results=[])
rnn_results_df = rnn_results_df.append(pd.DataFrame(rnn_results), ignore_index = True)
