In [None]:
!pip install transformers

In [None]:
!pip install sacremoses

In [None]:
import pandas as pd
import numpy as np
import re

from transformers import AutoModel, AutoTokenizer

# Root folder (Google Drive mount) that holds the CAMDA datasets and outputs.
PATH = '/content/drive/MyDrive/GRN/CAMDA/'

file_name = 'Datasets/DILI_data1.csv'
data_raw = pd.read_csv(PATH + file_name)
# Shuffle the rows once. The fixed seed keeps the row order -- and therefore
# the downstream train/validation split -- identical across kernel restarts.
data = data_raw.sample(frac=1, random_state=42)
# Missing titles/abstracts become empty strings so the concatenation below
# never contains the literal text "nan".
data['Abstract'] = data['Abstract'].fillna("")
data['Title'] = data['Title'].fillna("")
# One document per row: "<title>. <abstract>".
data['Documents'] = data['Title'].map(str) + '. ' + data['Abstract'].map(str)
labels = data['Label'].to_list()


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    """Convolutional text classifier over token embeddings.

    Several convolutions with different window heights slide over the
    embedded sequence; each feature map is max-pooled over time, the pooled
    features are concatenated, passed through dropout and a linear layer,
    and finally squashed with a sigmoid.

    Args:
        vocab_size: Number of rows of the embedding table when it is trained
            from scratch (ignored when ``from_transformer`` is given).
        emb_dim: Embedding width when trained from scratch; replaced by the
            width of the pretrained table when ``from_transformer`` is given.
        n_filters: Number of feature maps per filter size.
        filter_sizes: Window heights (in tokens) of the convolutions.
        output_dim: Number of output units.
        dropout: Dropout probability applied to the pooled features.
        from_transformer: ``False`` to train embeddings from scratch, or the
            name of a checkpoint whose word-embedding table is reused. Only
            names containing ``'bert'`` or ``'biogpt'`` are supported.

    Raises:
        ValueError: If ``from_transformer`` names an unsupported checkpoint.
    """

    def __init__(self, vocab_size, emb_dim=120, n_filters=10, filter_sizes=[3, 4, 5], output_dim=1, dropout=0.5, from_transformer = False):
        super(TextCNN, self).__init__()
        # Work on a private copy so the shared default list is never aliased.
        filter_sizes = list(filter_sizes)
        if from_transformer:
            is_bert = re.search('bert', from_transformer)
            is_biogpt = re.search('biogpt', from_transformer)
            # Fail early (before any download) instead of hitting an
            # AttributeError on the missing self.embedding further down.
            if not (is_bert or is_biogpt):
                raise ValueError(
                    f"Unsupported from_transformer '{from_transformer}': "
                    "expected a model name containing 'bert' or 'biogpt'."
                )
            model = AutoModel.from_pretrained(from_transformer)
            if is_bert:
                self.embedding = model.embeddings.word_embeddings
            else:
                self.embedding = model.embed_tokens
            emb_dim = self.embedding.weight.shape[-1]
        else:
            self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, emb_dim)) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        # Shortest sequence every convolution can still slide over.
        self.min_seq_len = max(filter_sizes, default=0)
        self.name = 'CNN'

    def forward(self, text):
        """Return sigmoid scores for a batch of token-id sequences.

        Args:
            text: Integer tensor of shape [batch_size, seq_len].

        Returns:
            Tensor of shape [batch_size] when ``output_dim == 1``, otherwise
            [batch_size, output_dim]. The batch axis is always kept.
        """
        # text: [batch_size, seq_len]
        embedded = self.embedding(text)
        # embedded: [batch_size, seq_len, embedding_dim]
        embedded = embedded.unsqueeze(1)
        # embedded: [batch_size, 1, seq_len, embedding_dim]

        # Sequences shorter than the widest filter would make Conv2d raise;
        # zero-pad the end of the sequence axis up to the minimum length.
        shortfall = self.min_seq_len - embedded.shape[2]
        if shortfall > 0:
            embedded = F.pad(embedded, (0, 0, 0, shortfall))

        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # conved_n: [batch_size, n_filters, seq_len - filter_sizes[n] + 1]

        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        # pooled_n: [batch_size, n_filters]

        cat = self.dropout(torch.cat(pooled, dim=1))
        # cat: [batch_size, n_filters * len(filter_sizes)]

        output = self.fc(cat)
        # output: [batch_size, output_dim]

        # Squeeze only the last axis: a bare squeeze() would also drop the
        # batch axis for a batch of one and hand callers a 0-d tensor.
        return torch.sigmoid(output.squeeze(-1))

In [None]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTM(nn.Module):
    """Single-layer bidirectional LSTM classifier over token embeddings.

    The last valid forward state and the first backward state of each
    sequence are concatenated, passed through dropout and a linear layer,
    and squashed with a sigmoid.

    Args:
        vocab_size: Number of rows of the embedding table when it is trained
            from scratch (ignored when ``from_transformer`` is given).
        emb_dim: Embedding width when trained from scratch; replaced by the
            width of the pretrained table when ``from_transformer`` is given.
        dimension: Hidden size of each LSTM direction.
        from_transformer: ``False`` to train embeddings from scratch, or the
            name of a checkpoint whose word-embedding table is reused. Only
            names containing ``'bert'`` or ``'biogpt'`` are supported.

    Raises:
        ValueError: If ``from_transformer`` names an unsupported checkpoint.
    """

    def __init__(self, vocab_size, emb_dim = 120, dimension=64, from_transformer = False):
        super(LSTM, self).__init__()
        if from_transformer:
            is_bert = re.search('bert', from_transformer)
            is_biogpt = re.search('biogpt', from_transformer)
            # Fail early (before any download) instead of hitting an
            # AttributeError on the missing self.embedding further down.
            if not (is_bert or is_biogpt):
                raise ValueError(
                    f"Unsupported from_transformer '{from_transformer}': "
                    "expected a model name containing 'bert' or 'biogpt'."
                )
            model = AutoModel.from_pretrained(from_transformer)
            if is_bert:
                self.embedding = model.embeddings.word_embeddings
            else:
                self.embedding = model.embed_tokens
            emb_dim = self.embedding.weight.shape[-1]
        else:
            self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=emb_dim,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.5)

        # Forward and backward summaries are concatenated, hence 2*dimension.
        self.fc = nn.Linear(2*dimension, 1)

        self.name = "LSTM"

    def forward(self, text, text_len):
        """Return one sigmoid score per sequence.

        Args:
            text: Integer tensor of shape [batch_size, seq_len] (padded).
            text_len: Tensor with the unpadded length of every sequence; it
                is used both for packing and for indexing the last valid step.

        Returns:
            Tensor of shape [batch_size] with values in [0, 1].
        """
        text_emb = self.embedding(text)

        # Packing makes the LSTM skip the padded positions of each sequence.
        packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Forward direction: state at the last valid token of each sequence.
        out_forward = output[range(len(output)), text_len - 1, :self.dimension]
        # Backward direction: its final state sits at the first position.
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        text_fea = self.drop(out_reduced)

        text_fea = self.fc(text_fea)
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.sigmoid(text_fea)

        return text_out

In [None]:
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize

class NLTKTokenizer:
    """Word-level tokenizer backed by NLTK with a corpus-derived vocabulary.

    ``collect_vocab`` must be called before ``encode`` or ``decode``; until
    then ``vocab``, ``word2id`` and ``id2word`` are ``None``.
    """

    def __init__(self):
        # word_tokenize relies on the Punkt model, so fetch it up front.
        nltk.download('punkt')
        self.vocab = None
        self.word2id = None
        self.id2word = None

    def tokenize(self, text):
        """Split ``text`` into a list of word tokens."""
        return word_tokenize(text)

    def collect_vocab(self, texts, min_freq=1):
        """Build the vocabulary and both lookup tables from ``texts``.

        Tokens occurring fewer than ``min_freq`` times are dropped. The
        special tokens '[UNK]' and '[PAD]' are always added, and ids follow
        the sorted order of the resulting vocabulary.
        """
        frequencies = Counter()
        for document in texts:
            frequencies.update(self.tokenize(document))

        frequent = [token for token, freq in frequencies.items() if freq >= min_freq]
        self.vocab = sorted(frequent + ['[UNK]', '[PAD]'])

        self.word2id = dict(zip(self.vocab, range(len(self.vocab))))
        self.id2word = {index: token for token, index in self.word2id.items()}

    def encode(self, text, max_length=None, padding=False, truncation=True):
        """Convert ``text`` into a list of token ids.

        Out-of-vocabulary tokens map to the '[UNK]' id. With ``truncation``
        and a ``max_length`` the result is cut to ``max_length`` ids; with
        ``padding`` it is right-padded with the '[PAD]' id up to
        ``max_length``.

        Raises:
            ValueError: If no vocabulary has been collected yet, or if
                ``padding`` is requested without ``max_length``.
        """
        if not self.word2id:
            raise ValueError("Vocabulary has not been collected. Call collect_vocab() first.")

        unknown_id = self.word2id.get('[UNK]')
        ids = [self.word2id.get(word, unknown_id) for word in self.tokenize(text)]

        if truncation and max_length:
            ids = ids[:max_length]

        if not padding:
            return ids

        if not max_length:
            raise ValueError("max_length must be provided for padding.")
        missing = max_length - len(ids)
        if missing > 0:
            ids = ids + [self.word2id['[PAD]']] * missing
        return ids

    def decode(self, token_ids):
        """Map ids back to tokens and join them with single spaces.

        Raises:
            ValueError: If no vocabulary has been collected yet.
        """
        if not self.id2word:
            raise ValueError("Vocabulary has not been collected. Call collect_vocab() first.")

        return " ".join(self.id2word[token_id] for token_id in token_ids)

# Checkpoint whose tokenizer is loaded here; set it to False to fall back to
# the NLTK tokenizer defined above.
from_transformer = 'allenai/scibert_scivocab_uncased'
# from_transformer = 'microsoft/biogpt'
# from_transformer = False
tokenizer = AutoTokenizer.from_pretrained(from_transformer) if from_transformer else NLTKTokenizer()

# encoded_tokens = tokenizer.encode("This is an example sentence.", max_length=10, padding=True)
# print("Encoded Tokens:", encoded_tokens)

# decoded_text = tokenizer.decode(encoded_tokens)
# print("Decoded Text:", decoded_text)


Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [None]:
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

class TextDataset(Dataset):
    """Index-aligned pairing of samples and their labels."""

    def __init__(self, data, labels):
        # Both containers are stored as given and indexed with the same key.
        self.data, self.labels = data, labels

    def __len__(self):
        """Number of samples, taken from the data container."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        sample = self.data[index]
        target = self.labels[index]
        return sample, target

def collate_fn(batch):
    """Assemble ``(sequence, label)`` pairs into padded batch tensors.

    Returns:
        A tuple ``(padded, lengths, labels)``: the sequences right-padded
        with 0 to the longest one in the batch (batch first), the original
        length of every sequence, and the labels as a float32 tensor.
    """
    sequences = [pair[0] for pair in batch]
    targets = [pair[1] for pair in batch]
    seq_lengths = torch.tensor([len(seq) for seq in sequences])
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, seq_lengths, torch.tensor(targets, dtype=torch.float32)

def tokenize_and_pad(texts, tokenizer, max_len):
    """Encode every text and keep at most ``max_len`` token ids of each.

    Despite the name, no padding happens here: the result is a list of 1-D
    tensors of varying length.

    Args:
        texts: Iterable of strings.
        tokenizer: Object whose ``encode(text)`` returns a sequence of ids.
        max_len: Maximum number of ids kept per text.

    Returns:
        List of 1-D ``torch.long`` tensors, one per text.
    """
    # An explicit dtype keeps empty encodings valid as embedding indices;
    # torch.tensor([]) would otherwise default to float32.
    return [torch.tensor(tokenizer.encode(t)[:max_len], dtype=torch.long) for t in texts]

texts = data['Documents'].tolist()

# Maximum number of token ids kept per document.
max_len = 512

# The NLTK fallback tokenizer has no vocabulary until it has seen the corpus.
if not from_transformer:
    tokenizer.collect_vocab(texts)

tokenized_data = tokenize_and_pad(texts, tokenizer, max_len)

# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(tokenized_data, labels, test_size=0.2, random_state=42)

# Create DataLoaders
train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)

batch_size = 16
# Reshuffle the training samples every epoch so the optimizer does not see
# the exact same batch composition and order on each pass.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Validation order does not affect the metrics, so it is left unshuffled.
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)



In [None]:
from transformers import BertForSequenceClassification, BertConfig


# Model, loss and optimizer setup
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# BioGPT's tokenizer is queried through vocab_size; every other tokenizer is
# measured through the length of its vocab container.
vocab_size = (
    tokenizer.vocab_size
    if re.search('biogpt', str(from_transformer))
    else len(tokenizer.vocab)
)

model = LSTM(vocab_size=vocab_size, emb_dim=768, from_transformer=from_transformer).to(device)

# model = TextCNN(vocab_size, emb_dim=120, n_filters=10, filter_sizes=[3, 4, 5], output_dim=1, dropout=0.001, from_transformer = False).to(device)

# config = BertConfig.from_pretrained("bert-base-uncased")
# model = BertForSequenceClassification(config).to(device)

# focal_loss is a switch read by a later cell; BCELoss is the criterion
# assigned here.
focal_loss = True
criterion = nn.BCELoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4)

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def sigmoid_focal_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    alpha: float = 0.75,
    gamma: float = 2,
    reduction: str = "sum",
) -> torch.Tensor:
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.

    Args:
        inputs (Tensor): A float tensor of arbitrary shape holding the raw,
                unnormalized predictions (logits) for each example. The
                sigmoid is applied inside this function, so do not pass
                probabilities.
        targets (Tensor): A float tensor with the same shape as inputs. Stores the binary
                classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        alpha (float): Weighting factor in range (0,1) to balance
                positive vs negative examples; any negative value disables
                the weighting. Default: ``0.75``.
        gamma (float): Exponent of the modulating factor (1 - p_t) to
                balance easy vs hard examples. Default: ``2``.
        reduction (string): ``'none'`` | ``'mean'`` | ``'sum'``
                ``'none'``: No reduction will be applied to the output.
                ``'mean'``: The output will be averaged.
                ``'sum'``: The output will be summed. Default: ``'sum'``.
    Returns:
        Loss tensor with the reduction option applied.
    Raises:
        ValueError: If ``reduction`` is not one of the supported modes.
    """
    # Original implementation from https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py

    # Reject a bad reduction before spending any compute on the loss.
    if reduction not in ("none", "mean", "sum"):
        raise ValueError(
            f"Invalid Value for arg 'reduction': '{reduction}' \n Supported reduction modes: 'none', 'mean', 'sum'"
        )

    p = torch.sigmoid(inputs)
    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    # p_t is the probability the model assigns to the true class.
    p_t = p * targets + (1 - p) * (1 - targets)
    # Down-weight well-classified examples (p_t close to 1).
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        # alpha weights positives, (1 - alpha) weights negatives.
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    if reduction == "mean":
        loss = loss.mean()
    elif reduction == "sum":
        loss = loss.sum()
    return loss

if focal_loss:
    # The models in this notebook return sigmoid probabilities, whereas
    # sigmoid_focal_loss expects raw logits and applies the sigmoid itself.
    # Map the probabilities back to logits so the sigmoid is not applied
    # twice; eps clamps the input away from 0 and 1 to keep the logit finite.
    def criterion(probs, targets):
        """Focal loss for models that output probabilities instead of logits."""
        return sigmoid_focal_loss(torch.logit(probs, eps=1e-6), targets)


In [None]:
# Display which loss callable is currently bound to `criterion`.
criterion

In [None]:
from tqdm import tqdm 

# Training and Validation
def _run_epoch(loader, training):
    """Run one pass over ``loader`` and return ``(total_loss, accuracy, f1)``.

    With ``training=True`` the model is put in train mode and the optimizer
    is stepped after every batch; otherwise the model is put in eval mode
    and gradients are disabled. Uses the notebook-level ``model``,
    ``criterion``, ``optimizer`` and ``device``.
    """
    if training:
        model.train()
    else:
        model.eval()

    total_loss = 0
    scores, targets = [], []
    with torch.set_grad_enabled(training):
        for token_ids, lengths, batch_labels in loader:
            token_ids = token_ids.to(device)
            # pack_padded_sequence expects the lengths on the CPU.
            lengths = lengths.to('cpu')
            batch_labels = batch_labels.to(device)

            if training:
                optimizer.zero_grad()
            # Only the LSTM takes the sequence lengths as a second argument.
            if model.name == 'LSTM':
                outputs = model(token_ids, lengths)
            else:
                outputs = model(token_ids)
            # Align with the labels so a batch of one cannot collapse to a
            # 0-d tensor and break the list accumulation below.
            outputs = outputs.reshape(batch_labels.shape)

            loss = criterion(outputs, batch_labels)
            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            scores += outputs.detach().cpu().numpy().tolist()
            targets += batch_labels.detach().cpu().numpy().tolist()

    # Scores above 0.5 count as the positive class.
    hard_preds = np.array(scores) > 0.5
    return total_loss, accuracy_score(targets, hard_preds), f1_score(targets, hard_preds)


num_epochs = 4
for epoch in range(num_epochs):
    train_loss, train_acc, train_f1 = _run_epoch(train_loader, training=True)
    val_loss, val_acc, val_f1 = _run_epoch(val_loader, training=False)

    # Losses are reported as the per-batch average of the summed batch losses.
    print(f"Epoch: {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss/len(train_loader):.4f} | "
          f"Train Acc: {train_acc:.4f} | "
          f"Train F1: {train_f1:.4f} | "
          f"Val Loss: {val_loss/len(val_loader):.4f} | "
          f"Val Acc: {val_acc:.4f} | "
          f"Val F1: {val_f1:.4f}")


Epoch: 1/4 | Train Loss: 0.5465 | Train Acc: 0.9668 | Train F1: 0.9664 | Val Loss: 0.5819 | Val Acc: 0.9380 | Val F1: 0.9400
Epoch: 2/4 | Train Loss: 0.5208 | Train Acc: 0.9800 | Train F1: 0.9799 | Val Loss: 0.5729 | Val Acc: 0.9447 | Val F1: 0.9460
Epoch: 3/4 | Train Loss: 0.5118 | Train Acc: 0.9847 | Train F1: 0.9846 | Val Loss: 0.5774 | Val Acc: 0.9444 | Val F1: 0.9469
Epoch: 4/4 | Train Loss: 0.5035 | Train Acc: 0.9892 | Train F1: 0.9891 | Val Loss: 0.5779 | Val Acc: 0.9433 | Val F1: 0.9458


In [None]:
# Exactly one of the reads below should be active; it selects the challenge
# file to classify.
# data = pd.read_csv(PATH + 'Datasets/Test/CAMDA 2022 T1 test.csv', sep = '\t')

data = pd.read_csv(PATH + 'Datasets/Test/CAMDA 2022 T2 test.csv', sep = '\t')

# data = pd.read_csv(PATH + 'Datasets/Test/CAMDA 2022 T3 test.csv', sep = '\t')

# data = pd.read_csv(PATH + 'Datasets/Validation/CAMDA 2022 T1 validation.csv')

# data = pd.read_csv(PATH + 'Datasets/Validation/CAMDA 2022 T2 validation.csv')

# data = pd.read_csv(PATH + 'Datasets/Validation/CAMDA 2022 T3 validation.csv')

# data = pd.read_csv(PATH + 'Datasets/Validation/CAMDA 2022 V4 validation.csv')

data['Documents'] = data[',Description'].map(str)

# Remove square brackets and upper-case prefixes of the form "ABC: ".
# The pattern is a raw string so the backslashes reach the regex engine
# without triggering invalid-escape-sequence warnings.
docs = [re.sub(r'[A-Z]+\: |\[|\]', '', doc) for doc in data['Documents'].to_list()]

tokenized_data = tokenize_and_pad(docs, tokenizer, max_len)

len(data)

21724

In [None]:
import time
from tqdm import tqdm 

results = []
preds = []
t1 = time.time()
# Inference must not depend on the mode the training cell left the model in:
# disable dropout explicitly and skip gradient tracking.
model.eval()
with torch.no_grad():
    for tokens in tqdm(tokenized_data):
        batch = tokens.to(device).unsqueeze(0)

        # Only the LSTM takes the sequence lengths as a second argument.
        if model.name == 'LSTM':
            length = torch.tensor([len(tokens)])
            pred = model(batch, length)
        else:
            pred = model(batch)

        pred = pred.squeeze().item()
        preds.append(pred)
        # Scores above 0.5 count as the positive class.
        results.append(1 if pred > 0.5 else 0)

print("Classification time: %s\t min" % (round((time.time()-t1)/60, 3)))

# NOTE(review): the active output path says "Validation/v2" -- confirm it
# matches the challenge file that was loaded for this run.
# with open('/content/drive/MyDrive/GRN/CAMDA/Submissions/Test3/t3_lstm.csv', 'w') as f:
with open('/content/drive/MyDrive/GRN/CAMDA/Submissions/Validation/v2_lstm.csv', 'w') as f:
    f.write('\n'.join([str(int(i)) for i in results]))

    


100%|██████████| 21724/21724 [02:17<00:00, 157.77it/s]

Classification time: 2.295	 min





In [None]:
# Stricter decision rule: only scores above 0.8 are labelled positive.
results_adj = (torch.tensor(preds) > 0.8).long()
# Number of documents labelled positive under the stricter threshold.
results_adj.sum()

tensor(2861)

In [None]:
# Serializes the whole model object (not just its state_dict), so loading it
# back requires the model class definition to be available.
torch.save(model, '/content/lstm_dili.pt')

In [None]:

# One 0/1 label per line, in the same order as the classified documents.
# with open('/content/drive/MyDrive/GRN/CAMDA/Submissions/Test3/t3_lstm_adj.csv', 'w') as f:
with open('/content/drive/MyDrive/GRN/CAMDA/Submissions/Validation/v2_lstm_adj.csv', 'w') as f:
    f.write('\n'.join(map(str, results_adj.tolist())))

In [None]:
# Second copy of the full model object, written under the Drive project folder.
torch.save(model, '/content/drive/MyDrive/GRN/CAMDA/Models/lstm_dili.pt')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]