In [None]:
jupyter nbextension enable --py widgetsnbextension

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModel

from torch import cuda
# Use the GPU when torch reports CUDA as available; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"Device : {device}")

Device : cuda


In [None]:
# --- Tokenization ---
MAX_LEN = 256             # forwarded to CustomDataset as the tokenizer max_length

# --- DataLoader batch sizes ---
TRAIN_BATCH_SIZE = 32     # batch size of training_loader
VAL_BATCH_SIZE = 32       # batch size of testing_loader

# --- Optimization ---
EPOCHS = 3                # number of passes made by the training loop
LEARNING_RATE = 2e-5      # learning rate handed to the Adam optimizer

In [None]:
# Tokenizer selection: keep exactly one of the assignments below uncommented.
# 'bert-base-cased' is the checkpoint loaded by BERTClass, and
# 'allenai/scibert_scivocab_cased' is the checkpoint loaded by SciBERTClass,
# so the tokenizer chosen here should be the one that matches the model cell.

#tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one row of a DataFrame per item.

    Args:
        dataframe: Frame exposing a ``Text`` column (the raw text) and a
            ``category`` column (the target vector for that row).
        tokenizer: Object providing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used below.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.text = self.data.Text
        self.targets = self.data.category
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the text column."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict with long tensors ``ids``, ``mask`` and ``token_type_ids``
            and a float tensor ``targets``.
        """
        # Positional access: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame has a fresh RangeIndex.
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace (including newlines) to one space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`; truncation is requested explicitly so
            # every item comes back with exactly `max_len` entries.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }


class BERTClass(torch.nn.Module):
    """'bert-base-cased' encoder followed by dropout and a linear output layer.

    Args:
        num_labels: Width of the output layer (default 11, the previous
            hard-coded value).
        dropout: Dropout probability applied before the output layer
            (default 0.3, the previous hard-coded value).
    """

    def __init__(self, num_labels=11, dropout=0.3):
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple, which
        # forward() unpacks positionally.
        self.l1 = BertModel.from_pretrained('bert-base-cased', return_dict=False)
        self.l2 = torch.nn.Dropout(dropout)
        # Take the input width from the loaded encoder's config rather than
        # hard-coding 768, so the head always matches the checkpoint.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) output of the linear layer."""
        # Only the second element of the encoder's tuple output is used.
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        dropped = self.l2(pooled)
        return self.l3(dropped)


class SciBERTClass(torch.nn.Module):
    """'allenai/scibert_scivocab_cased' encoder with dropout and a linear output layer.

    Args:
        num_labels: Width of the output layer (default 11, the previous
            hard-coded value).
        dropout: Dropout probability applied before the output layer
            (default 0.3, the previous hard-coded value).
    """

    def __init__(self, num_labels=11, dropout=0.3):
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple, which
        # forward() unpacks positionally.
        self.l1 = AutoModel.from_pretrained("allenai/scibert_scivocab_cased", return_dict=False)
        self.l2 = torch.nn.Dropout(dropout)
        # Take the input width from the loaded encoder's config rather than
        # hard-coding 768, so the head always matches the checkpoint.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) output of the linear layer."""
        # Only the second element of the encoder's tuple output is used.
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        dropped = self.l2(pooled)
        return self.l3(dropped)


def loss_fn(outputs, targets):
    """Compute ``BCEWithLogitsLoss`` (default settings) of ``outputs`` against ``targets``."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)


def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` in place.

    Uses the notebook-level names ``model``, ``training_loader``,
    ``optimizer``, ``device`` and ``loss_fn``.

    Args:
        epoch: Epoch number; only used in the progress message.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        # Print the current batch loss every 500 batches.
        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear stale gradients once, immediately before backpropagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


def validation(epoch):
    """Score every batch of ``testing_loader`` with ``model`` in eval mode.

    Uses the notebook-level names ``model``, ``testing_loader`` and ``device``.

    Args:
        epoch: Accepted for symmetry with ``train``; not read by this function.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists of per-sample lists, the
        sigmoid of the model outputs and the matching targets, in loader order.
    """
    model.eval()
    collected_outputs, collected_targets = [], []
    with torch.no_grad():
        for batch in testing_loader:
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            batch_targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(input_ids, attention_mask, segment_ids)
            probabilities = torch.sigmoid(logits)

            collected_targets.extend(batch_targets.cpu().detach().numpy().tolist())
            collected_outputs.extend(probabilities.cpu().detach().numpy().tolist())
    return collected_outputs, collected_targets


def get_data(csv_path='F:/ML/PaperClassification/Data/SampledArxiv.csv',
             train_size=0.8, random_state=42):
    """Load the CSV, split it into train/test frames and wrap both as datasets.

    Uses the notebook-level names ``tokenizer``, ``MAX_LEN`` and
    ``CustomDataset``.

    Args:
        csv_path: Location of the CSV file. The default is the original
            author's local path; pass your own path on another machine.
        train_size: Fraction of rows sampled into the training split.
        random_state: Seed for the random row sampling.

    Returns:
        ``(training_set, testing_set)`` as ``CustomDataset`` instances.
    """
    df = pd.read_csv(csv_path)
    # Collapse every column after the first into one list-valued 'category'
    # column. NOTE(review): assumes the first column is 'Text' and the rest
    # are the per-label columns; confirm against the CSV schema.
    df['category'] = df[df.columns[1:]].values.tolist()
    data = df[['Text', 'category']].copy()

    train_dataset = data.sample(frac=train_size, random_state=random_state)
    # The test split is whatever was not sampled; both splits get a fresh
    # 0..n-1 index afterwards.
    test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print("FULL Dataset: {}".format(data.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(test_dataset.shape))

    training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
    testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

    return training_set, testing_set

In [None]:
training_set, testing_set = get_data()

# Training batches are drawn in a new random order on every pass.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
}
# Evaluation gains nothing from shuffling; a fixed order keeps the lists
# returned by validation() aligned with the row order of the test split.
test_params = {
    'batch_size': VAL_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (710407, 2)
TRAIN Dataset: (568326, 2)
TEST Dataset: (142081, 2)


In [None]:
# Model selection: keep exactly one of the constructors below uncommented.

#model = BERTClass()
# Module.to() returns the module itself, so construction and device placement
# can be chained.
model = SciBERTClass().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Run EPOCHS full passes over the training data; train() prints the batch
# loss every 500 steps.
for epoch in range(EPOCHS):
    train(epoch)



Epoch: 0, Loss:  0.6391966938972473
Epoch: 0, Loss:  0.16672003269195557
Epoch: 0, Loss:  0.12156003713607788
Epoch: 0, Loss:  0.0771590992808342
Epoch: 0, Loss:  0.1375623643398285
Epoch: 0, Loss:  0.0667928010225296
Epoch: 0, Loss:  0.10788874328136444
Epoch: 0, Loss:  0.08249708265066147
Epoch: 0, Loss:  0.07441641390323639
Epoch: 0, Loss:  0.07802104949951172
Epoch: 0, Loss:  0.07774622738361359
Epoch: 0, Loss:  0.10352782160043716
Epoch: 0, Loss:  0.08946099132299423
Epoch: 0, Loss:  0.0959123745560646
Epoch: 0, Loss:  0.0807165801525116
Epoch: 0, Loss:  0.1299416422843933
Epoch: 0, Loss:  0.0773983895778656
Epoch: 0, Loss:  0.04049670696258545
Epoch: 0, Loss:  0.09803441911935806
Epoch: 0, Loss:  0.10183229297399521
Epoch: 0, Loss:  0.09978660196065903
Epoch: 0, Loss:  0.05341735854744911
Epoch: 0, Loss:  0.06825852394104004
Epoch: 0, Loss:  0.08274786174297333
Epoch: 0, Loss:  0.06359201669692993
Epoch: 0, Loss:  0.07750534266233444
Epoch: 0, Loss:  0.1036125123500824
Epoch: 0, 

In [None]:
# Training has already finished, so the weights no longer change: one
# evaluation pass is enough (looping EPOCHS times just repeated the same
# metrics). validation() does not read its argument; the index of the last
# completed epoch is passed for clarity.
outputs, targets = validation(EPOCHS - 1)
# Turn the sigmoid scores into boolean predictions at a 0.5 threshold.
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7498891477396696
F1 Score (Micro) = 0.8612193564179542
F1 Score (Macro) = 0.8033602262464462




Accuracy Score = 0.7498891477396696
F1 Score (Micro) = 0.8612193564179542
F1 Score (Macro) = 0.8033602262464462




Accuracy Score = 0.7498891477396696
F1 Score (Micro) = 0.8612193564179542
F1 Score (Macro) = 0.8033602262464462


In [None]:
import os

output_model_file = 'BERT_3/BERT_3.bin'
output_vocab_file = './BERT_3/'

# Create the target directory first; torch.save cannot write into a
# directory that does not exist.
os.makedirs(os.path.dirname(output_model_file), exist_ok=True)
os.makedirs(output_vocab_file, exist_ok=True)

model_to_save = model
# NOTE(review): this pickles the whole module object, so loading it later
# requires the same class definition to be importable and should only be done
# with trusted files. Saving model.state_dict() is the more portable option,
# but it changes the file format, so it is left as-is here.
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

('./BERT_3/vocab.txt',)