In [33]:
# Mount Google Drive into the Colab filesystem at /content/drive so the project
# folder used by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
!pip install transformers



In [35]:
# Work from the project folder on Drive so that relative paths (such as the CSV
# read a few cells below) resolve inside it.
%cd '/content/drive/MyDrive/url_multilabel'

/content/drive/MyDrive/url_multilabel


In [36]:
import os
import pandas as pd
import numpy as np
import shutil
import sys
import tqdm.notebook as tq
from collections import defaultdict

import torch
import torch.nn as nn

# Seed NumPy and PyTorch before anything random happens, so that the classifier
# head initialisation, dropout masks and DataLoader shuffling draw from the same
# random streams on every Restart & Run All.
SEED = 77
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
# Load the labelled dataset; the relative file name resolves against the current
# working directory.
df_data = pd.read_csv('final_output.csv')
# Show the frame for a quick visual check of its columns and rows.
df_data

Unnamed: 0.1,Unnamed: 0,URL,content,0ther,Blog,Commercial/promotional,Content delivery,Entertainment,Error message,Gateway,Index,Informative,Journalistic,Personal,Scientific,Shopping
0,0,http://www.cybersexnetwork3.com/hardbodies/buk...,{'summary_text': 'We are currently manufacturi...,1,0,0,0,0,0,0,1,0,0,0,0,0
1,1,http://www.dirty-cartoonpictures.com/,{'summary_text': 'if you attended one of the f...,1,0,0,1,1,0,0,0,0,0,0,0,0
2,2,http://www.angstnromance.net/nanda/fic/sg1.html,{'summary_text': 'We only charge contingency f...,1,0,0,0,0,0,0,0,0,0,0,0,0
3,3,http://www.geocities.com/youraveragegirl/,{'summary_text': 'This site uses cookies. By u...,0,1,0,0,0,0,0,0,0,0,1,0,0
4,4,http://www.erostranssexuals.com,"{'summary_text': ""Explore last year's mileston...",1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2158,2158,http://www.botany.hawaii.edu/faculty/carr/rham...,{'summary_text': 'The Rhamnaceae are mainly tr...,0,0,0,0,0,0,0,0,0,0,1,0,0
2159,2159,http://www.ibiblio.org/pfaf/cgi-bin/arr_html?t...,"{'summary_text': ""You don't have permission to...",0,0,0,0,0,1,0,0,0,0,0,0,0
2160,2160,http://scitec.uwichill.edu.bb/bcs/staff/lec/fe...,"404 Not Found, Error: Name or service not known",0,0,0,0,0,1,0,0,0,0,0,0,0
2161,2161,http://www.collateralthx.com,"404 Not Found, Error: Name or service not known",0,0,0,0,0,1,0,0,0,0,0,0,0


In [38]:
# Build the model input text by joining each page's URL with its scraped
# content. Missing values are replaced by "" first: string concatenation with
# NaN would otherwise make the whole combined text NaN.
# The raw text columns and the saved-index column "Unnamed: 0" are then dropped
# (`columns=` already selects the axis, so no `axis=` argument is needed).
# `combined` ends up as the last column.
df_data = (
    df_data
    .assign(combined=df_data["URL"].fillna("") + ". " + df_data["content"].fillna(""))
    .drop(columns=["URL", "content", "Unnamed: 0"])
)

In [39]:
df_data

Unnamed: 0,0ther,Blog,Commercial/promotional,Content delivery,Entertainment,Error message,Gateway,Index,Informative,Journalistic,Personal,Scientific,Shopping,combined
0,1,0,0,0,0,0,0,1,0,0,0,0,0,http://www.cybersexnetwork3.com/hardbodies/buk...
1,1,0,0,1,1,0,0,0,0,0,0,0,0,http://www.dirty-cartoonpictures.com/. {'summa...
2,1,0,0,0,0,0,0,0,0,0,0,0,0,http://www.angstnromance.net/nanda/fic/sg1.htm...
3,0,1,0,0,0,0,0,0,0,0,1,0,0,http://www.geocities.com/youraveragegirl/. {'s...
4,1,0,0,0,0,0,0,0,0,0,0,0,0,http://www.erostranssexuals.com. {'summary_tex...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2158,0,0,0,0,0,0,0,0,0,0,1,0,0,http://www.botany.hawaii.edu/faculty/carr/rham...
2159,0,0,0,0,0,1,0,0,0,0,0,0,0,http://www.ibiblio.org/pfaf/cgi-bin/arr_html?t...
2160,0,0,0,0,0,1,0,0,0,0,0,0,0,http://scitec.uwichill.edu.bb/bcs/staff/lec/fe...
2161,0,0,0,0,0,1,0,0,0,0,0,0,0,"http://www.collateralthx.com. 404 Not Found, E..."


In [40]:
from sklearn.model_selection import train_test_split

# 70% of the rows are used for training; the remaining 30% is held out ...
df_train, df_holdout = train_test_split(
    df_data, test_size=0.30, shuffle=True, random_state=77
)
# ... and the held-out part is halved into the test and validation sets.
df_test, df_valid = train_test_split(
    df_holdout, test_size=0.50, shuffle=True, random_state=88
)

In [41]:
# Hyperparameters
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
EPOCHS = 20
LEARNING_RATE = 1e-05
THRESHOLD = 0.5 # threshold for the sigmoid

In [42]:
from transformers import BertTokenizer, BertModel

In [43]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that the model class
# loads further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [44]:
df_train['combined']

1666    http://ecoles.uneq.qc.ca. 404 Not Found, Error...
844     http://www.sdmcu.org/. {'summary_text': 'Rates...
1970    http://www.usnews.com/usnews/rankguide/rghome....
990     http://www.sppre.com. {'summary_text': 'Buy no...
117     http://myspace.virgin.net/malcolm.chorley/. 40...
                              ...                        
736     http://www.colonialconcepts.com/. 404 Not Foun...
927     http://hoffmanservices.com/. {'summary_text': ...
1317    http://en.wikipedia.org/wiki/bsd/os. Other rea...
2004    http://www.umiacs.umd.edu/research/parallel/in...
607     http://www.lastcut.com. {'summary_text': 'The ...
Name: combined, Length: 1514, dtype: object

In [45]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each row's text with its label vector.

    Args:
        df: DataFrame with a 'combined' text column plus one column per label.
        tokenizer: object exposing ``encode_plus`` (a BERT tokenizer in this
            notebook).
        max_len: length passed to the tokenizer as ``max_length``; examples are
            padded to it and truncated at it.
        target_list: names of the label columns, in the order they should
            appear in the target vector.
    """

    # Tensor fields copied from the tokenizer output into every sample.
    _ENCODED_KEYS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, df, tokenizer, max_len, target_list):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.title = list(df['combined'])
        self.targets = df[target_list].values

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        # Collapse every run of whitespace into a single space and strip the ends.
        text = " ".join(str(self.title[index]).split())
        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The tokenizer output is flattened so each field is one-dimensional.
        sample = {key: encoded[key].flatten() for key in self._ENCODED_KEYS}
        sample['targets'] = torch.FloatTensor(self.targets[index])
        sample['title'] = text
        return sample

In [46]:
# Every column except the last one (the text column) is treated as a label.
target_list = df_data.columns[:-1].tolist()
target_list

['0ther',
 'Blog',
 'Commercial/promotional',
 'Content delivery',
 'Entertainment',
 'Error message',
 'Gateway',
 'Index',
 'Informative',
 'Journalistic',
 'Personal',
 'Scientific',
 'Shopping']

In [47]:
# One CustomDataset per split; all three share the tokenizer, the sequence
# length and the label columns.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN, target_list)
    for split_df in (df_train, df_valid, df_test)
)

In [48]:
# Data loaders
def _make_loader(dataset, batch_size, shuffle):
    """Wrap `dataset` in a DataLoader that loads in the main process."""
    return torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0
    )


# Only the training loader shuffles; validation and test keep dataset order.
train_data_loader = _make_loader(train_dataset, TRAIN_BATCH_SIZE, shuffle=True)
val_data_loader = _make_loader(valid_dataset, VALID_BATCH_SIZE, shuffle=False)
test_data_loader = _make_loader(test_dataset, TEST_BATCH_SIZE, shuffle=False)

In [49]:
import gc

# Run a garbage-collection pass first, then ask PyTorch to release the GPU
# memory its caching allocator is holding but no longer using.
gc.collect()

torch.cuda.empty_cache()

In [50]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    The head returns raw logits (no sigmoid), one per label.

    Args:
        num_labels: number of output logits. Defaults to 13, the value that was
            previously hard-coded.
        dropout: dropout probability applied to the pooled encoder output.
        pretrained_name: checkpoint name passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, num_labels=13, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_name, return_dict=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Take the head's input width from the encoder config instead of
        # hard-coding 768, so other checkpoints work without edits.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the label logits for a batch of tokenized inputs."""
        encoded = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.linear(pooled)

# Build the classifier; BERTClass.__init__ loads the pretrained BERT weights.
model = BERTClass()

# Optional: freeze the BERT encoder so that only the layers after it are trained.
# Left disabled because it was tested and converged worse.
# for param in model.bert_model.parameters():
#     param.requires_grad = False

# Move the model to the selected device. As the last expression of the cell,
# this also displays the module tree.
model.to(device)

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [51]:
# The loss takes raw logits: the sigmoid is applied inside the loss itself,
# which is numerically more stable (log-sum-exp trick) than a separate Sigmoid
# layer followed by a plain binary cross-entropy.
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and their targets."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [52]:
# define the optimizer
# PyTorch's AdamW is used here: the AdamW class shipped with transformers is
# deprecated in favour of it (its deprecation notice announces its removal).
# eps=1e-6 and weight_decay=0.0 reproduce the defaults of the transformers
# implementation, and the learning rate now comes from the hyperparameter cell
# instead of a repeated literal.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0
)



In [53]:
# Training of the model for one epoch
def train_model(training_loader, model, optimizer):
    """Train `model` for one pass over `training_loader`.

    Args:
        training_loader: sized iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask', 'token_type_ids' and 'targets'.
        model: module called as ``model(ids, mask, token_type_ids)`` and
            returning logits with the same shape as the targets.
        optimizer: optimizer stepped once per batch.

    Returns:
        (model, accuracy, mean_loss). The accuracy is element-wise: the share
        of individual (sample, label) cells predicted correctly after a
        sigmoid and rounding. The loss is the mean of the per-batch losses.

    Raises:
        ValueError: if the loader yields no samples (previously this surfaced
            as a bare ZeroDivisionError).
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to training mode (activate dropout, batch norm)
    model.train()
    # initialize the progress bar
    loop = tq.tqdm(training_loader, total=len(training_loader),
                   leave=True, colour='steelblue')
    for data in loop:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # forward: raw logits, one row per sample in the batch
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        loss_value = loss.item()
        losses.append(loss_value)

        # training accuracy: apply sigmoid, then round (i.e. threshold at 0.5)
        preds = torch.sigmoid(outputs).detach().cpu().numpy().round()
        labels = targets.detach().cpu().numpy()
        correct_predictions += np.sum(preds == labels)
        num_samples += labels.size   # total number of elements in the 2D array

        # backward
        optimizer.zero_grad()
        loss.backward()
        # cap the global gradient norm before the update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # grad descent step
        optimizer.step()

        # show the current batch loss on the progress bar (a plain float, so
        # no tensor is kept alive by the bar)
        loop.set_postfix(batch_loss=loss_value)

    if num_samples == 0:
        raise ValueError("training_loader yielded no samples")

    # returning: trained model, model accuracy, mean loss
    return model, float(correct_predictions) / num_samples, np.mean(losses)

In [54]:
def eval_model(validation_loader, model, optimizer=None):
    """Evaluate `model` on `validation_loader` without updating any weights.

    Args:
        validation_loader: iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask', 'token_type_ids' and 'targets'.
        model: module called as ``model(ids, mask, token_type_ids)`` and
            returning logits with the same shape as the targets.
        optimizer: unused. Kept (now optional) so existing three-argument
            calls keep working.

    Returns:
        (accuracy, mean_loss). The accuracy is element-wise: the share of
        individual (sample, label) cells predicted correctly after a sigmoid
        and rounding. The loss is the mean of the per-batch losses.

    Raises:
        ValueError: if the loader yields no samples (previously this surfaced
            as a bare ZeroDivisionError).
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)
            losses.append(loss_fn(outputs, targets).item())

            # validation accuracy: the loss applies the sigmoid internally, so
            # it has to be applied explicitly here before rounding
            preds = torch.sigmoid(outputs).cpu().numpy().round()
            labels = targets.cpu().numpy()
            correct_predictions += np.sum(preds == labels)
            num_samples += labels.size   # total number of elements in the 2D array

    if num_samples == 0:
        raise ValueError("validation_loader yielded no samples")

    return float(correct_predictions) / num_samples, np.mean(losses)

In [55]:
data_dir = '/content/drive/MyDrive/url_multilabel'
# torch.save does not create missing folders, so make sure the checkpoint
# directory exists before the first epoch instead of failing at the first save.
output_dir = os.path.join(data_dir, "output")
os.makedirs(output_dir, exist_ok=True)
best_model_path = os.path.join(output_dir, "MLTC_model_state.bin")

# per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'
history = defaultdict(list)
best_accuracy = 0

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model, optimizer)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # save the best model: keep the weights of the epoch with the highest
    # validation accuracy seen so far
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), best_model_path)
        best_accuracy = val_acc

Epoch 1/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.4221, val_loss=0.3396 train_acc=0.8432, val_acc=0.8743
Epoch 2/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.3248, val_loss=0.3280 train_acc=0.8789, val_acc=0.8746
Epoch 3/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.3130, val_loss=0.3217 train_acc=0.8815, val_acc=0.8788
Epoch 4/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.2980, val_loss=0.3147 train_acc=0.8887, val_acc=0.8779
Epoch 5/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.2839, val_loss=0.3155 train_acc=0.8938, val_acc=0.8741
Epoch 6/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.2641, val_loss=0.3179 train_acc=0.9041, val_acc=0.8807
Epoch 7/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.2422, val_loss=0.3214 train_acc=0.9136, val_acc=0.8791
Epoch 8/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.2222, val_loss=0.3313 train_acc=0.9225, val_acc=0.8767
Epoch 9/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.2030, val_loss=0.3415 train_acc=0.9284, val_acc=0.8738
Epoch 10/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.1876, val_loss=0.3451 train_acc=0.9344, val_acc=0.8762
Epoch 11/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.1720, val_loss=0.3541 train_acc=0.9398, val_acc=0.8757
Epoch 12/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.1589, val_loss=0.3669 train_acc=0.9435, val_acc=0.8724
Epoch 13/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.1472, val_loss=0.3662 train_acc=0.9480, val_acc=0.8750
Epoch 14/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.1331, val_loss=0.3789 train_acc=0.9548, val_acc=0.8712
Epoch 15/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.1240, val_loss=0.3944 train_acc=0.9577, val_acc=0.8724
Epoch 16/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.1133, val_loss=0.3891 train_acc=0.9635, val_acc=0.8734
Epoch 17/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.1026, val_loss=0.4051 train_acc=0.9684, val_acc=0.8708
Epoch 18/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.0949, val_loss=0.4062 train_acc=0.9728, val_acc=0.8736
Epoch 19/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.0882, val_loss=0.4192 train_acc=0.9749, val_acc=0.8746
Epoch 20/20


  0%|          | 0/190 [00:00<?, ?it/s]

train_loss=0.0773, val_loss=0.4248 train_acc=0.9794, val_acc=0.8724


In [56]:
# Loading pretrained model (best model)
model = BERTClass()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime can also be loaded on a CPU-only one.
best_state = torch.load(
    os.path.join(data_dir,"output","MLTC_model_state.bin"),
    map_location=device,
)
model.load_state_dict(best_state)
model = model.to(device)

In [57]:
# Score the reloaded model on the test split. eval_model returns
# (accuracy, mean batch loss); the accuracy counts every individual
# (sample, label) cell rather than whole rows. The optimizer argument is not
# used by eval_model.
test_acc, test_loss = eval_model(test_data_loader, model, optimizer)

In [58]:
test_acc

0.8831908831908832

In [59]:
from sklearn.metrics import confusion_matrix, classification_report


In [60]:
def get_predictions(model, data_loader, threshold=0.5):
    """Run `model` over `data_loader` and collect its outputs on the CPU.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)`` and
            returning raw logits.
        data_loader: iterable of batches; each batch is a dict with 'title',
            'input_ids', 'attention_mask', 'token_type_ids' and 'targets'.
        threshold: a label is predicted when its sigmoid probability is
            strictly greater than this value. The default of 0.5 gives the
            same result as the previous ``round()`` call, which also maps
            exactly 0.5 to 0.

    Returns:
        titles: list with the 'title' entries of all batches, in loader order.
        predictions: tensor of 0/1 values, one row per sample.
        prediction_probs: tensor of sigmoid probabilities, same shape.
        target_values: tensor of the batches' 'targets', same row order.

    Raises:
        ValueError: if the loader yields no batches.
    """
    model = model.eval()

    titles = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for data in data_loader:
            ids = data["input_ids"].to(device, dtype=torch.long)
            mask = data["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)
            # the loss applies the sigmoid during training, so it has to be
            # applied explicitly here
            prob_batches.append(torch.sigmoid(outputs).detach().cpu())
            target_batches.append(data["targets"].to(dtype=torch.float).detach().cpu())
            titles.extend(data["title"])

    if not prob_batches:
        raise ValueError("data_loader yielded no batches")

    # Concatenate whole batches once instead of stacking one tensor per row.
    prediction_probs = torch.cat(prob_batches, dim=0)
    target_values = torch.cat(target_batches, dim=0)
    predictions = (prediction_probs > threshold).to(prediction_probs.dtype)

    return titles, predictions, prediction_probs, target_values


In [61]:
# Collect the texts, thresholded predictions, sigmoid probabilities and
# ground-truth labels for the test split.
titles, predictions, prediction_probs, target_values = get_predictions(model, test_data_loader)

In [62]:
predictions.numpy()

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [63]:
# Per-label precision / recall / F1 on the test split.
# zero_division=0 reports 0 for labels that have no predicted (or no true)
# samples, which is the value the default setting produces as well, but
# without flooding the output with undefined-metric warnings.
print(classification_report(
    target_values.numpy(),
    predictions.numpy(),
    target_names=target_list,
    zero_division=0,
))

                        precision    recall  f1-score   support

                 0ther       0.00      0.00      0.00        21
                  Blog       0.00      0.00      0.00        20
Commercial/promotional       0.70      0.29      0.41        73
      Content delivery       0.00      0.00      0.00        25
         Entertainment       1.00      0.10      0.17        21
         Error message       0.47      0.44      0.46       104
               Gateway       0.56      0.25      0.35        91
                 Index       0.00      0.00      0.00        40
           Informative       0.86      0.11      0.19        57
          Journalistic       1.00      0.07      0.12        15
              Personal       0.00      0.00      0.00        14
            Scientific       0.00      0.00      0.00        16
              Shopping       0.00      0.00      0.00        14

             micro avg       0.55      0.19      0.29       511
             macro avg       0.35     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
