In [1]:
!pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1
[0m

In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel
from transformers import BertTokenizer
from torchsummary import summary
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn import metrics
import random
import time



data for training

In [3]:
data_to_train = '/kaggle/input/dir-processed/DIR_processed.csv'
df_dir = pd.read_csv(data_to_train)

titles = df_dir.processed_title.values
categories = df_dir.columns.values[-9:-2]
labels = df_dir.iloc[:,-9:-2].values

data for classification task

In [4]:
data_to_classify = '/kaggle/input/output-relevant-tweets/tweets-22-v3.csv'
#data_to_classify = '/kaggle/input/tweets-binary-v2/tweets-for-binary-class.csv'

df = pd.read_csv(data_to_classify, index_col=0, encoding='ISO 8859-1')
text_cleaned = df.loc[:,"text"].values

text_cleaned = [x for x in text_cleaned if str(x) != 'nan']
df_text_cleaned = pd.DataFrame({"cleaned_text": text_cleaned})
df_text_cleaned.cleaned_text.replace('', np.nan, inplace=True)
df_text_cleaned = df_text_cleaned.drop_duplicates().dropna().reset_index(drop=True)
text_cleaned = df_text_cleaned.cleaned_text.values

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

maximum length of first data source for training

In [6]:
#finding max length
max_length = 0
for sentence in titles:

    input_ids = tokenizer.encode(sentence, add_special_tokens=True)
    max_length = max(max_length, len(input_ids))

print('Max sentence length: ', max_length)

Max sentence length:  49


maximum length overall

In [7]:
for sentence in text_cleaned:

    input_ids = tokenizer.encode(sentence, add_special_tokens=True)
    max_length = max(max_length, len(input_ids))

print('Max sentence length: ', max_length)

Max sentence length:  79


In [8]:
input_ids = []
attention_masks = []

In [9]:
for sentence in titles:
    inputs = tokenizer.encode_plus(
        sentence,  # sequence to be encoded
        add_special_tokens=True,  # to encode the sequences with the special tokens
        padding='max_length',  # activates and controls padding
        truncation=True,  # activates and controls truncation
        max_length=max_length,  # controls the maximum length to use       
        return_attention_mask=True,  # whether to return the attention mask
        return_tensors='pt'  # to return PyTorch torch.Tensor objects        
    )
    #list of token ids to be fed to a model.
    input_ids.append(inputs['input_ids'])
    # list of indices specifying which tokens should be considered to by the model 
    attention_masks.append(inputs['attention_mask'])

In [10]:
input_ids_test = []
attention_masks_test = []

In [11]:
for sentence in text_cleaned:
    inputs = tokenizer.encode_plus(
        sentence,  # sequence to be encoded
        add_special_tokens=True,  # to encode the sequences with the special tokens
        padding='max_length',  # activates and controls padding
        truncation=True,  # activates and controls truncation
        max_length=max_length,  # controls the maximum length to use       
        return_attention_mask=True,  # whether to return the attention mask
        return_tensors='pt'  # to return PyTorch torch.Tensor objects        
    )
    #list of token ids to be fed to a model.
    input_ids_test.append(inputs['input_ids'])
    # list of indices specifying which tokens should be considered to by the model 
    attention_masks_test.append(inputs['attention_mask'])

In [12]:
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

input_ids_test = torch.cat(input_ids_test, dim=0)
attention_masks_test = torch.cat(attention_masks_test, dim=0)

In [13]:
#the batches recommended from the authors are: 16 and 32
batch_size = 16

In [14]:
# dataset for training
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80% training set 20% validation set
train_size = int(0.8 * len(dataset))
validation_size = len(dataset) - train_size

train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])

In [15]:
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            validation_dataset,
            sampler = SequentialSampler(validation_dataset),
            batch_size = batch_size
        )


In [16]:
dataset = TensorDataset(input_ids_test, attention_masks_test)
test_dataloader = DataLoader(
            dataset,
            sampler = RandomSampler(dataset),
            batch_size = batch_size
        )

In [17]:
loss_fn = nn.BCELoss()

In [18]:
class BertClassifier(nn.Module):
    def __init__(self, freeze_bert=False):

        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of the classifier, and number of labels
        D_in, D_out = 768, 7
        H = 64
       
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
       
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(H, D_out),
            nn.Sigmoid()
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):

        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Extract the last hidden state of the token `[CLS]` 
        pooled_output = torch.mean(outputs.last_hidden_state,1)

        # Feed input to classifier
        logits = self.classifier(pooled_output)        

        return logits

In [19]:
def set_seed(seed_value=42):

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

In [20]:
def initialize_model(epochs=5):

    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    
    bert_classifier

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=3e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [21]:
def calculate_metrics(pred, target):
    return {'micro/precision': metrics.precision_score(y_true=target, y_pred=pred, average='micro'),

            'micro/recall': metrics.recall_score(y_true=target, y_pred=pred, average='micro'),

            'micro/f1': metrics.f1_score(y_true=target, y_pred=pred, average='micro'),

            'macro/precision': metrics.precision_score(y_true=target, y_pred=pred, average='macro'),

            'macro/recall': metrics.recall_score(y_true=target, y_pred=pred, average='macro'),

            'macro/f1': metrics.f1_score(y_true=target, y_pred=pred, average='macro'),

            'samples/precision': metrics.precision_score(y_true=target, y_pred=pred, average='samples'),

            'samples/recall': metrics.recall_score(y_true=target, y_pred=pred, average='samples'),

            'samples/f1': metrics.f1_score(y_true=target, y_pred=pred, average='samples')
            }

In [22]:
def train(model, train_dataloader, validation_dataloader=None, epochs=5, evaluation=True):

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(
            f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Precision':^9} | {'Val Recall':^9} | {'Val F1':^9} | {'Elapsed':^9}")
        print("-" * 100)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            loss=loss_fn(logits, b_labels.type(torch.float))
            # accumulate the loss values
            batch_loss += loss.item()
            total_loss += loss.item()
            # print(loss,logits.shape)
            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(
                    f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12f} | {'-':^10} | {'-':^13} | {'-':^10} | {'-':^9} | {time_elapsed:^9f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-" * 100)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_precision, val_recall, val_f1 = evaluate(model, validation_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch

            # Print the header of the result table

            print("-" * 100)
            print(
                f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_precision:^13f} | {val_recall:^10.2f} | {val_f1:^9.2f} | {time_elapsed:^9.2f}")
            print("-" * 100)
        print("\n")

    print("Training complete!")

    # saving the model
    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    output_dir = '/kaggle/working/'

    # Create output directory if needed
    if not os.path.exists(output_dir):
      os.makedirs(output_dir)

    print("Saving model to %s" % output_dir)

    model.bert.save_pretrained(output_dir)

In [23]:
def evaluate(model, validation_dataloader):

    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_precision = []
    val_recall = []
    val_f1 = []
    val_loss = []

    # For each batch in our validation set...
    for batch in validation_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels.type(torch.float))
        val_loss.append(loss.item())

        # Get the predictions
        logits_arr = logits.cpu().numpy()
        logits_arr[logits_arr >= 0.5] = 1
        logits_arr[logits_arr < 0.5] = 0

        b_labels_arr = b_labels.cpu().numpy()
        # Calculate the accuracy rate
        precision_micro = metrics.precision_score(y_true=b_labels_arr, y_pred=logits_arr, average='micro')
        val_precision.append(precision_micro)
        recall_micro = metrics.recall_score(y_true=b_labels_arr, y_pred=logits_arr, average='micro')
        val_recall.append(recall_micro)
        f1_micro = metrics.f1_score(y_true=b_labels_arr, y_pred=logits_arr, average='micro')
        val_f1.append(f1_micro)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_precision = np.mean(precision_micro)
    val_recall = np.mean(recall_micro)
    val_f1 = np.mean(f1_micro)

    return val_loss, val_precision, val_recall, val_f1

In [24]:
def bert_predict(model, test_dataloader):

    model.eval()

    all_logits = []


    for batch in test_dataloader:
        b_input_ids, b_attn_mask = tuple(t for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits.type(torch.float))
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    probs = all_logits.cpu().numpy()

    return probs

In [25]:
set_seed(42)
bert_classifier, optimizer, scheduler = initialize_model(epochs=4)
train(bert_classifier, train_dataloader, validation_dataloader, epochs=4, evaluation=True)

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  | Val Precision | Val Recall |  Val F1   |  Elapsed 
----------------------------------------------------------------------------------------------------
   1    |   20    |   0.629210   |     -      |       -       |     -      |     -     | 124.232486
   1    |   40    |   0.554600   |     -      |       -       |     -      |     -     | 116.646772
   1    |   60    |   0.502615   |     -      |       -       |     -      |     -     | 116.634254
   1    |   80    |   0.452462   |     -      |       -       |     -      |     -     | 116.742714
   1    |   100   |   0.409353   |     -      |       -       |     -      |     -     | 116.001536
   1    |   120   |   0.411271   |     -      |       -       |     -      |     -     | 116.753007
   1    |   140   |   0.392934   |     -      |       -       |     -      |     -     | 116.672369
   1    |   160   |   0.383660   |     -      |       -       |     -      |     

In [26]:
probs = bert_predict(bert_classifier, test_dataloader)

# Get predictions from the probabilities
threshold = 0.5
preds = np.where(probs > threshold, 1, 0)

df_text_cleaned['0'] = preds[:,0]
df_text_cleaned['1'] = preds[:,1]
df_text_cleaned['2'] = preds[:,2]
df_text_cleaned['3'] = preds[:,3]
df_text_cleaned['4'] = preds[:,4]
df_text_cleaned['5'] = preds[:,5]
df_text_cleaned['6'] = preds[:,6]
df_text_cleaned.to_excel('tweets-predictions-multiclass-v3.xlsx')