In [1]:
!pip install torchsummary
!pip install Unidecode

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1
[0m

In [2]:
import os
import numpy as np
import pandas as pd
import re
import csv
from bs4 import BeautifulSoup
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm
import codecs
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler, random_split
from torchsummary import summary
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
from sklearn import metrics
import random
import time
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from unidecode import unidecode

In [3]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe','script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'["\|\n|\r|\n\r]+',' ', stripped_text)
    return stripped_text

def remove_html(text):
    text = re.sub(r'https?:\/\/\S*', ' ', str(text), flags=re.MULTILINE)
    return text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii','ignore').decode('utf-8','ignore')
    return text

def remove_special_characters(text):
    text = re.sub(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿ/]+',' ',text)
    return text

def remove_stopwords_1(text):
    
    stop_file_path = '/kaggle/input/offline-libs/stop-words-2018.7.23/stop-words-2018.7.23/stop_words/stop-words/italian.txt'
   
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        stopword_list = list(frozenset(stop_set))
        stopword_list.append('rt')
        stopword_list.append('cè')
        stopword_list.append('via')
        stopword_list.append('d')
    
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    
    return filtered_text

def remove_stopwords_2(text):
    
    stop_file_path = '/kaggle/input/stopword-lists-for-19-languages/italianST.txt' 
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        stopword_list = list(frozenset(stop_set))

    
    tokenizer = ToktokTokenizer() 
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    
    return filtered_text

def remove_mentions(text):
    processed_text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", ' ', text)
    processed_text = ' '.join(processed_text.split())
    return processed_text
def remove_accents(text):
    processed_text = unidecode(text)
    return processed_text

def preprocessing_text(text_arr):
    preprocessed_text = []
    idx = 0
    for text in text_arr:
        text = remove_html(text)
        text = remove_html_tags(text) 
        text = remove_mentions(text)
        text = remove_special_characters(text) 
        text = text.lower() 
        text = remove_stopwords_1(text) 
        text = remove_stopwords_2(text)
        text = remove_accents(text)
        preprocessed_text.append(text)
        idx+=1
    print('Data Preprocessing finished.')
    return preprocessed_text

In [4]:
data_binary='/kaggle/input/tweets-binary-v2/tweets-for-binary-class.csv'
df_binary=pd.read_csv(data_binary, index_col=0, encoding='ISO 8859-1')
titles=df_binary.loc[:,"text"].values
labels = df_binary.iloc[:,8:9].values

In [5]:
text_cleaned = preprocessing_text(titles)
df_text_cleaned = pd.DataFrame({"cleaned_text": text_cleaned})



Data Preprocessing finished.


In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [7]:
max_length = 0

for sentence in text_cleaned:

    input_ids = tokenizer.encode(sentence, add_special_tokens=True)
    max_length = max(max_length, len(input_ids))

print('Max sentence length: ', max_length)

Max sentence length:  73


In [8]:
input_ids = []
attention_masks = []

In [9]:
for sentence in titles:
    inputs = tokenizer.encode_plus(
        sentence,  # sequence to be encoded
        add_special_tokens=True,  # to encode the sequences with the special tokens
        padding='max_length',  # activates and controls padding
        truncation=True,  # activates and controls truncation
        max_length=max_length,  # controls the maximum length to use       
        return_attention_mask=True,  # whether to return the attention mask
        return_tensors='pt'  # to return PyTorch torch.Tensor objects        
    )
    #list of token ids to be fed to a model.
    input_ids.append(inputs['input_ids'])
    # list of indices specifying which tokens should be considered to by the model 
    attention_masks.append(inputs['attention_mask'])

In [10]:
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [11]:
# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 16

In [12]:
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80% training set 10% validation set 10% testing set
train_size = int(0.8 * len(dataset))
validation_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - validation_size

train_dataset, validation_dataset, test_dataset = random_split(dataset, [train_size, validation_size, test_size])

In [13]:
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            validation_dataset,
            sampler = SequentialSampler(validation_dataset),
            batch_size = batch_size
        )
test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = batch_size
        )

In [14]:
loss_fn = nn.BCELoss()

In [15]:
class BertClassifier(nn.Module):

    def __init__(self, freeze_bert=True):

        super(BertClassifier, self).__init__()
        
        D_in, D_out = 768, 1

       
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
       
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(D_in, D_out),
            nn.Sigmoid()
        )

        # freeze parameters
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):

        # feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # extract the last hidden state for classification task
        pooled_output = torch.mean(outputs.last_hidden_state,1)

        # feed input to classifier
        logits = self.classifier(pooled_output)        

        return logits

In [16]:
def set_seed(seed_value=42):

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

In [17]:
def initialize_model(epochs):

    bert_classifier = BertClassifier(freeze_bert=True)    
    bert_classifier


    optimizer = AdamW(bert_classifier.parameters(),
                      lr=1e-4,    # learning rate
                      eps=1e-8    # epsilon value
                      )

    # total number of training steps
    total_steps = len(train_dataloader) * epochs

    # set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [18]:
def calculate_metrics(pred, target):
    return {'precision': metrics.precision_score(y_true=target, y_pred=pred),

            'recall': metrics.recall_score(y_true=target, y_pred=pred),

            'f1': metrics.f1_score(y_true=target, y_pred=pred)          

            }


In [19]:
def train(model, train_dataloader, validation_dataloader=None, epochs=5, evaluation=True):

    print("Start training...\n")
    for epoch_i in range(epochs):

        print(
            f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Precision':^9} | {'Val Recall':^9} | {'Val F1':^9} | {'Elapsed':^9}")
        print("-" * 100)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            loss=loss_fn(logits, b_labels.type(torch.float))
            # accumulate the loss values
            batch_loss += loss.item()
            total_loss += loss.item()
            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(
                    f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12f} | {'-':^10} | {'-':^13} | {'-':^10} | {'-':^9} | {time_elapsed:^9f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-" * 100)

        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_precision, val_recall, val_f1 = evaluate(model, validation_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch

            # Print the header of the result table

            print("-" * 100)
            print(
                f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_precision:^13f} | {val_recall:^10.2f} | {val_f1:^9.2f} | {time_elapsed:^9.2f}")
            print("-" * 100)
        print("\n")

    print("Training complete!")

    # saving the model
    output_dir = '/kaggle/working/'

    # Create output directory if needed
    if not os.path.exists(output_dir):
      os.makedirs(output_dir)

    print("Saving model to %s" % output_dir)

    model.bert.save_pretrained(output_dir)

In [20]:
def evaluate(model, val_dataloader):

    model.eval()

    # Tracking variables
    val_precision = []
    val_recall = []
    val_f1 = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels.type(torch.float))
        val_loss.append(loss.item())

        # Get the predictions
        logits_arr = logits.cpu().numpy()
        logits_arr[logits_arr >= 0.5] = 1
        logits_arr[logits_arr < 0.5] = 0

        b_labels_arr = b_labels.cpu().numpy()
        # Calculate the accuracy rate
        precision_micro = metrics.precision_score(y_true=b_labels_arr, y_pred=logits_arr, average='micro')
        val_precision.append(precision_micro)
        recall_micro = metrics.recall_score(y_true=b_labels_arr, y_pred=logits_arr, average='micro')
        val_recall.append(recall_micro)
        f1_micro = metrics.f1_score(y_true=b_labels_arr, y_pred=logits_arr, average='micro')
        val_f1.append(f1_micro)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_precision = np.mean(precision_micro)
    val_recall = np.mean(recall_micro)
    val_f1 = np.mean(f1_micro)

    return val_loss, val_precision, val_recall, val_f1


In [21]:
def bert_predict(model, test_dataloader):

    model.eval()

    all_logits = []

    # For each batch in test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits.type(torch.float))
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = all_logits.cpu().numpy()

    return probs

In [22]:
set_seed(42)
bert_classifier, optimizer, scheduler = initialize_model(epochs=5)
train(bert_classifier, train_dataloader, validation_dataloader, epochs=5, evaluation=True)

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  | Val Precision | Val Recall |  Val F1   |  Elapsed 
----------------------------------------------------------------------------------------------------
   1    |   20    |   0.701060   |     -      |       -       |     -      |     -     | 38.671972
   1    |   40    |   0.708277   |     -      |       -       |     -      |     -     | 36.344991
   1    |   60    |   0.691409   |     -      |       -       |     -      |     -     | 36.564831
   1    |   80    |   0.687338   |     -      |       -       |     -      |     -     | 36.164650
   1    |   94    |   0.678710   |     -      |       -       |     -      |     -     | 25.094631
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
   1    |    -    |   0.694365   |  0.681098  |   0.692308    |    0.69    |   0.69 

In [23]:
test_labels = np.array([])
for i, data in enumerate(test_dataloader):
    test_labels = np.append(test_labels, data[2].numpy())
test_labels = test_labels.reshape(191,1)
test_labels.shape

(191, 1)

In [24]:
probs = bert_predict(bert_classifier, test_dataloader)

# Get predictions from the probabilities
threshold = 0.5
preds = np.where(probs > threshold, 1, 0)

results = calculate_metrics(pred = preds,target = test_labels.astype("int"))
print(f'Precision: {results["precision"]:.4f}')
print(f'Recall: {results["recall"]:.4f}')
print(f'F1: {results["f1"]:.4f}')


csvFile = open('tweets-binary-prediction-results-v3.csv', 'w')
csvWriter = csv.writer(csvFile)
for i in preds:
    csvWriter.writerow(i)
csvFile.close()


Precision: 0.7215
Recall: 0.5588
F1: 0.6298
