# This notebook evaluates the fine-tuned depression-detection classifier (a RoBERTa backbone with a small feed-forward head) on a small hand-labelled test set.

In [1]:
from torch import nn
from transformers import BertConfig, BertModel
import pandas as pd

class BertClassifier(nn.Module):
    """Pretrained transformer encoder followed by a small feed-forward head.

    The encoder's pooled output (768 features) is reduced 768 -> 192 -> 64 -> 2.
    GELU and dropout follow each hidden layer, and an element-wise sigmoid is
    applied to the two output units.

    Args:
        dropout: Dropout probability applied to the pooled output and after
            each hidden layer.
        pretrained_name: Checkpoint identifier handed to
            ``AutoModel.from_pretrained``. Defaults to the RoBERTa
            depression-detection checkpoint used throughout this notebook.
    """

    def __init__(self, dropout=0.1,
                 pretrained_name="paulagarciaserrano/roberta-depression-detection"):
        super().__init__()

        # Imported locally: the cell that defines this class never imports
        # AutoModel, so instantiating it on a fresh kernel raised NameError.
        from transformers import AutoModel

        # Attribute names and registration order are kept as-is so that
        # previously pickled models / state dicts remain loadable.
        self.bert = AutoModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 192)
        self.linear1 = nn.Linear(192, 64)
        self.linear2 = nn.Linear(64, 2)
        self.gelu = nn.GELU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        """Return two sigmoid scores per example.

        Args:
            input_id: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded to the encoder as ``attention_mask``.

        Returns:
            Tensor whose last dimension has size 2. Each value is squashed
            independently by a sigmoid, so the two scores of a row are not
            guaranteed to sum to 1.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) feeds the classification head.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        hidden = self.dropout(pooled_output)

        # First hidden layer: 768 -> 192.
        hidden = self.dropout(self.gelu(self.linear(hidden)))

        # Second hidden layer: 192 -> 64.
        hidden = self.dropout(self.gelu(self.linear1(hidden)))

        # Output layer: 64 -> 2, then element-wise sigmoid.
        return self.sigmoid(self.linear2(hidden))

In [2]:
import torch
from torch import nn
from transformers import BertConfig, BertModel
# SECURITY: torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints that come from a trusted source.
# The file holds a whole pickled model object (presumably a BertClassifier), so
# that class must already be defined in this kernel before loading.
# map_location lets a checkpoint that was saved on a GPU machine load on a
# CPU-only one; with CUDA available the original placement is kept.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects whole-model pickles — pass weights_only=False there if loading fails.
model = torch.load(
    'model_BERT_Trained.pkl',
    map_location=None if torch.cuda.is_available() else 'cpu',
)

In [4]:
import torch
import numpy as np
from transformers import BertTokenizer
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm


# Tokenizer for the same pretrained checkpoint that the classifier uses as its
# encoder (a base model that was already trained on a depression dataset).
tokenizer = AutoTokenizer.from_pretrained("paulagarciaserrano/roberta-depression-detection")

# Maps raw values of the `target` column to class indices (identity for 0 and 1).
labels = {0: 0, 1: 1}

class Dataset(torch.utils.data.Dataset):
    """Eagerly tokenised text/label pairs for the classifier.

    Every text is tokenised once in ``__init__`` using the module-level
    ``tokenizer``; every target is translated through the module-level
    ``labels`` mapping.

    Args:
        df: Table exposing a ``text`` column and a ``target`` column.
        max_length: Length each text is padded or truncated to. Defaults to
            64, the value this notebook previously hard-coded.

    Raises:
        KeyError: If a ``target`` value is not a key of ``labels``.
    """

    def __init__(self, df, max_length=64):
        self.labels = [labels[label] for label in df['target']]

        # Pad/truncate every text to max_length and return PyTorch tensors.
        # NOTE(review): evaluate() squeezes dim 1 after batching, presumably
        # because each single-text encoding keeps a leading dimension of 1.
        self.texts = [
            tokenizer(text,
                      padding='max_length', max_length=max_length, truncation=True,
                      return_tensors="pt")
            for text in tqdm(df['text'])
        ]

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` as a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def classes(self):
        """Return the full list of mapped labels."""
        return self.labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, label)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [105]:
def evaluate(model, test_data, batch_size=64, verbose=True):
    """Run ``model`` over ``test_data``, print the accuracy and return the scores.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of per-class scores per example.
        test_data: Table with ``text`` and ``target`` columns; it is wrapped
            in ``Dataset`` before batching.
        batch_size: Number of examples per forward pass.
        verbose: When true (the default, matching the previous behaviour),
            print the raw score tensor of every batch.

    Returns:
        A list holding one score tensor per batch, left on the device the
        evaluation ran on.
    """
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # A model restored from a pickle keeps whatever train/eval flag it was
    # saved with. Force eval mode so dropout is disabled while scoring, and
    # restore the caller's mode afterwards.
    was_training = model.training
    model.eval()

    res = []
    total_acc_test = 0
    try:
        with torch.no_grad():
            for test_input, test_label in tqdm(test_dataloader):
                test_label = test_label.to(device)
                # Squeeze dim 1 of BOTH tensors so ids and mask reach the model
                # with the same shape (previously only the ids were squeezed).
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                if verbose:
                    print(output)
                res.append(output)

                # The predicted class is the index of the larger score.
                total_acc_test += (output.argmax(dim=1) == test_label).sum().item()
    finally:
        model.train(was_training)

    n_examples = len(test_data)
    # Guard against an empty frame instead of raising ZeroDivisionError.
    accuracy = total_acc_test / n_examples if n_examples else float('nan')
    print(f'Test Accuracy: {accuracy: .3f}')
    return res

In [102]:
def custom_standardization(input_data):
    """Clean one text with ``preprocessor.clean`` and return it lowercased.

    Args:
        input_data: The raw text to normalise.

    Returns:
        The cleaned text converted to lower case.
    """
    # The import stays inside the function body, exactly as before, so the
    # function carries its own dependency wherever it is executed.
    import preprocessor as text_cleaner
    return text_cleaner.clean(input_data).lower()

In [103]:
# Hand-written smoke-test sentences. `target` is the expected class; going by
# the target_names in the commented-out classification report at the end of the
# notebook, 0 stands for Non-Suicidal and 1 for Suicidal.
test = pd.DataFrame(
    data=[
        ("it's a good day", 0),
        ("Let's all die", 0),
        ("How to hang myself", 1),
        ("How to tie a knot", 0),
        ("How to tie a knot to hang myself", 1),
        ("I wanna die", 1),
        ("I think it's all useless", 1),
        ("Please somebody help me", 1),
        ("LOL", 0),
    ],
    columns=['text', 'target'],
)

# Alternative input kept for reference: the Twitter test split, with its 0/1 labels swapped.
# pd.read_csv('Dataset/Twitter/test.csv', encoding = "ISO-8859-1", usecols = ["Sentiment","SentimentText"]).rename(columns = {'Sentiment': 'target', 'SentimentText': 'text'})
# test['target'] = np.where(test['target']==1, 0, 1)
print(len(test))
test.head()

9


Unnamed: 0,text,target
0,it's a good day,0
1,Let's all die,0
2,How to hang myself,1
3,How to tie a knot,0
4,How to tie a knot to hang myself,1


In [88]:
# Work on a copy: a plain `test1 = test` only aliases the frame, so the in-place
# text preprocessing applied to test1 later would silently rewrite `test` too.
test1 = test.copy()  # [789000:-789000]

In [89]:
# Class balance check: number of rows per target value.
test1['target'].value_counts()

1    5
0    4
Name: target, dtype: int64

In [90]:
# Set up pandarallel so the `parallel_apply` call in a later cell is available;
# progress_bar=True asks it to display progress while it runs.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 14 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [91]:
def custom_standardization(input_data):
    """Clean one text with ``preprocessor.clean`` and return it lowercased.

    This is a re-definition with the same body as the function of the same
    name defined in cell ``In [102]``; whichever cell runs last wins.

    Args:
        input_data: The raw text to normalise.

    Returns:
        The cleaned text converted to lower case.
    """
    # Function-scope import, as in the original definition.
    import preprocessor as text_cleaner
    cleaned_text = text_cleaner.clean(input_data)
    return cleaned_text.lower()

In [92]:
# Clean and lowercase every text via pandarallel's parallel_apply.
# NOTE(review): this overwrites test1["text"] in place, so the raw texts are no
# longer available from test1 once this cell has run.
test1["text"] = test1["text"].parallel_apply(custom_standardization)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

In [93]:
# Serial alternative to the parallel_apply cell above (kept for reference):
# test1['text'] = test1['text'].apply(custom_standardization)

In [106]:
# Score the preprocessed frame; `res` is the list of per-batch score tensors
# returned by evaluate(), which also prints the test accuracy.
res = evaluate(model, test1)

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([[1.0000e+00, 2.0163e-06],
        [6.7327e-01, 4.0293e-01],
        [1.0676e-03, 9.9893e-01],
        [1.0000e+00, 2.9002e-06],
        [8.8372e-04, 9.9915e-01],
        [2.0390e-06, 1.0000e+00],
        [1.4111e-06, 1.0000e+00],
        [2.4395e-06, 1.0000e+00],
        [1.0000e+00, 5.5374e-06]], device='cuda:0')
Test Accuracy:  1.000


In [131]:
# Flatten the per-batch score tensors returned by evaluate() into one list of
# [class0_score, class1_score] rows. All batches are concatenated (previously
# only res[0], the first batch, was exported), so test sets larger than one
# batch are written out in full. An empty result yields an empty list.
rows = torch.cat(res).cpu().tolist() if res else []

In [136]:
# Persist the two score columns to disk; index=False keeps the row index out of the file.
pd.DataFrame(rows, columns = ['predicted_class0', 'predicted_class1']).to_csv('model_output_test.csv', index = False)

In [137]:
# Read the scores back from the CSV written in the previous cell.
# Note that y_pred holds two score columns per row, not predicted class labels.
y_pred = pd.read_csv('model_output_test.csv')

In [138]:
# Quick look at the first rows of the reloaded scores.
y_pred.head()

Unnamed: 0,predicted_class0,predicted_class1
0,1.0,2e-06
1,0.673265,0.402935
2,0.001068,0.998934
3,0.999996,3e-06
4,0.000884,0.999146


In [139]:
# Put each input row next to its scores. axis=1 concatenation aligns rows by
# index label — assumes both frames carry the same default 0..n-1 index; TODO confirm
# if `test` is ever filtered or re-indexed.
pd.concat([test, y_pred], axis = 1)

Unnamed: 0,text,target,predicted_class0,predicted_class1
0,it's a good day,0,1.0,2e-06
1,Let's all die,0,0.673265,0.402935
2,How to hang myself,1,0.001068,0.998934
3,How to tie a knot,0,0.999996,3e-06
4,How to tie a knot to hang myself,1,0.000884,0.999146
5,I wanna die,1,2e-06,0.999998
6,I think it's all useless,1,1e-06,1.0
7,Please somebody help me,1,2e-06,0.999999
8,LOL,0,0.999995,6e-06


In [140]:
# Ground-truth labels, for the (currently commented-out) sklearn metrics below.
y_test = test1['target']

In [141]:
from sklearn.metrics import classification_report, confusion_matrix

In [143]:
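# NOTE: y_pred (read from model_output_test.csv) holds two score columns per row, not class labels;
# convert it to labels first (e.g. y_pred.values.argmax(axis=1)) before passing it to these metrics.
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred, digits = 4, target_names = ['Non-Suicidal', 'Suicidal']))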