In [1]:
from torch import nn
from transformers import BertConfig, BertModel
import pandas as pd

class BertClassifier(nn.Module):
    """Two-class text classifier: a pretrained encoder followed by an MLP head.

    The encoder is loaded from the
    ``paulagarciaserrano/roberta-depression-detection`` checkpoint. Its pooled
    output is fed through 768 -> 192 -> 64 -> 2 linear layers, with GELU and
    dropout after each of the first two layers and a sigmoid on the last one.

    NOTE: the attribute names (``bert``, ``linear``, ``linear1``, ...) are left
    unchanged on purpose; the notebook restores a pickled model with
    ``torch.load`` (presumably an instance of this class), and a pickled module
    is restored by attribute name.
    """

    def __init__(self, dropout=0.1):
        """Build the encoder and the classification head.

        Args:
            dropout: Drop probability of the single dropout module that is
                reused at every dropout step in ``forward``.
        """
        super().__init__()

        # This cell only imports BertConfig/BertModel, so AutoModel is imported
        # here; without it, instantiating the class on a fresh kernel raises
        # NameError.
        from transformers import AutoModel

        self.bert = AutoModel.from_pretrained("paulagarciaserrano/roberta-depression-detection")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 192)
        self.linear1 = nn.Linear(192, 64)
        self.linear2 = nn.Linear(64, 2)
        self.gelu = nn.GELU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        """Return sigmoid-activated scores for the two classes.

        Args:
            input_id: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded to the encoder as ``attention_mask``.

        Returns:
            Tensor whose last dimension has size 2, with values in [0, 1].
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        hidden = self.dropout(pooled_output)

        # First hidden layer on top of the encoder output: 768 -> 192.
        hidden = self.dropout(self.gelu(self.linear(hidden)))

        # Second hidden layer: 192 -> 64.
        hidden = self.dropout(self.gelu(self.linear1(hidden)))

        # Output layer (64 -> 2) followed by an element-wise sigmoid.
        return self.sigmoid(self.linear2(hidden))

In [2]:
import torch
from torch import nn
from transformers import BertConfig, BertModel
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints that come from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled modules; pass weights_only=False there -- confirm
# against the installed version.
# map_location lets a checkpoint saved from a GPU session load on a CPU-only
# machine; when CUDA is available the stored device placement is kept.
model = torch.load(
    'model_BERT_Trained.pkl',
    map_location=None if torch.cuda.is_available() else 'cpu',
)

In [3]:
import torch
import numpy as np
from transformers import BertTokenizer
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm


# Tokenizer for the same pretrained checkpoint that BertClassifier loads as its
# encoder: a base model already trained on a depression dataset, re-trained here.
tokenizer = AutoTokenizer.from_pretrained("paulagarciaserrano/roberta-depression-detection")

# Maps raw `target` values to class indices (identity for 0 and 1); Dataset
# looks labels up here, so any other value raises KeyError.
labels = {0: 0, 1: 1}

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes every text of a DataFrame up front.

    Depends on the module-level ``tokenizer`` and ``labels`` objects.
    """

    def __init__(self, df):
        """Encode ``df['target']`` and tokenize ``df['text']`` eagerly."""
        self.labels = list(map(labels.__getitem__, df['target']))

        # Every text is padded/truncated to max_length 64 and returned as
        # PyTorch tensors.
        tokenize_kwargs = dict(padding='max_length', max_length=64,
                               truncation=True, return_tensors="pt")
        self.texts = []
        for text in tqdm(df['text']):
            self.texts.append(tokenizer(text, **tokenize_kwargs))

    def classes(self):
        """Return the full list of encoded labels."""
        return self.labels

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [4]:
def evaluate(model, test_data):
    """Score ``model`` on ``test_data`` and print the overall accuracy.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns one row
            of class scores per example.
        test_data: DataFrame with ``text`` and ``target`` columns (it is
            wrapped in ``Dataset``).

    Returns:
        list of tensors, one per batch of up to 64 examples, each holding the
        predicted class index of every example in that batch.
    """
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=64)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Inference must run in eval mode; otherwise dropout stays active and the
    # predictions (and the reported accuracy) vary between runs. The previous
    # mode is restored afterwards so the caller's model is left as it was.
    was_training = model.training
    model.eval()

    res = []
    total_acc_test = 0
    try:
        with torch.no_grad():
            for test_input, test_label in tqdm(test_dataloader):
                test_label = test_label.to(device)
                # Assumes each tokenized example carries a leading axis of
                # size 1, so a batch arrives with an extra middle axis; drop it
                # on BOTH tensors so mask and ids have matching shapes.
                # squeeze(1) is a no-op if that axis is not of size 1.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                predictions = model(input_id, mask).argmax(dim=1)
                res.append(predictions)
                total_acc_test += (predictions == test_label).sum().item()
    finally:
        if was_training:
            model.train()

    # Guard the empty-input case instead of raising ZeroDivisionError.
    n_examples = len(test_data)
    accuracy = total_acc_test / n_examples if n_examples else float('nan')
    print(f'Test Accuracy: {accuracy: .3f}')
    return res

In [5]:
def custom_standardization(input_data):
    """Clean one text with ``preprocessor.clean`` and lowercase the result.

    The import is function-scoped, so the module is resolved wherever the
    function is executed.
    """
    from preprocessor import clean

    return clean(input_data).lower()

In [6]:
# Load only the label and text columns and give them the names the rest of the
# notebook expects.
test = pd.read_csv(
    'Dataset/Twitter/test.csv',
    encoding="ISO-8859-1",
    usecols=["Sentiment", "SentimentText"],
)
test = test.rename(columns={'Sentiment': 'target', 'SentimentText': 'text'})

# Invert the source labels: Sentiment == 1 becomes target 0, anything else 1.
is_sentiment_one = test['target'] == 1
test['target'] = np.where(is_sentiment_one, 0, 1)

print(len(test))
test.head()

1578614


Unnamed: 0,target,text
0,1,is so sad for my APL frie...
1,1,I missed the New Moon trail...
2,0,omg its already 7:30 :O
3,1,.. Omgaga. Im sooo im gunna CRy. I'...
4,1,i think mi bf is cheating on me!!! ...


In [7]:
# Work on a copy: a later cell overwrites test1["text"] in place, and with a
# plain alias that would silently rewrite `test` as well.
# To evaluate on a subset, slice before copying, e.g. test[789000:-789000].copy()
test1 = test.copy()

In [8]:
# Class balance of the evaluation labels.
test1['target'].value_counts()

0    790178
1    788436
Name: target, dtype: int64

In [9]:
from pandarallel import pandarallel
# Initialise pandarallel with progress bars enabled; must run before the
# parallel_apply call further down.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 14 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [10]:
def custom_standardization(input_data):
    """Return ``input_data`` cleaned by ``preprocessor.clean`` and lowercased.

    NOTE(review): this redefines the identical function from cell In [5];
    one of the two definitions should be removed.
    """
    import preprocessor as tweet_preprocessor

    cleaned = tweet_preprocessor.clean(input_data)
    return cleaned.lower()

In [11]:
# Clean and lowercase every text in parallel; this overwrites the `text`
# column in place.
test1["text"] = test1["text"].parallel_apply(custom_standardization)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=112759), Label(value='0 / 112759')…

In [12]:
# Serial equivalent of the parallel_apply call above, for environments without pandarallel:
# test1['text'] = test1['text'].apply(custom_standardization)

In [13]:
# Run inference over the whole test set; `res` holds one tensor of predicted
# class indices per batch.
res = evaluate(model, test1)

  0%|          | 0/1578614 [00:00<?, ?it/s]

  0%|          | 0/24666 [00:00<?, ?it/s]

Test Accuracy:  0.916


In [14]:
# Flatten the per-batch prediction tensors into one flat list of Python ints.
# Tensor.tolist() converts a whole batch in one call, instead of one .item()
# call per element across the full test set.
final = [prediction for batch in res for prediction in batch.tolist()]

In [15]:
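# One-off export of `final` to the CSV file that the next cell reads back;
# uncomment to regenerate model_output.csv from the current predictions.
# pd.DataFrame(final).to_csv('model_output.csv', index = False)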

In [16]:
# Load previously exported predictions from disk.
# NOTE(review): the export cell above is commented out, so this file is NOT
# regenerated from `final` on a fresh run -- confirm it matches the current
# model and test set.
y_pred = pd.read_csv('model_output.csv')

In [17]:
# Peek at the first rows of the loaded predictions.
y_pred.head()

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1


In [19]:
# Ground-truth labels; assumes the rows of model_output.csv are in the same
# order as test1 -- TODO confirm.
y_test = test1['target']

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

In [35]:
# Per-class precision/recall/F1 to 4 digits; class 0 is reported as
# 'Non-Suicidal' and class 1 as 'Suicidal'.
# NOTE(review): y_pred is the whole single-column DataFrame read from CSV, not
# a 1-D Series -- confirm sklearn treats it as intended.
print(classification_report(y_test, y_pred, digits = 4, target_names = ['Non-Suicidal', 'Suicidal']))

              precision    recall  f1-score   support

Non-Suicidal     0.9212    0.9105    0.9158    790178
    Suicidal     0.9114    0.9220    0.9166    788436

    accuracy                         0.9162   1578614
   macro avg     0.9163    0.9162    0.9162   1578614
weighted avg     0.9163    0.9162    0.9162   1578614

