In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import torch
from torch import nn
from dataclasses import dataclass

from transformers import DistilBertModel, DistilBertTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from tqdm.notebook import tqdm
from huggingface_hub import PyTorchModelHubMixin

import warnings
warnings.filterwarnings("ignore")

In [2]:
from torch import cuda

# Run on the GPU whenever PyTorch can see one; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# Subject name -> integer class id, numbered in the order listed below.
# NOTE: this name shadows Python's built-in `map`; it is kept because other
# cells in the notebook look the dictionary up under this exact name.
map = {
    subject: class_id
    for class_id, subject in enumerate(
        ('biology', 'chemistry', 'computer', 'maths', 'physics', 'social sciences')
    )
}

In [4]:
# Load the test split, then display the number of missing values per column
# (the last expression of the cell is what the notebook renders).
test_csv_path = r'../../data/final/final_test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.isnull().sum()

text     0
label    0
dtype: int64

# Testing on Models

Every model below is evaluated only on the test split loaded above (`test_df`).

## Zero-Shot Classifier

In [5]:
# Zero-shot baseline: the pipeline scores every candidate label against each
# text using a pretrained checkpoint, with no training on our data.
zero_shot_checkpoint = "typeform/distilbert-base-uncased-mnli"
tokenizer = AutoTokenizer.from_pretrained(zero_shot_checkpoint)
classifier = pipeline(
    'zero-shot-classification',
    model=zero_shot_checkpoint,
    tokenizer=tokenizer,
    # Reuse the device selected earlier in the notebook instead of a
    # hard-coded 'cuda:0', so the cell also runs on CPU-only machines.
    device=device,
)
# The candidate labels are the subject names, in class-id order.
candidate_labels = list(map.keys())

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [6]:
import time

# Run the zero-shot classifier over the test texts in fixed-size chunks and
# collect one result dict per text in `results`.
results = []
CHUNK_SIZE = 256
# Iterate over chunk start offsets. Unlike `len // CHUNK_SIZE + 1`, this never
# yields an empty trailing chunk when the row count is an exact multiple of
# CHUNK_SIZE (which would call the classifier with an empty list).
for start in tqdm(range(0, len(test_df), CHUNK_SIZE)):
    # `.iloc` makes the positional slicing explicit, independent of the index.
    descr = test_df['text'].iloc[start:start + CHUNK_SIZE].to_list()
    res = classifier(descr, candidate_labels, truncation=True)
    # Defensive: if the pipeline hands back a bare dict for a single-text
    # chunk, wrap it so `extend` does not add the dict's keys instead.
    if isinstance(res, dict):
        res = [res]
    results.extend(res)

  0%|          | 0/423 [00:00<?, ?it/s]

In [7]:
# Turn the raw pipeline output into a frame with one predicted class per text.
# The first entry of each `labels` / `scores` list is taken as the prediction
# (assumes the pipeline lists candidates best-first, as the displayed scores
# suggest).
results_pd = pd.DataFrame(results).rename(columns={'sequence': 'text'})
results_pd['label'] = [map[ranked_names[0]] for ranked_names in results_pd['labels']]
results_pd['score'] = [ranked_scores[0] for ranked_scores in results_pd['scores']]
results_pd.head()

Unnamed: 0,text,labels,scores,label,score
0,Getting Started This chapter will be about get...,"[computer, maths, social sciences, chemistry, ...","[0.38398057222366333, 0.17693284153938293, 0.1...",2,0.383981
1,If you are a graphic or web designer and want ...,"[computer, social sciences, biology, chemistry...","[0.23922796547412872, 0.16352412104606628, 0.1...",2,0.239228
2,This approach is very common because it is so ...,"[computer, social sciences, maths, biology, ch...","[0.26167187094688416, 0.17918966710567474, 0.1...",2,0.261672
3,"To deal with this problem, Centralized V...","[chemistry, physics, maths, computer, biology,...","[0.17761172354221344, 0.17454923689365387, 0.1...",1,0.177612
4,"However, this setup also has some serious down...","[computer, maths, social sciences, chemistry, ...","[0.8039895296096802, 0.0479842834174633, 0.046...",2,0.80399


In [8]:
# Report per-class metrics for the zero-shot predictions.
# Class ids and their display names are passed together, ordered by id, so the
# rows are labelled correctly regardless of dict ordering and the report still
# works if some class happens to be absent from the data.
class_names = sorted(map, key=map.get)
print(classification_report(
    test_df['label'],
    results_pd['label'],
    labels=[map[name] for name in class_names],
    target_names=class_names,
))

                 precision    recall  f1-score   support

        biology       0.45      0.16      0.24     15988
      chemistry       0.52      0.47      0.49     20678
       computer       0.25      0.36      0.29      8754
          maths       0.22      0.25      0.23     26661
        physics       0.25      0.25      0.25     10306
social sciences       0.26      0.30      0.28     25695

       accuracy                           0.30    108082
      macro avg       0.32      0.30      0.30    108082
   weighted avg       0.32      0.30      0.30    108082



## Custom Model

In [9]:
# Identity lookup from the integer class ids stored in the data to the ids
# used for training/evaluation (six classes, 0..5).
# The original author notes that the base model used for re-training was
# itself trained on a depression dataset (not verifiable from this notebook).
labels = {class_id: class_id for class_id in range(6)}
# NOTE(review): this is the *cased* tokenizer, while BertClassifier loads the
# *uncased* DistilBERT encoder. Presumably the saved weights were trained with
# this exact pairing - confirm before changing either side.
# This also rebinds `tokenizer`, which the zero-shot cell assigned earlier.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes every text of a DataFrame up front.

    Relies on two notebook-level globals: `labels` (class-id lookup dict) and
    `tokenizer` (callable that encodes one text into model inputs).
    """

    def __init__(self, df):
        """Eagerly encode the `label` and `text` columns of `df`.

        Args:
            df: frame with a `label` column whose values are keys of the
                global `labels` dict and a `text` column (presumably strings).
        """
        encoded_labels = [labels[raw_label] for raw_label in df['label']]
        self.labels = encoded_labels

        # Each text is padded/truncated to exactly 512 tokens and returned as
        # PyTorch tensors; tqdm shows progress since this runs once per row.
        encoded_texts = []
        for text in tqdm(df['text']):
            encoded_texts.append(
                tokenizer(text,
                          padding='max_length', max_length=512, truncation=True,
                          return_tensors="pt")
            )
        self.texts = encoded_texts

    def get_batch_labels(self, idx):
        """Return the label(s) stored at `idx`, wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the pre-tokenized input(s) stored at `idx`."""
        return self.texts[idx]

    def classes(self):
        """Return the full list of encoded labels."""
        return self.labels

    def __len__(self):
        """Number of samples, i.e. the number of encoded labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the `(tokenized_text, label_array)` pair for `idx`."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [10]:
class BertClassifier(nn.Module, PyTorchModelHubMixin):
    """DistilBERT encoder followed by a small feed-forward classification head.

    The head maps 768 -> 192 -> 64 -> 6 with GELU and dropout after each of
    the first two linear layers. The final layer's output is returned as-is
    (raw logits; no sigmoid/softmax is applied inside the model).
    """

    def __init__(self, config: dict, dropout=0.1):
        """Build the encoder and the classification head.

        Args:
            config: accepted by the constructor but not read in its body.
            dropout: drop probability of the single dropout module that is
                reused at every dropout step in `forward`.
        """
        super().__init__()

        # Attribute names double as state-dict keys, so they must stay stable
        # for previously saved weights to load.
        self.bert = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 192)
        self.linear1 = nn.Linear(192, 64)
        self.linear2 = nn.Linear(64, 6)
        self.gelu = nn.GELU()

    def forward(self, input_id, mask):
        """Return one row of 6 class scores per input sequence.

        Args:
            input_id: token ids, forwarded to the encoder as `input_ids`.
            mask: attention mask, forwarded as `attention_mask`.
        """
        encoder_out = self.bert(input_ids=input_id, attention_mask=mask)
        # Element 0 of the encoder output holds the per-token hidden states;
        # the vector at sequence position 0 is used as the sequence summary.
        summary_vector = encoder_out[0][:, 0]
        hidden = self.dropout(summary_vector)

        # First hidden layer of the head: linear -> GELU -> dropout.
        hidden = self.dropout(self.gelu(self.linear(hidden)))

        # Second hidden layer of the head: linear -> GELU -> dropout.
        hidden = self.dropout(self.gelu(self.linear1(hidden)))

        # Output projection to the six class scores.
        return self.linear2(hidden)

In [11]:
# Evaluate the fine-tuned classifier on the test split and collect the
# predicted class id of every row in `res`.
test = Dataset(test_df)
test_dataloader = torch.utils.data.DataLoader(test, batch_size = 256)
reverse_map = {value: key for key, value in map.items()}
# NOTE(review): "problem_type" says multi-label, yet predictions below are a
# single argmax per row - confirm which one is intended.
config = {
    "id2label": reverse_map,
    "label2id": map,
    "problem_type": "multi_label_classification",
    "torch_dtype": "float32",
    "transformers_version": "4.38.1",
    "use_cache": True
}

model = BertClassifier(config=config)
# `map_location` lets a checkpoint saved on GPU load on a CPU-only machine.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(r'../../models/distilbert_model', map_location=device))
# Follow the notebook-wide `device` instead of unconditionally calling .cuda().
model = model.to(device)
# Switch to inference mode: without this the head's dropout stays active and
# predictions become random and non-reproducible.
model.eval()
res = []


with torch.no_grad():
    # Labels from the loader are not needed here; the report cell reads them
    # straight from `test_df`.
    for test_input, _test_label in tqdm(test_dataloader):
        # Drop the singleton dimension at position 1 on both tensors so the
        # ids and the mask reach the model with matching shapes.
        mask = test_input['attention_mask'].squeeze(1).to(device)
        input_id = test_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)
        # Highest-scoring class per row, moved back to plain Python ints.
        res.extend(output.argmax(dim=1).cpu().tolist())

  0%|          | 0/108082 [00:00<?, ?it/s]

  0%|          | 0/423 [00:00<?, ?it/s]

In [12]:
# Report per-class metrics for the fine-tuned model's predictions.
# Class ids and their display names are passed together, ordered by id, so the
# rows are labelled correctly regardless of dict ordering and the report still
# works if some class happens to be absent from the data.
class_names = sorted(map, key=map.get)
print(classification_report(
    test_df['label'],
    res,
    labels=[map[name] for name in class_names],
    target_names=class_names,
))

                 precision    recall  f1-score   support

        biology       0.98      0.99      0.99     15988
      chemistry       1.00      0.99      0.99     20678
       computer       1.00      0.99      0.99      8754
          maths       1.00      1.00      1.00     26661
        physics       0.99      0.98      0.99     10306
social sciences       0.99      1.00      0.99     25695

       accuracy                           0.99    108082
      macro avg       0.99      0.99      0.99    108082
   weighted avg       0.99      0.99      0.99    108082

