In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle
import os
from tqdm import tqdm
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AdamW
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from transformers.activations import get_activation
import numpy as np
from torch.nn.utils.rnn import pad_sequence


## Load data

In [7]:
data_df = pd.read_csv('data/Question_Classification_Dataset.csv')

In [8]:
data_df.head()

Unnamed: 0.1,Unnamed: 0,Questions,Category0,Category1,Category2
0,0,How did serfdom develop in and then leave Russ...,DESCRIPTION,DESC,manner
1,1,What films featured the character Popeye Doyle ?,ENTITY,ENTY,cremat
2,2,How can I find a list of celebrities ' real na...,DESCRIPTION,DESC,manner
3,3,What fowl grabs the spotlight after the Chines...,ENTITY,ENTY,animal
4,4,What is the full form of .com ?,ABBREVIATION,ABBR,exp


In [9]:
le = preprocessing.LabelEncoder()
data_texts = data_df['Questions'].to_list()
labels = le.fit_transform(data_df['Category0'])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data_texts, labels, test_size=0.2, random_state=0)

## Vectorization

In [None]:
vect = TfidfVectorizer(ngram_range = (1,2)).fit(X_train)

In [None]:
len(vect.vocabulary_)

In [None]:
train_vector = vect.transform(X_train)
test_vector = vect.transform(X_test)

## SVM

In [16]:
model1 = SVC(kernel='linear', probability = True)
model1.fit(train_vector, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [17]:
pred = model1.predict(test_vector)

In [20]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.62      0.94      0.75        16
           1       0.90      0.80      0.85       257
           2       0.81      0.73      0.77       278
           3       0.85      0.90      0.87       220
           4       0.82      0.93      0.87       154
           5       0.90      0.97      0.94       166

    accuracy                           0.85      1091
   macro avg       0.82      0.88      0.84      1091
weighted avg       0.85      0.85      0.85      1091



# BERT

### Build dataloader

In [11]:
class QuestionDataset(DataLoader):
    def __init__(self, data, labels, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        text_tokenized = self.tokenizer.encode(text)
        label = self.labels[idx]
        
        return torch.tensor(text_tokenized), torch.tensor(label)

In [12]:
class PadSequenceDataset():
    
    def __init__(self, token_pad_value):
        self.token_pad_value = token_pad_value

    def __call__(self, batch):
        seqs = [x[0] for x in batch]
        seqs = pad_sequence(seqs, batch_first=True, padding_value=self.token_pad_value)
        label = torch.stack(list(map(lambda x: x[1], batch)))
        return seqs, label

In [13]:
def create_dataloader(texts, labels, pretrained_model_name=None, max_length=64, batch_size=4, shuffle=True, num_workers=0):
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
    pad_id = tokenizer.pad_token_id
    PS = PadSequenceDataset(token_pad_value=pad_id)
    dataset = QuestionDataset(texts, labels, tokenizer=tokenizer)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, collate_fn=PS, pin_memory=False)

In [14]:
MODEL_NAME = 'bert-base-uncased'

# train_loader = create_dataloader(X_train, y_train, pretrained_model_name=MODEL_NAME)
# test_loader = create_dataloader(X_test, y_test, pretrained_model_name=MODEL_NAME)

### Build Model

In [15]:
class QuestionClassifier(nn.Module):
    def __init__(self, n_classes, pretrained_model_name='bert-base-uncased'):
        super(QuestionClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
#         for name, param in self.bert.named_parameters():
#             if 'encoder' in name:
#                 param.requires_grad = False
        self.dense = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.out_proj = nn.Linear(self.bert.config.hidden_size, n_classes)
        
    def forward(self, input_ids):
        hidden_states, pooled_output = self.bert(
          input_ids=input_ids
        )
        sequence_output_cls = hidden_states[:, 0, :]
        x = self.dropout(sequence_output_cls)
        x = self.dense(x)
        x = get_activation("tanh")(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

### Training

In [16]:
def evaluate(net, test_loader):
    net.eval()
    cuda = torch.cuda.is_available()
    labels = []
    preds = []

    for i, batch in tqdm(enumerate(test_loader)):
        x, label = batch
        label = label.data
        labels.append(label)
        if cuda:
            x = x.cuda()

        prediction = net(x)
        
        prediction.to('cpu')
        pred_label = prediction.argmax(dim=1).to('cpu')
        
        preds.append(pred_label)
    
    all_labels = []
    all_preds = []
    for i in range(len(labels)):
        all_labels += labels[i].tolist()
        all_preds += preds[i].tolist()
    
    print("Testing report: ", classification_report(all_labels, all_preds))

In [17]:
def train(num_epochs=100):
    cuda = torch.cuda.is_available()
    train_loader = create_dataloader(X_train, y_train, pretrained_model_name=MODEL_NAME)
    test_loader = create_dataloader(X_test, y_test, pretrained_model_name=MODEL_NAME)
    net = QuestionClassifier(n_classes=len(set(y_train)), pretrained_model_name=MODEL_NAME)
    if cuda:
        net.cuda()
    def train_model(model, optimizer=None, loss_fn=None, scheduler=None, do_eval=False):
        model.train()
        epoch_loss = []
        all_labels = []
        all_preds = []
        for i, batch in tqdm(enumerate(train_loader)):
            x, labels = batch
            labels = labels.long()
            if cuda:
                x = x.cuda()
                labels = labels.cuda()

            prediction = model(x)
            loss = loss_fn(prediction.view(-1, 6), labels.view(-1))
            pred = prediction.argmax(dim=-1)
            all_labels.append(labels.data.to('cpu'))
            all_preds.append(pred.to('cpu'))
            
            epoch_loss.append(loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        average_loss = np.mean(epoch_loss)
        new_all_labels = []
        new_all_preds = []
        for i in range(len(all_labels)):
            new_all_labels += all_labels[i].tolist()
            new_all_preds += all_preds[i].tolist()

        print("average RE loss : ", average_loss)
        print("train_cls report: \n", classification_report(new_all_labels, new_all_preds))
        if do_eval:
            evaluate(model, test_loader)
            
    
    optimizer = torch.optim.Adam([{"params": net.parameters(), "lr": 0.00001}])
    for epoch in range(num_epochs):
        print('Epoch:', epoch)
        criteria = torch.nn.CrossEntropyLoss().cuda()
        do_eval = False
        if epoch % 5 == 0 or epoch == num_epochs - 1:
            do_eval = True
        train_model(net, optimizer=optimizer, loss_fn=criteria, scheduler=None, do_eval=do_eval)

In [18]:
train(num_epochs=10)

Epoch: 0


1091it [03:10,  5.72it/s]
3it [00:00, 20.41it/s]

average RE loss :  0.5369372202867111
train_cls report: 
               precision    recall  f1-score   support

           0       0.86      0.29      0.43        62
           1       0.75      0.83      0.79       935
           2       0.75      0.75      0.75       998
           3       0.85      0.88      0.87       988
           4       0.87      0.83      0.85       660
           5       0.95      0.86      0.90       718

    accuracy                           0.82      4361
   macro avg       0.84      0.74      0.77      4361
weighted avg       0.83      0.82      0.82      4361



273it [00:09, 27.39it/s]
1it [00:00,  6.85it/s]

Testing report:                precision    recall  f1-score   support

           0       0.94      0.62      0.75        24
           1       0.93      0.90      0.91       227
           2       0.92      0.92      0.92       252
           3       0.97      0.98      0.97       235
           4       0.96      0.98      0.97       175
           5       0.94      0.99      0.96       178

    accuracy                           0.94      1091
   macro avg       0.94      0.90      0.92      1091
weighted avg       0.94      0.94      0.94      1091

Epoch: 1


1091it [03:00,  6.03it/s]
1it [00:00,  6.80it/s]

average RE loss :  0.14250335110971188
train_cls report: 
               precision    recall  f1-score   support

           0       0.91      0.69      0.79        62
           1       0.94      0.94      0.94       935
           2       0.93      0.94      0.93       998
           3       0.98      0.97      0.98       988
           4       0.97      0.97      0.97       660
           5       0.99      0.99      0.99       718

    accuracy                           0.96      4361
   macro avg       0.95      0.92      0.93      4361
weighted avg       0.96      0.96      0.96      4361

Epoch: 2


1091it [03:01,  6.01it/s]
1it [00:00,  7.19it/s]

average RE loss :  0.06873738170216435
train_cls report: 
               precision    recall  f1-score   support

           0       0.94      0.76      0.84        62
           1       0.97      0.98      0.97       935
           2       0.98      0.98      0.98       998
           3       0.99      0.99      0.99       988
           4       0.98      0.99      0.99       660
           5       0.99      1.00      1.00       718

    accuracy                           0.98      4361
   macro avg       0.98      0.95      0.96      4361
weighted avg       0.98      0.98      0.98      4361

Epoch: 3


1091it [02:54,  6.24it/s]
1it [00:00,  6.71it/s]

average RE loss :  0.03679885823829364
train_cls report: 
               precision    recall  f1-score   support

           0       0.95      0.90      0.93        62
           1       0.98      0.98      0.98       935
           2       0.99      0.98      0.98       998
           3       0.99      1.00      0.99       988
           4       0.99      0.99      0.99       660
           5       1.00      1.00      1.00       718

    accuracy                           0.99      4361
   macro avg       0.98      0.98      0.98      4361
weighted avg       0.99      0.99      0.99      4361

Epoch: 4


1091it [02:54,  6.24it/s]
1it [00:00,  6.41it/s]

average RE loss :  0.03510374377125671
train_cls report: 
               precision    recall  f1-score   support

           0       0.94      0.97      0.95        62
           1       0.99      0.98      0.98       935
           2       0.98      0.98      0.98       998
           3       0.99      0.99      0.99       988
           4       1.00      1.00      1.00       660
           5       1.00      1.00      1.00       718

    accuracy                           0.99      4361
   macro avg       0.98      0.99      0.98      4361
weighted avg       0.99      0.99      0.99      4361

Epoch: 5


1091it [02:55,  6.23it/s]
4it [00:00, 32.52it/s]

average RE loss :  0.019796824859545713
train_cls report: 
               precision    recall  f1-score   support

           0       0.97      0.94      0.95        62
           1       0.99      0.99      0.99       935
           2       0.99      0.99      0.99       998
           3       1.00      1.00      1.00       988
           4       1.00      1.00      1.00       660
           5       1.00      1.00      1.00       718

    accuracy                           0.99      4361
   macro avg       0.99      0.99      0.99      4361
weighted avg       0.99      0.99      0.99      4361



273it [00:08, 32.27it/s]
1it [00:00,  6.71it/s]

Testing report:                precision    recall  f1-score   support

           0       0.95      0.79      0.86        24
           1       0.90      0.96      0.93       227
           2       0.95      0.91      0.93       252
           3       0.99      0.97      0.98       235
           4       0.97      0.97      0.97       175
           5       0.97      0.99      0.98       178

    accuracy                           0.95      1091
   macro avg       0.95      0.93      0.94      1091
weighted avg       0.95      0.95      0.95      1091

Epoch: 6


1091it [02:55,  6.23it/s]
1it [00:00,  7.46it/s]

average RE loss :  0.01965855699848626
train_cls report: 
               precision    recall  f1-score   support

           0       0.98      0.94      0.96        62
           1       0.99      0.99      0.99       935
           2       0.99      0.99      0.99       998
           3       1.00      1.00      1.00       988
           4       1.00      1.00      1.00       660
           5       1.00      1.00      1.00       718

    accuracy                           1.00      4361
   macro avg       0.99      0.99      0.99      4361
weighted avg       1.00      1.00      1.00      4361

Epoch: 7


1091it [02:55,  6.21it/s]
1it [00:00,  6.99it/s]

average RE loss :  0.017493547101724488
train_cls report: 
               precision    recall  f1-score   support

           0       0.97      0.97      0.97        62
           1       1.00      0.99      1.00       935
           2       0.99      0.99      0.99       998
           3       1.00      1.00      1.00       988
           4       0.99      1.00      0.99       660
           5       1.00      1.00      1.00       718

    accuracy                           0.99      4361
   macro avg       0.99      0.99      0.99      4361
weighted avg       0.99      0.99      0.99      4361

Epoch: 8


1091it [02:57,  6.16it/s]
1it [00:00,  6.62it/s]

average RE loss :  0.028229642225775994
train_cls report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        62
           1       0.99      0.99      0.99       935
           2       0.99      0.99      0.99       998
           3       1.00      1.00      1.00       988
           4       1.00      1.00      1.00       660
           5       0.99      0.98      0.99       718

    accuracy                           0.99      4361
   macro avg       0.99      0.99      0.99      4361
weighted avg       0.99      0.99      0.99      4361

Epoch: 9


1091it [02:56,  6.19it/s]
4it [00:00, 32.25it/s]

average RE loss :  0.01572126554415709
train_cls report: 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        62
           1       0.99      0.99      0.99       935
           2       0.99      0.99      0.99       998
           3       1.00      1.00      1.00       988
           4       0.99      1.00      1.00       660
           5       1.00      1.00      1.00       718

    accuracy                           1.00      4361
   macro avg       0.99      0.99      0.99      4361
weighted avg       1.00      1.00      1.00      4361



273it [00:08, 32.19it/s]

Testing report:                precision    recall  f1-score   support

           0       0.86      0.79      0.83        24
           1       0.92      0.93      0.93       227
           2       0.97      0.90      0.93       252
           3       0.97      0.98      0.97       235
           4       0.97      0.97      0.97       175
           5       0.95      1.00      0.98       178

    accuracy                           0.95      1091
   macro avg       0.94      0.93      0.93      1091
weighted avg       0.95      0.95      0.95      1091






In [None]:
import collections
collections.Counter(y_train)