In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle
import os
from tqdm import tqdm
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AdamW
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from transformers.activations import get_activation
import numpy as np

## Load data

In [2]:
data_df = pd.read_csv('data/Question_Classification_Dataset.csv')

In [3]:
data_df.head()

Unnamed: 0.1,Unnamed: 0,Questions,Category0,Category1,Category2
0,0,How did serfdom develop in and then leave Russ...,DESCRIPTION,DESC,manner
1,1,What films featured the character Popeye Doyle ?,ENTITY,ENTY,cremat
2,2,How can I find a list of celebrities ' real na...,DESCRIPTION,DESC,manner
3,3,What fowl grabs the spotlight after the Chines...,ENTITY,ENTY,animal
4,4,What is the full form of .com ?,ABBREVIATION,ABBR,exp


In [4]:
le = preprocessing.LabelEncoder()
data_texts = data_df['Questions'].to_list()
labels = le.fit_transform(data_df['Category0'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data_texts, labels, test_size=0.2, random_state=0)

## Vectorization

In [13]:
vect = TfidfVectorizer(ngram_range = (1,2)).fit(X_train)

In [14]:
len(vect.vocabulary_)

27560

In [15]:
train_vector = vect.transform(X_train)
test_vector = vect.transform(X_test)

## SVM

In [16]:
model1 = SVC(kernel='linear', probability = True)
model1.fit(train_vector, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [17]:
pred = model1.predict(test_vector)

In [20]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.62      0.94      0.75        16
           1       0.90      0.80      0.85       257
           2       0.81      0.73      0.77       278
           3       0.85      0.90      0.87       220
           4       0.82      0.93      0.87       154
           5       0.90      0.97      0.94       166

    accuracy                           0.85      1091
   macro avg       0.82      0.88      0.84      1091
weighted avg       0.85      0.85      0.85      1091



# BERT

### Build dataloader

In [6]:
class QuestionDataset(DataLoader):
    def __init__(self, data, labels, max_length, tokenizer):
        self.data = data
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        encoding = self.tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=self.max_length,
          return_token_type_ids=False,
          pad_to_max_length=True,
          return_attention_mask=True,
          return_tensors='pt',
        )
        text_tokenized = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        label = self.labels[idx]
        
        return torch.tensor(text_tokenized), torch.tensor(attention_mask), torch.tensor(label)

In [7]:
def create_dataloader(texts, labels, pretrained_model_name=None, max_length=64, batch_size=4, shuffle=True, num_workers=0):
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
    dataset = QuestionDataset(texts, labels, max_length=max_length, tokenizer=tokenizer)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=False)

In [8]:
MODEL_NAME = 'bert-base-uncased'

train_loader = create_dataloader(X_train, y_train, pretrained_model_name=MODEL_NAME)
test_loader = create_dataloader(X_test, y_test, pretrained_model_name=MODEL_NAME)

### Build Model

In [9]:
class QuestionClassifier(nn.Module):
    def __init__(self, n_classes, pretrained_model_name='bert-base-uncased'):
        super(QuestionClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
#         for name, param in self.bert.named_parameters():
#             if 'encoder' in name:
#                 param.requires_grad = False
        self.dense = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.out_proj = nn.Linear(self.bert.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        hidden_states, pooled_output = self.bert(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
#         sequence_output_cls = hidden_states[:, 0, :]
        x = self.dropout(pooled_output)
        x = self.dense(x)
        x = get_activation("tanh")(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

### Training

In [10]:
def evaluate(net, test_loader):
    net.eval()
    cuda = torch.cuda.is_available()
    labels = []
    preds = []

    for i, batch in tqdm(enumerate(test_loader)):
        x, attention_masks, label = batch
        label = label.data
        labels.append(label)
        if cuda:
            x = x.cuda()
            attention_masks = attention_masks.cuda()

        prediction = net(x, attention_masks)
        
        prediction.to('cpu')
        pred_label = prediction.argmax(dim=1).to('cpu')
        
        preds.append(pred_label)
    
    all_labels = []
    all_preds = []
    for i in range(len(labels)):
        all_labels += labels[i].tolist()
        all_preds += preds[i].tolist()
    
    print("Testing report: ", classification_report(all_labels, all_preds))

In [11]:
def train(num_epochs=100):
    cuda = torch.cuda.is_available()
    train_loader = create_dataloader(X_train, y_train, pretrained_model_name=MODEL_NAME)
    test_loader = create_dataloader(X_test, y_test, pretrained_model_name=MODEL_NAME)
    net = QuestionClassifier(n_classes=len(set(y_train)), pretrained_model_name=MODEL_NAME)
    if cuda:
        net.cuda()
    def train_model(model, optimizer=None, loss_fn=None, scheduler=None, do_eval=False):
        model.train()
        epoch_loss = []
        all_labels = []
        all_preds = []
        for i, batch in tqdm(enumerate(train_loader)):
            x, attention_masks, labels = batch
            labels = labels.long()
            if cuda:
                x = x.cuda()
                labels = labels.cuda()
                attention_masks = attention_masks.cuda()

            prediction = model(x, attention_masks)
            loss = loss_fn(prediction, labels.view(-1))
            pred = prediction.argmax(dim=-1)
            all_labels.append(labels.data.to('cpu'))
            all_preds.append(pred.to('cpu'))
            
            epoch_loss.append(loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        average_loss = np.mean(epoch_loss)
        new_all_labels = []
        new_all_preds = []
        for i in range(len(all_labels)):
            new_all_labels += all_labels[i].tolist()
            new_all_preds += all_preds[i].tolist()

        print("average RE loss : ", average_loss)
        print("train_cls report: \n", classification_report(new_all_labels, new_all_preds))
        if do_eval:
            evaluate(model, test_loader)
            
    criteria = torch.nn.CrossEntropyLoss().cuda()
#     no_decay = ["bias", "LayerNorm.weight"]
#     optimizer_grouped_parameters = [
#         {
#             "params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)],
#             "weight_decay": 0.0,
#         },
#         {
#             "params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)],
#             "weight_decay": 0.0,
#         },
#     ]
    optimizer = torch.optim.Adam([{"params": net.parameters(), "lr": 0.0001}])
    for epoch in range(num_epochs):
        print('Epoch:', epoch)
        do_eval = False
        if epoch % 5 == 0 or epoch == num_epochs - 1:
            do_eval = True
        train_model(net, optimizer=optimizer, loss_fn=criteria, scheduler=None, do_eval=do_eval)

In [None]:
train(num_epochs=10)

In [13]:
import collections
collections.Counter(y_train)

Counter({4: 660, 5: 718, 3: 988, 1: 935, 2: 998, 0: 62})