# Unit 3 - Text Classification

Use BERT to classify texts

Also look at my solution using [LSTM](https://www.kaggle.com/update/sf-nlp-lstm-classifier)

Competition: https://www.kaggle.com/competitions/nlp-txt-classification

Based on [this article](https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f)

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [62]:
from pathlib import Path
from typing import List, Dict, Any

# import time

import torch
from torch import nn
from torch.optim import Adam

from transformers import BertTokenizer
from transformers import BertModel

from tqdm import tqdm

# import torch.nn.functional as F
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences

# import matplotlib.pyplot as plt
# import plotly.express as px
# import scikitplot as skplt

In [4]:
# Directory from which train.csv and test.csv are read.
DATA_PATH = Path('/kaggle/input/nlp-txt-classification')

# Seed value; equal to the random_state used when the training frame is shuffled.
SEED = 42
# Number of sentiment classes; equal to the number of entries in the `labels` mapping.
NUM_CLASSES = 5
# MAX_VOCAB_SIZE = 250000
# NOTE(review): the training call in this notebook runs with a batch size of 2,
# not this value - confirm which one is intended.
BATCH_SIZE = 64

In [7]:
# Load the test set and preview its first 10 rows.
df_test = pd.read_csv(DATA_PATH / 'test.csv')
df_test.head(10)

In [24]:
# Load the training set, keep only the text and label columns, and drop any
# row where either of them is missing.
df = pd.read_csv(DATA_PATH / 'train.csv')
df = df[['Text', 'Sentiment']].dropna()
df.head(10)

In [25]:
# Show the distinct sentiment label values present in the training data.
df['Sentiment'].unique()

In [26]:
# Bar chart of the number of training rows per sentiment label.
df.groupby(['Sentiment']).size().plot.bar()

In [11]:
# Load the tokenizer of the 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [12]:
# Tokenize one sample sentence to inspect what the tokenizer returns.
example_text = 'I will watch Memento tonight'
bert_input = tokenizer(
    example_text,
    padding='max_length',
    max_length = 10,
    truncation=True,
    return_tensors="pt",
)

# Print the three fields of the encoding in turn.
print(bert_input['input_ids'])
print(bert_input['token_type_ids'])
print(bert_input['attention_mask'])

Here is the explanation of `BertTokenizer` parameters above:
- `padding`: to pad each sequence to the maximum length that you specify.
- `max_length`: the maximum length of each sequence. In this example we use 10, but for our actual dataset we will use 512, which is the maximum length of a sequence allowed for BERT.
- `truncation`: if `True`, then the tokens in each sequence that exceed the maximum length will be truncated.
- `return_tensors`: the type of tensors that will be returned. Since we’re using PyTorch, we use `pt`; with TensorFlow you would use `tf`.

In [15]:
# Module-level tokenizer used by the Dataset class and by predict().
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# Sentiment label string -> integer class index (0 = most negative, 4 = most positive).
labels = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4,
}

class Dataset(torch.utils.data.Dataset):
    """In-memory dataset pairing tokenized texts with integer sentiment labels.

    All encoding happens once, at construction time: label strings are
    translated through the module-level ``labels`` mapping and texts are run
    through the module-level ``tokenizer``.
    """

    def __init__(self, df):
        """Encode the ``Sentiment`` and ``Text`` columns of ``df``.

        Raises:
            KeyError: if a ``Sentiment`` value is not a key of ``labels``.
        """
        encoded_labels = []
        for sentiment in df['Sentiment']:
            encoded_labels.append(labels[sentiment])
        self.labels = encoded_labels

        # Every text is padded or truncated to exactly 512 tokens.
        tokenizer_options = {
            'padding': 'max_length',
            'max_length': 512,
            'truncation': True,
            'return_tensors': "pt",
        }
        encoded_texts = []
        for raw_text in df['Text']:
            encoded_texts.append(tokenizer(raw_text, **tokenizer_options))
        self.texts = encoded_texts

    def classes(self):
        """Return the list of integer labels, one per sample."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the stored tokenizer output(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(encoded_text, label_array)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [31]:
# Seeds NumPy's global RNG; the shuffle below is driven by its own random_state.
np.random.seed(112)

# Shuffle the rows once with a fixed seed, then cut 80% / 20% by position.
# Positional slicing yields the same two frames as np.split(..., [cut]) but
# avoids np.split's reliance on DataFrame.swapaxes, which recent pandas
# releases deprecate.
shuffled = df.sample(frac=1, random_state=42)
split_at = int(.8 * len(df))
df_train, df_val = shuffled.iloc[:split_at], shuffled.iloc[split_at:]

print(len(df_train), len(df_val))

In [32]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    ``forward`` returns raw, unnormalized class scores (logits). No activation
    is applied to them: ``nn.CrossEntropyLoss`` expects logits, and passing
    them through ReLU would clamp every negative score to zero and block its
    gradient.
    """

    def __init__(self, num_classes: int, dropout: float = 0.5):
        """Build the encoder and the classification head.

        Args:
            num_classes: number of output scores (one per class).
            dropout: dropout probability applied to the pooled BERT output.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # 768 is the input width of the head; it must match the size of the
        # pooled output produced by the encoder loaded above.
        self.linear = nn.Linear(768, num_classes)

    def forward(self, input_id, mask):
        """Return class logits for a batch.

        Args:
            input_id: token id tensor passed to BERT as ``input_ids``.
            mask: tensor passed to BERT as ``attention_mask``.

        Returns:
            Tensor with ``num_classes`` scores per input row.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        return self.linear(self.dropout(pooled_output))

In [37]:
def _run_batch(model, criterion, batch_input, batch_label, device):
    """Run one batch through ``model``; return ``(loss, n_correct, n_samples)``."""
    batch_label = batch_label.to(device)
    # Collated encodings are expected to carry a singleton axis at position 1
    # (one per stored sample); drop it for both tensors so the model receives
    # matching 2-D inputs.
    mask = batch_input['attention_mask'].squeeze(1).to(device)
    input_id = batch_input['input_ids'].squeeze(1).to(device)

    output = model(input_id, mask)
    loss = criterion(output, batch_label)
    n_correct = (output.argmax(dim=1) == batch_label).sum().item()
    return loss, n_correct, batch_label.size(0)


def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune ``model`` and print per-epoch train/validation metrics.

    Args:
        model: module called as ``model(input_id, mask)`` and returning class scores.
        train_data: frame with ``Text`` and ``Sentiment`` columns used for training.
        val_data: frame with the same columns used for validation.
        learning_rate: learning rate handed to the Adam optimizer.
        epochs: number of passes over ``train_data``.
        batch_size: samples per batch for both loaders (defaults to 2).

    The model is moved to the GPU when one is available and is left in eval
    mode when the function returns.
    """
    # Local names differ from the function name so that `train` is not shadowed.
    train_set = Dataset(train_data)
    val_set = Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Guard the metric denominators against an empty split.
    n_train = max(len(train_set), 1)
    n_val = max(len(val_set), 1)

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0.0

        # Enable dropout for the optimisation pass.
        model.train()
        for train_input, train_label in tqdm(train_dataloader):
            batch_loss, n_correct, n_samples = _run_batch(
                model, criterion, train_input, train_label, device
            )
            # The criterion returns the batch mean; weight it by the batch
            # size so that dividing by the sample count gives a true average.
            total_loss_train += batch_loss.item() * n_samples
            total_acc_train += n_correct

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0.0

        # Disable dropout so validation metrics are deterministic.
        model.eval()
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                batch_loss, n_correct, n_samples = _run_batch(
                    model, criterion, val_input, val_label, device
                )
                total_loss_val += batch_loss.item() * n_samples
                total_acc_val += n_correct

        print(' | '.join([
            f'Epochs: {epoch_num + 1}',
            f'Train Loss: {total_loss_train / n_train:.3f}',
            f'Train Accuracy: {total_acc_train / n_train:.3f}',
            f'Val Loss: {total_loss_val / n_val:.3f}',
            f'Val Accuracy: {total_acc_val / n_val:.3f}',
        ]))

In [41]:
# One output score per entry of the label mapping.
model = BertClassifier(num_classes=len(labels))

In [42]:
# First training call: one epoch at a learning rate of 1e-6.
EPOCHS = 1
LR = 1e-6

train(model, df_train, df_val, LR, EPOCHS)

In [43]:
# Second training call on the same `model` object: two epochs at the same learning rate.
EPOCHS = 2
LR = 1e-6

train(model, df_train, df_val, LR, EPOCHS)

In [63]:
def flip_dict(x: Dict[Any, Any]) -> Dict[Any, Any]:
    """Return a new dict that maps every value of ``x`` back to its key.

    When several keys share one value, the key that comes last in iteration
    order wins. Values of ``x`` must be hashable.
    """
    return {value: key for key, value in x.items()}

In [81]:
def predict(model, text: str, labels: Dict[int, str]):
    """Classify a single text and return its label string.

    Args:
        model: module called as ``model(input_id, mask)`` and returning class scores.
        text: the text to classify.
        labels: mapping from class index to label string.

    Returns:
        ``labels[i]`` where ``i`` is the index of the highest score.

    The model is switched to eval mode for the forward pass so that dropout is
    disabled and the prediction is deterministic; its previous train/eval mode
    is restored before returning.
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            mask = encoded['attention_mask'].to(device)
            input_id = encoded['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    # Plain int so the lookup does not depend on NumPy/torch scalar hashing.
    idx = int(np.argmax(output.cpu().numpy()))
    return labels[idx]

In [72]:
# Spot check: classify the test row with index label 1 and show the text and the prediction.
x = df_test.loc[1, 'Text']

# flip_dict(labels) turns the label -> index mapping into index -> label for predict().
pred = predict(model, x, labels=flip_dict(labels))

print(x)
pred

In [76]:
# Index -> label-string mapping, built once and reused for every row.
pred_labels = flip_dict(labels)

# Classify the test texts one at a time (one predict() call per row) and
# store the predicted label string in a new `Sentiment` column.
df_test['Sentiment'] = df_test['Text'].apply(lambda text: predict(model, text, labels=pred_labels))
df_test.head(10)

In [80]:
# Write the `id` and predicted `Sentiment` columns to submission.csv, omitting the index.
# NOTE(review): assumes test.csv provides an `id` column - confirm against the data.
submission = df_test[['id', 'Sentiment']]
submission.to_csv('submission.csv', index=False)