In [2]:
!pip install transformers
import pandas as pd
from transformers import BertTokenizer,  BertModel
import torch
import numpy as np
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
np.random.seed(112)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [3]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/new-category-classification


Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/new-category-classification


In [4]:
data_set_file = "./Participants_Data_News_category/Data_Train.xlsx"
labels = {"Politics": 0,"Technology": 1, "Entertainment": 2, "Business": 3}
dataset = pd.read_excel(data_set_file)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [label for label in df['SECTION']]
        self.texts = [tokenizer(text, padding='max_length', max_length = 512, truncation=True,return_tensors="pt") for text in df['STORY']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [6]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [7]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = NewsDataset(train_data), NewsDataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0
        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
                
            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()
                
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            
        total_acc_val = 0
        total_loss_val = 0
        with torch.no_grad():
            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label.long())
                total_loss_val += batch_loss.item()
                    
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}'
            )

In [8]:
def evaluate(model, test_data):

    test = NewsDataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [9]:
df_train, df_val, df_test = np.split(dataset.sample(frac=1, random_state=42), [int(.8*len(dataset)), int(.9*len(dataset))])
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)
torch.save(model.state_dict(), './model/checkpoint.pth')

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 3051/3051 [10:53<00:00,  4.67it/s]


Epochs: 1 | Train Loss:  0.398                 | Train Accuracy:  0.710                 | Val Loss:  0.081                 | Val Accuracy:  0.975


100%|██████████| 3051/3051 [10:56<00:00,  4.65it/s]


Epochs: 2 | Train Loss:  0.054                 | Train Accuracy:  0.980                 | Val Loss:  0.041                 | Val Accuracy:  0.986


100%|██████████| 3051/3051 [10:56<00:00,  4.65it/s]


Epochs: 3 | Train Loss:  0.026                 | Train Accuracy:  0.991                 | Val Loss:  0.026                 | Val Accuracy:  0.988


100%|██████████| 3051/3051 [10:55<00:00,  4.65it/s]


Epochs: 4 | Train Loss:  0.016                 | Train Accuracy:  0.993                 | Val Loss:  0.023                 | Val Accuracy:  0.990


100%|██████████| 3051/3051 [10:55<00:00,  4.65it/s]


Epochs: 5 | Train Loss:  0.010                 | Train Accuracy:  0.996                 | Val Loss:  0.022                 | Val Accuracy:  0.990


In [10]:
evaluate(model, df_test)

Test Accuracy:  0.988


In [27]:
test = pd.read_excel('./Participants_Data_News_category/Data_Test.xlsx')
sentence = test['STORY'].tolist()
pred = []
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
for sent in sentence:
  inputs = tokenizer(sent, return_tensors="pt", truncation=True)
  input_id =inputs['input_ids'].squeeze(1).to(device)
  mask = inputs['attention_mask'].to(device)
  with torch.no_grad():
    output = model(input_id, mask)
  print("====> in try", output.argmax(dim=1).tolist(), len(sent))
  pred.append(output.argmax(dim=1).tolist())
print('-'*10)
print(pred)
df_out = pd.DataFrame(pred, columns=['SECTION'])
df_out.to_csv('./submission.csv')   

failed at 2301
failed at 3539
failed at 3251
failed at 3117
failed at 3830
failed at 3125
failed at 2768
failed at 2557
failed at 3676
failed at 2688
failed at 3583
failed at 3281
failed at 2579
failed at 2686
failed at 2960
failed at 2701
failed at 2525
failed at 2340
failed at 3322
failed at 6820
failed at 2599
failed at 2923
failed at 2943
failed at 3461
failed at 2760
failed at 3928
failed at 2420
failed at 2452
failed at 2244
failed at 2367
failed at 2709
failed at 2604
failed at 5274
failed at 2673
failed at 3162
failed at 6065
failed at 2813
----------
[[1], [2], [1], [1], [1], [1], [1], [2], [1], [2], [0], [3], [2], [1], [2], [1], [1], [2], [3], [2], [2], [2], [2], [0], [0], [2], [2], [3], [3], [0], [1], [3], [2], [0], [2], [2], [2], [2], [0], [1], [0], [1], [3], [0], [2], [2], [1], [1], [0], [1], [3], [2], [1], [2], [2], [0], [1], [1], [0], [1], [1], [2], [3], [2], [1], [1], [2], [0], [0], [1], [1], [1], [2], [0], [0], [1], [1], [1], [2], [2], [3], [0], [3], [0], [2], [2], [2]