In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm

In [11]:
# Load the raw CSV splits: training comments, test comments, and the test
# labels, which live in a separate file keyed by `id`.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
test_labels_df = pd.read_csv('data/test_labels.csv')

# Fixed seed for the down-sampling below, so a re-run of this cell selects the
# same negative rows and the experiment is reproducible.
SAMPLE_SEED = 42

# Keep only the columns needed for the binary `toxic` task.
train_df = train_df[['id', 'comment_text', 'toxic']]
# Rebalance: keep a 10% sample of the toxic == 0 rows and every toxic == 1 row.
negative_sample_train = train_df[train_df['toxic'] == 0].sample(frac=0.1, random_state=SAMPLE_SEED)
positive_sample_train = train_df[train_df['toxic'] == 1]
train_df = pd.concat([negative_sample_train, positive_sample_train])
test_labels_df = test_labels_df[['id', 'toxic']]

# Attach labels to the test comments by id, then drop rows whose label is -1
# (presumably a "not scored" marker in this dataset -- verify against its docs).
test_df = pd.merge(test_df, test_labels_df, on='id', how='inner')
test_df = test_df[test_df['toxic'] != -1]

In [12]:
# Tokenizer shared by the train and test datasets.
# Only the tokenizer is loaded in this cell: the encoder weights are loaded
# inside `TextClassifier` when the classifier is constructed, so loading a
# second, unused copy of the encoder here would only cost time and memory.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [13]:
class TextClassifier(nn.Module):
    """Binary text classifier: a pretrained transformer encoder plus a linear head.

    The hidden state at sequence position 0 is projected to a single value and
    passed through a sigmoid, so `forward` returns values in (0, 1).

    Args:
        transformer_model: Model name or path handed to
            `AutoModel.from_pretrained` and `AutoTokenizer.from_pretrained`.
    """

    def __init__(self, transformer_model):
        super().__init__()
        self.model = AutoModel.from_pretrained(transformer_model)
        self.tokenizer = AutoTokenizer.from_pretrained(transformer_model)
        # Size the head from the encoder's own config rather than hard-coding
        # 768, so checkpoints with a different hidden width also work.
        self.fc = nn.Linear(self.model.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def get_static_embeddings_matrix(self):
        """Return the weight tensor of the encoder's input embedding layer."""
        embeddings_matrix = self.model.get_input_embeddings()
        return embeddings_matrix.weight

    def forward(self, input_ids, attention_mask):
        """Score a batch of tokenized sequences.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder alongside `input_ids`.

        Returns:
            Tensor with a trailing dimension of size 1 holding the sigmoid of
            the linear head applied to the position-0 hidden state.
        """
        outputs = self.model(input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        cls_token = outputs.last_hidden_state[:, 0]
        return self.sigmoid(self.fc(cls_token))

In [14]:
class TextDataset(Dataset):
    """Dataset that tokenizes one comment per lookup.

    `data` is indexed positionally and must provide `comment_text` and `toxic`
    columns; `tokenizer` is called once for every requested item.
    """

    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Number of rows in the wrapped data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenizer output for row `idx` with a `label` entry added."""
        comments = self.data['comment_text']
        targets = self.data['toxic']
        comment, target = comments.iloc[idx], targets.iloc[idx]
        # Every comment is padded or truncated to a fixed length of 128.
        tokenizer_kwargs = dict(
            return_tensors='pt',
            padding='max_length',
            max_length=128,
            truncation=True,
        )
        encoded = self.tokenizer(comment, **tokenizer_kwargs)
        encoded['label'] = torch.tensor(target)
        return encoded

In [None]:
# Wrap both splits so rows are tokenized lazily, one item at a time.
train_dataset = TextDataset(data=train_df, tokenizer=tokenizer)
test_dataset = TextDataset(data=test_df, tokenizer=tokenizer)

# Only the training split is shuffled; the test split keeps its row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=512, shuffle=False)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Classifier, loss on sigmoid outputs, and optimizer over all parameters
# (encoder and head alike).
model = TextClassifier('bert-base-uncased')
model = model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)


In [None]:
# Number of passes over the training set; used for both the loop bound and
# the progress message so the two cannot drift apart.
NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_loss = 0.0
    for batch in tqdm(train_loader):
        # Batches are expected with an extra size-1 dim at position 1 (one per
        # tokenized item); drop it so the encoder sees (batch, seq_len).
        input_ids = batch['input_ids'].to(device).squeeze(1)
        attention_mask = batch['attention_mask'].to(device).view(input_ids.shape)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        # squeeze(-1) removes only the trailing size-1 dim, so a final batch
        # holding a single sample keeps shape (1,) and still matches `labels`.
        loss = criterion(outputs.squeeze(-1), labels.float())
        loss.backward()
        optimizer.step()
        # loss.item() is a Python float, so the running total is a float too.
        epoch_loss += loss.item()
    # `epoch_loss` is a plain float here; it has no .item() method.
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {epoch_loss/len(train_loader)}')

100%|██████████| 59/59 [02:06<00:00,  2.14s/it]


Epoch 1/5, Loss: 0.13523778319358826


100%|██████████| 59/59 [02:07<00:00,  2.16s/it]


Epoch 2/5, Loss: 0.11528488993644714


100%|██████████| 59/59 [02:07<00:00,  2.16s/it]


Epoch 3/5, Loss: 0.2600904405117035


100%|██████████| 59/59 [02:07<00:00,  2.16s/it]


Epoch 4/5, Loss: 0.200520321726799


100%|██████████| 59/59 [02:07<00:00,  2.16s/it]

Epoch 5/5, Loss: 0.25456953048706055





In [18]:
# Accuracy on the test loader: count predictions that match the labels after
# rounding the model's outputs.
model.eval()
correct, total = 0, 0
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in tqdm(test_loader):
        # Same reshaping as in training: drop the size-1 dim at position 1.
        input_ids = batch['input_ids'].to(device).squeeze(1)
        attention_mask = batch['attention_mask'].to(device).view(input_ids.shape)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        predicted = outputs.squeeze().round()

        total += labels.shape[0]
        correct += predicted.eq(labels).sum().item()
    print(f'Accuracy: {correct/total}')

100%|██████████| 2000/2000 [02:04<00:00, 16.08it/s]

Accuracy: 0.8541373597174029



