In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tqdm import tqdm

In [None]:
# Run on the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the training split, keep only the columns used downstream, and
# lower-case the text.  Building the frame in a single chain (rather than
# assigning into a column-sliced frame) avoids pandas' SettingWithCopyWarning,
# and the vectorised `.str.lower()` replaces the per-row lambda.  Missing text
# is mapped to "" and every value is coerced to `str` first, so a NaN or
# non-string row no longer raises AttributeError.
train_df = (
    pd.read_csv("/kaggle/input/dataset-for-sub/train.csv")
    .loc[:, ["id", "text", "target"]]
    .assign(text=lambda frame: frame["text"].fillna("").astype(str).str.lower())
)

In [None]:
# Show the prepared training frame as this cell's output.
train_df

In [None]:
# Hugging Face checkpoint shared by the model and its tokenizer; kept in one
# place so both are always loaded from the same checkpoint.
MODEL_NAME = "MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33"

# Sequence-classification model configured for two output labels and moved to
# the selected device.
model = DebertaV2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    ignore_mismatched_sizes=True,
).to(device)
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Pull the model inputs and their targets out of the frame as arrays.
texts, labels = (train_df[column].values for column in ("text", "target"))

In [None]:
# Hold out 20% of the rows for validation.  `random_state` pins the shuffle so
# the split (and every metric computed from it) is reproducible across kernel
# restarts; `stratify` keeps the class ratio the same in both partitions.
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
class TextDataset(Dataset):
    """Dataset yielding tokenized texts paired with their labels.

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of labels aligned with ``texts``; each is
            converted to a ``torch.long`` tensor.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it together with its label.

        Returns:
            dict with flattened ``"input_ids"`` and ``"attention_mask"``
            tensors plus a ``torch.long`` ``"label"`` tensor.
        """
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # The tokenizer output is flattened so each field is a 1-D tensor.
        sample = {
            field: encoded[field].flatten()
            for field in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
# Token length every example is padded/truncated to, and the mini-batch size
# shared by the training and validation loaders.
MAX_LEN = 99
BATCH_SIZE = 8

train_dataset = TextDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = TextDataset(X_val, y_val, tokenizer, MAX_LEN)

# Training batches are reshuffled every epoch; a dedicated seeded generator
# makes that order reproducible after a kernel restart.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

# Validation only measures the model, so shuffling adds nothing; a fixed order
# keeps per-batch results comparable between runs.
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Cross-entropy over the model's logits is the training objective; AdamW
# updates all model parameters with a learning rate of 2e-5.
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(params=model.parameters(), lr=2e-5)

In [None]:
NUM_EPOCHS = 5  # number of full passes over the training loader

model.train()

for epoch in range(NUM_EPOCHS):
    losses = []  # per-batch training losses of the current epoch
    print(f"Epoch: {epoch + 1}/{NUM_EPOCHS}")
    print("-" * 100)

    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named `batch_labels` so the module-level `labels` array built from
        # the training frame is not overwritten by a per-batch tensor.
        batch_labels = batch["label"].to(device)

        # Clear gradients before the forward pass so that a previously
        # interrupted run of this cell cannot leak stale gradients into the
        # first update.
        optimizer.zero_grad()

        # `labels` is deliberately not passed to the model: the loss is
        # computed exactly once, below, with `loss_fn`, instead of a second
        # time inside the model's forward pass.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        logits = outputs.logits

        loss = loss_fn(logits, batch_labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch.
    epoch_loss = sum(losses) / len(losses)
    print(f"Epoch {epoch+1} average loss: {epoch_loss}")
    print("-"*100)
    print("")

In [None]:
# Evaluate the fine-tuned model on the validation loader.  `epoch` and
# `losses` are read from the training cell: they hold the index and the
# per-batch losses of the last epoch that ran.
model.eval()
val_losses = []  # per-batch validation losses
val_preds = []   # predicted class per validation example
val_true = []    # ground-truth class per validation example

with torch.no_grad():
    for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named `batch_labels` so the module-level `labels` array built from
        # the training frame is not overwritten by a per-batch tensor.
        batch_labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fn(logits, batch_labels)
        val_losses.append(loss.item())

        # Predicted class = index of the largest logit in each row.
        preds = torch.argmax(logits, dim=1)
        val_preds.extend(preds.cpu().numpy())
        val_true.extend(batch_labels.cpu().numpy())

train_loss = np.mean(losses)
val_loss = np.mean(val_losses)
val_accuracy = accuracy_score(val_true, val_preds)

print(f"\nEpoch {epoch+1}")
print(f"Train Loss: {train_loss:.4f}")
print(f"Val Loss: {val_loss:.4f}")
print(f"Val Accuracy: {val_accuracy:.4f}")
print(classification_report(val_true, val_preds))
print("-" * 50)

In [None]:
# Load the test split, keep the id and text columns, and lower-case the text
# the same way as the training data.  The single chain avoids assigning into a
# column-sliced frame (SettingWithCopyWarning), and missing / non-string text
# is coerced to a string before `.str.lower()` so such rows no longer raise
# AttributeError.
test_df = (
    pd.read_csv("/kaggle/input/dataset-for-sub/test.csv")
    .loc[:, ["id", "text"]]
    .assign(text=lambda frame: frame["text"].fillna("").astype(str).str.lower())
)
test_df

In [None]:
# Underlying array of the test text column, consumed by the test dataset.
test_texts = test_df["text"].values

In [None]:
class TestDataset(Dataset):
    """Label-free dataset yielding tokenized texts for inference.

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.texts, self.tokenizer, self.max_len = texts, tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text.

        Returns:
            dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors.
        """
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(
            str(self.texts[idx]), **tokenizer_options
        )

        # The tokenizer output is flattened so each field is a 1-D tensor.
        return {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }

In [None]:
# Wrap the test texts for inference; sequences are padded/truncated to 99 tokens.
test_dataset = TestDataset(texts=test_texts, tokenizer=tokenizer, max_len=99)

In [None]:
# Unshuffled loader so predictions come out in the same order as the dataset.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Run inference over the test loader, collecting for every example both the
# predicted class and the per-class probabilities.
model.eval()
predictions = []       # predicted class index per test example
prediction_probs = []  # softmax probabilities per test example

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        logits = outputs.logits

        # Same decision rule as the validation cell: index of the largest
        # logit in each row.
        preds = torch.argmax(logits, dim=1)
        probs = torch.softmax(logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        prediction_probs.extend(probs.cpu().numpy())

In [None]:
# Build the submission frame in one step: keep `id` and attach the predicted
# class for each row.  `assign` returns a new frame, so nothing is written
# into a column-sliced frame (the pattern pandas flags with
# SettingWithCopyWarning).
test_df = test_df[["id"]].assign(target=predictions)

In [None]:
# Write the submission file to the working directory without the index column.
test_df.to_csv(path_or_buf="sub.csv", index=False)