In [1]:
import pandas as pd
# Read the train / validation / test CSV splits in one pass; the tuple
# unpacking binds each resulting DataFrame to its own name.
train_data, val_data, test_data = map(
    pd.read_csv,
    (
        'data-for-classification-generated-text/train.csv',
        'data-for-classification-generated-text/val.csv',
        'data-for-classification-generated-text/test.csv',
    ),
)

In [2]:
# Print a label, then end the cell on the bare expression so the notebook
# renders the first rows of the training split as a table.
print('train_data: \n')
train_data.head()

train_data: 



Unnamed: 0,id,prompt_id,text,generated
0,e_k7r6yo0s,8,"For the majority of students in the US, school...",1
1,e_dkh6chp9,2,Have you ever thought about what was the face ...,1
2,e_eenni7od,2,A face has been found by NASA's Viking 1 space...,1
3,e_pi969vyg,3,Do you know what the Seagoing Cowboys program ...,0
4,e_02r7rk2e,9,Students with a grade C average should be able...,0


In [None]:
# Print a label, then end the cell on the bare expression so the notebook
# renders the first rows of the validation split as a table.
print('val_data: \n')
val_data.head()

In [None]:
# Print a label, then end the cell on the bare expression so the notebook
# renders the first rows of the test split as a table.
print('test_data: \n')
test_data.head()

# BERT: fine-tuning a sequence classifier on the `generated` label

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch import nn
from tqdm import tqdm
import torch
from sklearn.metrics import accuracy_score

In [None]:
# Create a PyTorch Dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        dataframe: DataFrame with a ``text`` column (the input) and a
            ``generated`` column (the integer class label).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``
            that returns a mapping with ``input_ids`` and ``attention_mask``
            tensors.
        max_length: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the encoded text and label for the row at position ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            0-D ``labels`` tensor of dtype ``torch.long``.

        Raises:
            ValueError: if the label cell cannot be converted to ``int``
                (for example, it is missing).
        """
        # Column-first lookup reads a single cell; the row-first form
        # (``iloc[idx][col]``) materialises a whole row Series on every access.
        raw_text = self.dataframe['text'].iloc[idx]
        raw_label = self.dataframe['generated'].iloc[idx]

        # The tokenizer only accepts strings: a missing cell becomes an empty
        # string and any other non-string value is stringified, so one bad
        # row cannot abort an epoch.
        if isinstance(raw_text, str):
            text = raw_text
        elif pd.isna(raw_text):
            text = ''
        else:
            text = str(raw_text)

        label = torch.tensor(int(raw_label), dtype=torch.long)
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch axis of size 1; flatten
        # drops it so the DataLoader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label,
        }

In [None]:
# Tokenizer and classifier both start from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# The classifier gets one output per distinct value in the 'generated' column
# (missing values, if any, count as a value, matching len(unique())).
num_labels = train_data['generated'].nunique(dropna=False)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Wrap each split in a dataset that tokenizes to a fixed length of 512.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(frame, tokenizer, max_length=512)
    for frame in (train_data, val_data, test_data)
)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=8, shuffle=False)
    for split in (val_dataset, test_dataset)
)

In [None]:
# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Assign the result instead of ending the cell on a bare expression: a bare
# `model.to(device)` makes the notebook print the entire module tree.
# nn.Module.to returns the module itself, so `model` is the same object.
model = model.to(device)

In [None]:
def _predict_labels(model, loader, device):
    """Run ``model`` over ``loader`` without gradients.

    Puts the model in eval mode and returns ``(true_labels, predictions)``
    as two lists of Python ints, in loader order.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for batch in loader:
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            predicted.extend(torch.argmax(logits, dim=1).tolist())
            # Labels are only used for scoring, so they never leave the CPU.
            gold.extend(batch['labels'].tolist())
    return gold, predicted


# Initialize optimizer
# torch.optim.AdamW is used in place of the AdamW class that transformers
# deprecated. eps and weight_decay are pinned to that class's defaults
# (1e-6 and 0.0) so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop: one optimisation pass over train_loader per epoch, followed
# by a validation pass and a checkpoint of the model weights.
num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_predictions = []
    true_labels = []

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Track running training accuracy; logits are detached because the
        # predictions are bookkeeping only and need no gradient history.
        predictions = torch.argmax(outputs.logits.detach(), dim=1)
        total_predictions.extend(predictions.tolist())
        true_labels.extend(batch['labels'].tolist())

    average_loss = total_loss / len(train_loader)
    accuracy = accuracy_score(true_labels, total_predictions)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}, Accuracy: {accuracy}')

    # Validation step (the helper switches the model to eval mode; the next
    # epoch switches it back with model.train()).
    val_labels, val_predictions = _predict_labels(model, val_loader, device)
    val_accuracy = accuracy_score(val_labels, val_predictions)
    print(f'Validation Accuracy: {val_accuracy}')

    # Save the model after each epoch
    model_save_path = f'model_bert_epoch_{epoch + 1}'
    model.save_pretrained(model_save_path)
    print(f'Model saved at {model_save_path}')

In [None]:
import shutil

# Zip the checkpoint directory written after the last training epoch.
# The directory name is derived from num_epochs rather than hard-coded, so
# changing the epoch count cannot leave this cell archiving a stale or
# missing directory.
final_checkpoint_dir = f'model_bert_epoch_{num_epochs}'
shutil.make_archive(f'/kaggle/working/{final_checkpoint_dir}', 'zip', final_checkpoint_dir)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Testing step: run the fine-tuned model over the held-out test split with
# gradients disabled and collect predicted vs. true labels.
model.eval()
test_predictions = []
test_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        logits = model(input_ids, attention_mask=attention_mask).logits
        # The predicted class is the index of the largest logit in each row.
        test_predictions.extend(torch.argmax(logits, dim=1).tolist())
        # Labels are only used for scoring, so they stay on the CPU.
        test_labels.extend(batch['labels'].tolist())

test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')
# classification_report returns a multi-line table whose header row is
# indented to line up with the rows below it. It is started on its own line
# (string concatenation, so print adds no separator space) to keep the
# columns aligned.
print('Report Classification: \n' + classification_report(test_labels, test_predictions))
print('Confusion matrix: \n', confusion_matrix(test_labels, test_predictions))

# # Save the model
# model.save_pretrained('model_bert')