In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
import torch.cuda.amp as amp  # For mixed precision training

# Set random seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Step 1: Load and Preprocess the Dataset
def load_txt_file(file_path):
    # Load the dataset with specified column names
    df = pd.read_csv(file_path, delimiter='\t', header=None, names=['sentence1', 'sentence2', 'label'])

    # Drop rows with missing values
    df.dropna(inplace=True)

    # Filter for binary labels (0 and 1)
    df = df[df['label'].isin([0, 1])]

    # Concatenate sentence1 and sentence2 with [SEP]
    df['text'] = df['sentence1'] + " [SEP] " + df['sentence2']

    # Check class distribution
    print("Class distribution before balancing:", Counter(df['label']))

    # Oversample minority class to balance the dataset (default behavior: equalize classes)
    oversampler = RandomOverSampler(random_state=42)  # Removed sampling_strategy=0.5
    X_resampled, y_resampled = oversampler.fit_resample(df[['text']], df['label'])
    df = pd.DataFrame({'text': X_resampled['text'], 'label': y_resampled})

    print("Class distribution after balancing:", Counter(df['label']))

    # Subsample the dataset to reduce training time (e.g., 50%)
    df = df.sample(frac=0.5, random_state=42)
    print(f"Dataset size after subsampling: {len(df)}")

    return df

def preprocess_text(text):
    text = text.lower()
    text = text.replace('\d+', '')  # Remove numbers
    return text

# Step 2: Prepare the Dataset Class
class PlagiarismDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):  # Reduced to 128
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = str(row['text']) if not pd.isna(row['text']) else ""
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        label = torch.tensor(row['label'], dtype=torch.long)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': label
        }

# Custom collate function for DataLoader
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_masks = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([item['label'] for item in batch])
    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'label': labels
    }

# Step 3: Load and Split the Dataset
file_path = '/content/train_snli.txt'
df = load_txt_file(file_path)
df['text'] = df['text'].apply(preprocess_text)

# Split into train, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

print(f"Train shape: {train_df.shape}, Validation shape: {val_df.shape}, Test shape: {test_df.shape}")

# Step 4: Initialize Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")
model = AutoModelForSequenceClassification.from_pretrained("HuggingFaceTB/SmolLM-135M", num_labels=2)

# Add padding token and resize embeddings
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Step 5: Create Datasets and DataLoaders
train_set = PlagiarismDataset(train_df, tokenizer, max_length=128)
valid_set = PlagiarismDataset(val_df, tokenizer, max_length=128)
test_set = PlagiarismDataset(test_df, tokenizer, max_length=128)

train_loader = DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=4
)
valid_loader = DataLoader(
    valid_set,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=4
)
test_loader = DataLoader(
    test_set,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=4
)

# Step 6: Setup Device and Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

# Step 7: Define Training and Evaluation Functions
def train_model(model, train_loader, val_loader, optimizer, loss_fn, epochs=2):
    scaler = amp.GradScaler()  # For mixed precision training
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            with amp.autocast():
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}")

        val_accuracy, _ = evaluate_model(model, val_loader)
        print(f"Epoch {epoch+1}/{epochs}, Validation Accuracy: {val_accuracy:.4f}")
    print("Training complete!")

def evaluate_model(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            with amp.autocast():
                outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    report = classification_report(all_labels, all_predictions, target_names=['Non-Plagiarized', 'Plagiarized'])
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:\n", report)
    return accuracy, report

# Step 8: Train the Model
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()
train_model(model, train_loader, valid_loader, optimizer, loss_fn, epochs=2)

# Step 9: Evaluate on Test Set
print("\nEvaluating the model on test set...")
test_accuracy, test_report = evaluate_model(model, test_loader)

Class distribution before balancing: Counter({0: 183964, 1: 183405})
Class distribution after balancing: Counter({0: 183964, 1: 183964})
Dataset size after subsampling: 183964
Train shape: (132453, 2), Validation shape: (14718, 2), Test shape: (36793, 2)


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM-135M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


  scaler = amp.GradScaler()  # For mixed precision training
  with amp.autocast():
Training Epoch 1: 100%|██████████| 4140/4140 [26:40<00:00,  2.59it/s]

Epoch 1/2, Training Loss: 0.1603



  with amp.autocast():


Accuracy: 0.9551

Classification Report:
                  precision    recall  f1-score   support

Non-Plagiarized       0.96      0.95      0.95      7347
    Plagiarized       0.95      0.96      0.96      7371

       accuracy                           0.96     14718
      macro avg       0.96      0.96      0.96     14718
   weighted avg       0.96      0.96      0.96     14718

Epoch 1/2, Validation Accuracy: 0.9551


  with amp.autocast():
Training Epoch 2: 100%|██████████| 4140/4140 [26:40<00:00,  2.59it/s]

Epoch 2/2, Training Loss: 0.0758



  with amp.autocast():


Accuracy: 0.9571

Classification Report:
                  precision    recall  f1-score   support

Non-Plagiarized       0.97      0.95      0.96      7347
    Plagiarized       0.95      0.97      0.96      7371

       accuracy                           0.96     14718
      macro avg       0.96      0.96      0.96     14718
   weighted avg       0.96      0.96      0.96     14718

Epoch 2/2, Validation Accuracy: 0.9571
Training complete!

Evaluating the model on test set...


  with amp.autocast():


Accuracy: 0.9600

Classification Report:
                  precision    recall  f1-score   support

Non-Plagiarized       0.97      0.95      0.96     18566
    Plagiarized       0.95      0.97      0.96     18227

       accuracy                           0.96     36793
      macro avg       0.96      0.96      0.96     36793
   weighted avg       0.96      0.96      0.96     36793

