# AI Detector - Model Training

## 1. Import Necessary Dependencies

At first, we need to import required libraries for preprocessing

In [1]:
import os
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader

  from tqdm.autonotebook import tqdm, trange


We should also specify `device` for GPU accelerated training (if GPU is available)

In [2]:
device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


## 2. Define `train_model()` Function

- **Params:** 
  - `df` -> The preprocessed data
  - `model_name` -> The specified Sentence Transformer from [sbert.net](https://sbert.net)
  - `output_path` -> Fine-tuned model export path
  - `epochs` -> Number of iterations in the training loop (defaults to 5)
  - `batch_size` -> Size of batches of training data (defaults to 16)
- **Returns:** Nothing

The functions performs the following operations:
1. Splits the preprocessed data into training and validation data
2. Specifies data feature columns and label column for the training data. This is where feature extraction takes place (coding answers are converted to embeddings).
3. Loads training data into **DataLoaders** with ideal `batch_size`
4. Defines the model architecture and assigns to the `device`
5. Defines the loss function (`CosineSimilarityLoss` in this case)
6. Defines an evaluator with validation data
7. Finally, **Fine-tunes the specified SBERT model** and exports it to its directory

In [3]:
def train_model(df, model_name, output_path, epochs=5, batch_size=16):
    # Split the data into train and test sets
    train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

    # Create examples for training
    train_examples = [InputExample(texts=[row['candidate_combined'], row['ai_combined']], label=float(
        row['similarity_score'])) for _, row in train_df.iterrows()]

    # Create DataLoader for training with appropriate batch size
    train_dataloader = DataLoader(
        train_examples, shuffle=True, batch_size=batch_size)

    # Initialize the specified SentenceTransformer model
    model = SentenceTransformer(model_name, device=device)

    # Define the loss function
    train_loss = losses.CosineSimilarityLoss(model)

    # Prepare validation data
    valid_samples = [(row['candidate_combined'], row['ai_combined'], row['similarity_score'])
                     for _, row in valid_df.iterrows()]
    valid_examples = [InputExample(
        texts=[s[0], s[1]], label=float(s[2])) for s in valid_samples]

    # Create an evaluator
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        valid_examples, name='validation')

    # Train/fine-tune the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=epochs,
              warmup_steps=100,
              evaluator=evaluator,
              evaluation_steps=500,
              output_path=output_path)
    
    return model

## 3. Train the Model
At first, specify the data and model export directories.

In [4]:
# Load the preprocessed data
data_dir = os.path.join(os.path.abspath(''), os.pardir, 'data')
df = pd.read_csv(os.path.join(data_dir, 'preprocessed_data.csv'))

# Define model export/output path
model_dir = os.path.join(
    os.path.abspath(''), os.pardir, 'models')
output_path = os.path.join(model_dir, 'fine-tuned_all-MiniLM-L6-v2')

Train the model

In [5]:
# Train the model
model = train_model(df, 'all-MiniLM-L6-v2', output_path)

print(f"Model training complete. Model saved as {output_path}")

100%|██████████| 95/95 [04:56<00:00,  3.12s/it]

{'train_runtime': 296.6126, 'train_samples_per_second': 5.091, 'train_steps_per_second': 0.32, 'train_loss': 0.1601176613255551, 'epoch': 5.0}
Model training complete. Model saved as e:\Data Science\AI-Detector\notebooks\..\models\fine-tuned_all-MiniLM-L6-v2



