# Preparation

In [None]:
# List the devices TensorFlow can see on this runtime (used to check the current GPU)
from tensorflow.python.client import device_lib

local_devices = device_lib.list_local_devices()
local_devices

In [None]:
# Mount Google Drive (optional, if you want to save the data there in case the environment disconnects).
# force_remount=True remounts the drive even when it is already mounted.
# The last command lists the "DP" folder on the mounted drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive/DP"

In [None]:
# Copy the HF_TOKEN secret from Google Colab User Data into the process environment
import os
from google.colab import userdata

os.environ.update(HF_TOKEN=userdata.get('HF_TOKEN'))

In [None]:
# Clone the repository.
# Any previous checkout is deleted first, so re-running this cell always yields a fresh clone.
! rm -rf masters-thesis
! git clone https://github.com/therazix/masters-thesis.git

In [None]:
# Install requirements.
# If a session restart is requested, restart the session and run this cell again.
! pip install -r masters-thesis/requirements.txt

In [None]:
# You can always use --help for each subcommand to show help.

# Data scraping

In [None]:
# Scrape the CSFD website; -o is the location the scraped data is written to.
# NOTE(review): --limit presumably caps how much is scraped — check `scrape csfd --help` for its exact meaning.
!python masters-thesis/main.py -v scrape csfd -o "scraped_data/csfd" --limit 6000

In [None]:
# Scrape Reddit; -o is the location the scraped data is written to.
!python masters-thesis/main.py -v scrape reddit -o "scraped_data/reddit"

In [None]:
# Scrape the TN.cz website; -o is the location the scraped data is written to.
!python masters-thesis/main.py -v scrape tn-cz -o "scraped_data/tn-cz" --limit 1000

# Dataset

## Create dataset for encoder-only models (XLM-RoBERTa, Ensemble)

In [None]:
# Create a dataset from the data that was scraped from CSFD (3 files will be created - training set, validation set, and testing set).
# The dataset will have 5 authors (-n) and each author will have a maximum of 1000 texts (-l).
# --add-text-features also adds stylometric features (required for the Ensemble model).
!python masters-thesis/main.py -v dataset create -i "scraped_data/csfd/csfd_EXAMPLE.csv" -o "datasets/csfd/" -n 5 -l 1000 --add-text-features

# You can repeat this process for each dataset

## Create dataset for decoder-only models (Llama, Mistral, GPT)

In [None]:
# Create a dataset from the data that was scraped from CSFD (1 file will be created - testing set only).
# When you specify multiple repetitions (-r), the model will be evaluated r times during testing and the average will be calculated.
# Each repetition should have different texts.
!python masters-thesis/main.py -v dataset create-prompting -i "scraped_data/csfd/csfd_EXAMPLE.csv" -o "datasets/csfd_prompt/" -n 5 -r 3

## Create dataset for fine-tuning decoder-only models (Llama, Mistral)

In [None]:
# Create a dataset from the data that was scraped from CSFD (1 file will be created - training set only).
# NOTE(review): this writes to the same output directory ("datasets/csfd_prompt/") as the prompting dataset cell — confirm this is intended.
!python masters-thesis/main.py -v dataset create-finetuning -i "scraped_data/csfd/csfd_EXAMPLE.csv" -o "datasets/csfd_prompt/" -n 5 -r 3

## Show info about the dataset

In [None]:
# Show information about a dataset.
# The input can either be scraped data, as in this case, or an already parsed dataset.
# The --graph argument is optional.
!python masters-thesis/main.py -v dataset info -i "scraped_data/csfd/csfd_EXAMPLE.csv" --graph

# Training

## XLM-RoBERTa

In [None]:
# Train the model for 5 authors on CSFD dataset.
# -e is the number of epochs to train.
# The testing set is not needed during training, but if provided, the model will automatically evaluate at the end.
# To continue training from a checkpoint, add this option to the command:
#   --checkpoint "models/csfd/xlm_roberta_top5/checkpoint-123"
# NOTE: keep comments above the command. The "\" must be the very last character of a line
# for the command to continue on the next line, so inline comments would cut the command short.
! python masters-thesis/main.py -v train xlm-roberta \
  --training-set "datasets/csfd/train_top5.csv" \
  --validation-set "datasets/csfd/val_top5.csv" \
  --testing-set "datasets/csfd/test_top5.csv" \
  -o "models/csfd/xlm_roberta_top5" \
  -e 5

## Ensemble

In [None]:
# Train the Ensemble model for 5 authors on CSFD dataset.
# --model is the saved XLM-RoBERTa model (required).
# The testing set is not needed during training, but if provided, the model will automatically evaluate at the end.
# The output directory matches the --classifiers-dir used by the Ensemble testing cell.
# NOTE: keep comments above the command. The "\" must be the very last character of a line
# for the command to continue on the next line, so inline comments would cut the command short.
! python masters-thesis/main.py -v train ensemble \
  --model "models/csfd/xlm_roberta_top5/checkpoint-123" \
  --training-set "datasets/csfd/train_top5.csv" \
  --testing-set "datasets/csfd/test_top5.csv" \
  -o "models/csfd/ensemble_5"

## Llama 3.1

In [None]:
# Install the latest version of unsloth.
# The first command installs the released package (together with its dependencies);
# the second one replaces it with the current GitHub version, installed with --no-deps.
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
# Fine-tune the Llama 3.1 model for 3 epochs.
# NOTE(review): --repo-id looks like a HuggingFace repository for the fine-tuned model — confirm.
# NOTE(review): the dataset paths below do not match the output directory of the dataset
# creation cells ("datasets/csfd_prompt/") — adjust them to where your files actually are.
! python masters-thesis/main.py -v train llama3 \
  --output-dir "results/csfd" \
  --repo-id "RaZiX/Llama-3.1-8B-Instruct-AA" \
  --training-set "datasets/finetune/dataset.csv" \
  --testing-set "datasets/csfd/test_prompts_5authors_3reps.csv" \
  --template "cz" \
  --epochs 3

## Mistral v0.3

In [None]:
# Fine-tune the Mistral model for 3 epochs.
# NOTE(review): --repo-id looks like a HuggingFace repository for the fine-tuned model — confirm.
# NOTE(review): the dataset paths below do not match the output directory of the dataset
# creation cells ("datasets/csfd_prompt/") — adjust them to where your files actually are.
! python masters-thesis/main.py -v train mistral \
  --output-dir "results/csfd" \
  --repo-id "RaZiX/Mistral-7B-Instruct-v0.3-AA" \
  --training-set "datasets/finetune/dataset.csv" \
  --testing-set "datasets/csfd/test_prompts_5authors_3reps.csv" \
  --template "cz" \
  --epochs 3

# Testing

## XLM-RoBERTa

In [None]:
# Test the XLM-RoBERTa model for 5 authors on CSFD dataset.
# NOTE(review): the output directory is named "ensemble_5" although this cell tests
# XLM-RoBERTa — looks like a copy-paste leftover; confirm the intended path.
! python masters-thesis/main.py -v test xlm-roberta \
  --model "models/csfd/xlm_roberta_top5/checkpoint-123" \
  --testing-set "datasets/csfd/test_top5.csv" \
  -o "models/csfd/ensemble_5"

## Ensemble

In [None]:
# Test the Ensemble model for 5 authors on CSFD dataset.
# --model is the saved XLM-RoBERTa checkpoint.
# NOTE(review): --classifiers-dir presumably has to be the directory written by `train ensemble` — confirm.
! python masters-thesis/main.py -v test ensemble \
  --model "models/csfd/xlm_roberta_top5/checkpoint-123" \
  --testing-set "datasets/csfd/test_top5.csv" \
  --classifiers-dir "models/csfd/ensemble_5"

## Llama 3.1

In [None]:
# Test the model for 5 authors on CSFD dataset.
# Use --template "cz-1shot" for the 1-shot scenario.
# To test a fine-tuned model, add its HuggingFace model name to the command:
#   --model-name "RaZiX/Llama-3.1-8B-Instruct-AA"
# NOTE: keep comments and disabled options above the command. A "#" inside the continued
# command turns the rest of it into a shell comment, and the "\" must be the very last
# character of a line for the command to continue on the next line.
! python masters-thesis/main.py -v test llama3 \
  --output-dir "results/csfd" \
  --testing-set "datasets/csfd/test_prompts_5authors_3reps.csv" \
  --template "cz"

## Mistral

In [None]:
# Test the model for 5 authors on CSFD dataset.
# Use --template "cz-1shot" for the 1-shot scenario.
# To test a fine-tuned model, add its HuggingFace model name to the command:
#   --model-name "RaZiX/Mistral-7B-Instruct-v0.3-AA"
# NOTE: keep comments and disabled options above the command. A "#" inside the continued
# command turns the rest of it into a shell comment, and the "\" must be the very last
# character of a line for the command to continue on the next line.
! python masters-thesis/main.py -v test mistral \
  --output-dir "results/csfd" \
  --testing-set "datasets/csfd/test_prompts_5authors_3reps.csv" \
  --template "cz"

## GPT-4o

In [None]:
# Test the model for 5 authors on CSFD dataset.
# Use --template "cz-1shot" for the 1-shot scenario.
# The OpenAI API key can be passed with --openai-api-key "KEY"; it can also be provided as environment variable.
! python masters-thesis/main.py -v test gpt-4o \
  --output-dir "results/csfd" \
  --testing-set "datasets/csfd/test_prompts_5authors_3reps.csv" \
  --template "cz"