# Mistral-7b + External Training Datasets (daigt)

Credit : https://www.kaggle.com/code/minhsienweng/mistral-7b-v0-detection-train-infer

Credit : https://www.kaggle.com/code/hotchpotch/infer-llm-detect-ai-comp-mistral-7b

This notebook investigates the use of an pretrained LLM to identify texts generated by another LLM. We employ `Mistral-7b-v0` as an initial approach.

# Install library

In [1]:
!pip install -q peft --no-index --find-links /kaggle/input/llm-detect-pip/peft-0.5.0-py3-none-any.whl
# Eanble 4-bit CUDA functions for PyTorch
!pip install -q bitsandbytes --no-index --find-link /kaggle/input/llm-detect-pip/bitsandbytes-0.41.1-py3-none-any.whl
!pip install -q accelerate --no-index --find-links /kaggle/input/llm-detect-pip/accelerate-0.24.1-py3-none-any.whl 
!pip install -q transformers --no-index --find-links /kaggle/input/llm-detect-pip/transformers-4.34.1-py3-none-any.whl
# Install language tool
!!pip install -q language-tool-python --no-index --find-links /kaggle/input/daigt-misc/language_tool_python-2.7.1-py3-none-any.whl
!!mkdir -p /root/.cache/language_tool_python/
!!cp -r /kaggle/input/daigt-misc/lang57/LanguageTool-5.7 /root/.cache/language_tool_python/LanguageTool-5.7

[]

In [2]:
from __future__ import annotations
import time, sys, gc, logging, random
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch
from transformers import AutoTokenizer, LlamaForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import transformers
import peft
from accelerate import Accelerator
import bitsandbytes
from sklearn.metrics import accuracy_score, roc_auc_score
from shutil import rmtree
import language_tool_python
import optuna
import concurrent
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait

print(transformers.__version__)
print(peft.__version__)
print(torch.__version__)

language_tool = language_tool_python.LanguageTool('en-US')
N_FOLD = 5
SEED = 42
DEBUG = True
IS_TRAIN = False

# Seed the same seed to all 
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_everything()
# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

log_level = "DEBUG"

logger = logging.getLogger(__name__)
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.WARNING
)

# set the main code and the modules it uses to the same log-level according to the node
transformers.utils.logging.set_verbosity(log_level)



4.33.0
0.5.0
2.0.0


# Load training data

In [3]:
# Cross validation
def cv_split(train_data):
    skf = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    X = train_data.loc[:, train_data.columns != "label"]
    y = train_data.loc[:, train_data.columns == "label"]

    for fold, (train_index, valid_index) in enumerate(skf.split(X, y)):
        train_data.loc[valid_index, "fold"] = fold

    print(train_data.groupby("fold")["label"].value_counts())
    display(train_data.head())
    return train_data

def pre_processing_text(text):
    text = text.replace('\n', ' ')
    typos = language_tool.check(text) # typo is a list
    # Check how many typos
    #if len(typos) > 0:
    #print(f"The number of typos = {len(typos)}\n {typos}")
    text = language_tool.correct(text)
    return text

# Run pre-processing texts in parallel
def parallel_pre_processing_text(texts):
    print(f"Total number of texts {len(texts)}")
    results = []
    # run 'pre_processing' fucntions in the process pool
    with ThreadPoolExecutor(4) as executor:
        # results = list(tqdm(executor.map(pre_processing_text, texts)))
        # send in the tasks
        futures = [executor.submit(pre_processing_text, text) for text in texts]
        # wait for all tasks to complete
        for future in futures:
            results.append(future.result())
            if len(results) % 100 == 0:
                print(f"Finished {len(results)} / {len(texts)}\n", end='', flush=True)
    # wait for all tasks to complete
    print("results", len(results))
    return results
    
    
def load_train_data():
    train_df = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv", sep=',')
    train_prompts_df = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv", sep=',')

    # rename column generated to label and remove used 'id' and 'prompt_id' columns
    # Label: 1 indicates generated texts (by LLMs) 
    train_df = train_df.rename(columns={'generated': 'label'})
    train_df = train_df.reset_index(drop=True)
    train_df = train_df.drop(['id', 'prompt_id'], axis=1)
#     print("Start processing training data's text")
#     start = time.time()
#     # Clear text in both train and test dataset
#     train_df['text'] = train_df['text'].progress_apply(lambda text: pre_processing_text(text))
#     display(train_df.head())
#     print(f"Correct the training data's texts with {time.time() - start : .1f} seconds")
    
    # Include external data
    external_df = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')
    # We only need 'text' and 'label' columns
    external_df = external_df[["text", "label"]]
    external_df["label"] = 1
    
#     print("Start processing external data's texts")
#     start = time.time()
#     external_df['text'] = parallel_pre_processing_text(external_df['text'].to_list())
#     print(f"Correct the external data's texts with {time.time() - start : .1f} seconds")
#     # external_df['text'] = external_df['text'].map(lambda text: pre_processing_text(text))
#     display(external_df.head())
#     external_df.to_csv('train_v2_drcat_02_fixed.csv', index=False)
    # Merge train and external data into train_data
    train_data = pd.concat([train_df, external_df])
    train_data.reset_index(inplace=True, drop=True)
    # print(f"Train data has shape: {train_data.shape}")
    print(f"Train data {train_data.value_counts('label')}") # 1: generated texts 0: human texts
    return train_data

In [4]:
load_train_data()

Train data label
1    44871
0     1375
Name: count, dtype: int64


Unnamed: 0,text,label
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
46241,"Dear Senator,\n\nI am writing to you today to ...",1
46242,"Dear Senator,\n\nI am writing to you today to ...",1
46243,"Dear Senator,\n\nI am writing to you today to ...",1
46244,"Dear Senator,\n\nI am writing to you today to ...",1


# Load pretrained LLM model

In [5]:
# Load the pretrained model and add an extra layer with PEFT library for fine-tuning
def load_model(fold):
    TARGET_MODEL = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"
    # TARGET_MODEL = "/kaggle/input/mistral-7b-v0-1/Mistral-7B-v0.1"
    # LoRA: Low-Rank Adaptation of Large Language Models
    peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        target_modules=[
            "q_proj",
            "v_proj"
        ],
    )
    # Enable GPU to run the model with 4bit
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    # Load the model
    base_model = LlamaForSequenceClassification.from_pretrained(TARGET_MODEL,
                                                                num_labels=2, # label is 0 or 1
                                                                quantization_config=bnb_config,                                                                 
                                                                device_map="auto")
    base_model.config.pretraining_tp = 1 # 1 is 7b
    base_model.config.pad_token_id = tokenizer.pad_token_id
    
    if IS_TRAIN:
        # Parameter-Efficient Fine-Tuning (PEFT) methods enable efficient adaptation of 
        # pre-trained language models (PLMs) to various downstream applications 
        # without fine-tuning all the model's parameters. 
        # https://github.com/huggingface/peft
        model = get_peft_model(base_model, peft_config)
    else:
        OUTPUT_DIR = f"/kaggle/input/mistral-7b-v0-for-llm-detecting-competition/mistral_7b_fold{fold}"
        # OUTPUT_DIR = f"/kaggle/working/mistral_7b_fold{fold}"
        # Load the pretrained model with PEFT
        model = PeftModel.from_pretrained(base_model, str(OUTPUT_DIR))
    
    model.print_trainable_parameters() # Display the trainable parameters
    
    return model, tokenizer

# Train the LLM model

In [6]:
def preprocess_function(examples, tokenizer, max_length=512):
    examples["text"] = list(map(lambda text: pre_processing_text(text), examples["text"]))
    return tokenizer(examples["text"], truncation=True, max_length=max_length, padding=True)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy_val = accuracy_score(labels, predictions)
    roc_auc_val = roc_auc_score(labels, predictions)
    r = { "accuracy": accuracy_val,
          "roc_auc": roc_auc_val}
    # logging.debug(f'{r}')
    return r


def train_model_by_fold(fold):
    torch.cuda.empty_cache()
    gc.collect()
    print(f"Start training the fold {fold} model")
    # Create train and valid dataset for a fold
    fold_valid_df = train_data[train_data["fold"] == fold]
    fold_train_df = train_data[train_data["fold"] != fold]
    # Train the model with small (for debugging) or large samples
    if DEBUG:
        fold_train_df = fold_train_df.sample(frac =.05, random_state=SEED)
        fold_valid_df = fold_valid_df.sample(frac =.05, random_state=SEED)
    else:
        fold_train_df = fold_train_df.sample(frac =.3, random_state=SEED)
        fold_valid_df = fold_valid_df.sample(frac =.3, random_state=SEED)

    print(f'fold_train_df {fold_train_df.groupby("fold")["label"].value_counts()}')
    print(f'fold_valid_df {fold_valid_df.groupby("fold")["label"].value_counts()}')
    # create the dataset
    train_ds = Dataset.from_pandas(fold_train_df)
    valid_ds = Dataset.from_pandas(fold_valid_df)

    # Load the pretrained model and tokenizer
    model, tokenizer = load_model(fold)

    # Tokenize the train and valid dataset and pass tokenizer as function argument
    train_tokenized_ds = train_ds.map(preprocess_function, batched=True,
                                      fn_kwargs={"tokenizer": tokenizer})
    valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True,
                                      fn_kwargs={"tokenizer": tokenizer})
    # Create data collator with padding (padding to the longest sequence)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

    # Start training processing        
    TMP_DIR = Path(f"/kaggle/tmp/mistral_7b_fold{fold}/")
    TMP_DIR.mkdir(exist_ok=True, parents=True)

    STEPS = 5 if DEBUG else 20
    EPOCHS = 1 if DEBUG else 10
    BATCH_SIZE = 2
    training_args = TrainingArguments(output_dir=TMP_DIR,
                                      learning_rate=5e-5,
                                      per_device_train_batch_size=BATCH_SIZE,
                                      per_device_eval_batch_size=1,
                                      gradient_accumulation_steps=16,
                                      max_grad_norm=0.3,
                                      optim='paged_adamw_32bit',
                                      lr_scheduler_type="cosine",
                                      num_train_epochs=EPOCHS,
                                      weight_decay=0.01,
                                      evaluation_strategy="epoch",
                                      save_strategy="epoch",
                                      load_best_model_at_end=True,
                                      push_to_hub=False,
                                      warmup_steps=STEPS,
                                      eval_steps=STEPS,
                                      logging_steps=STEPS,
                                      report_to='none', # if DEBUG else 'wandb'
                                      log_level='warning', # 'warning' is default level 
                                     )


    # Create the trainer 
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_tokenized_ds,
                      eval_dataset=valid_tokenized_ds,
                      tokenizer=tokenizer,
                      data_collator=data_collator,
                      compute_metrics=compute_metrics)

    trainer.train()

    OUTPUT_DIR = Path(f"/kaggle/working/mistral_7b_fold{fold}/")
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
    # Save the fine-tuned model
    trainer.save_model(output_dir=str(OUTPUT_DIR))
    print(f"=== Finish the training for fold {fold} ===")
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

In [7]:
# Check if we need to fine-tune the LLM model
if IS_TRAIN:
    start = time.time()
    # Load train data
    train_data = load_train_data()
    # Cross validation with 5 fold
    train_data = cv_split(train_data)
    # Train the model  
    fold = 0
    train_model_by_fold(0)
    #     # Add multiple threads to run each fold model concurrently

    #with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
   #     futures = [executor.submit(train_model_by_fold, fold) for fold in range(2)]
   #     # wait for all tasks to complete
    #    wait(futures)
    #    print('All training tasks are done!')
    
    #for idx, fold in enumerate(range(N_FOLD)):
    sys.exit(f"Training time of fold {fold} = {time.time() - start: .1f} seconds")

# Infer the testing data

In [8]:
import concurrent
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait
from scipy.special import expit as sigmoid
# Load test data
test_df = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv", sep=',')
test_df = test_df.rename(columns={'generated': 'label'})
test_df['text'] = test_df['text'].progress_apply(lambda text: pre_processing_text(text))
# print(f'test_df.shape: {test_df.shape}')
# test_df.head(3)

def clear_memory():
    torch.cuda.empty_cache()
    gc.collect()

# Sigmoid activation function can map 'x' between 0 and 1
def sigmoid(x):
    return 1 / (1 + np.exp(-x)) 

def predict_result_by_fold(fold):
    clear_memory()
    print(f"=== Start prediction with {fold} ===")
    model, tokenizer = load_model(fold) 
    # Load the test dataframe as dataset
    test_ds = Dataset.from_pandas(test_df)
    test_tokenized_ds = test_ds.map(preprocess_function, batched=True,
                                    fn_kwargs={"tokenizer": tokenizer})
    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, 
                                            padding="longest")
    # Create the trainer
    trainer = Trainer(model=model,
                      tokenizer=tokenizer,
                      data_collator=data_collator)
    pred_output = trainer.predict(test_tokenized_ds)
    logits = pred_output.predictions
    # Apply sigmoid to 
    probs = sigmoid(logits[:, 1])
    print(f"fold = {fold} probs = {probs}")
    global predictions
    for i, prob in enumerate(probs):
        predictions[i].append(prob)  
    # Clear memory
    del model, trainer, tokenizer, test_ds, test_tokenized_ds, data_collator
    clear_memory()
    
def predict_result():
    global predictions
    predictions = [[] for i in range(len(test_df))]
    start = time.time()
    print(f"=== Begin prediction  ===")

    #for fold in range(N_FOLD):
    #    predict_result_by_fold(fold)
    fold = 0
    predict_result_by_fold(fold)
    print(f"Finish prediction in {time.time() - start: .1f} seconds")

    return predictions

100%|██████████| 3/3 [00:06<00:00,  2.19s/it]


In [9]:
predictions = predict_result()
probs = [np.mean(pred) for pred in predictions] 
print(probs)    

loading file tokenizer.model
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file /kaggle/input/mistral/pytorch/7b-v0.1-hf/1/config.json
You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Model config LlamaConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.33.0",
  "use_cache": true,
  "vocab_size": 32000

=== Begin prediction  ===
=== Start prediction with 0 ===


Detected 4-bit loading: activating 4-bit loading for this model


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at /kaggle/input/mistral/pytorch/7b-v0.1-hf/1 were not used when initializing LlamaForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/mistral/pytorch/7b-v0.1-hf/1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 16,384 || all params: 7,114,084,352 || trainable%: 0.00023030370725635164


  0%|          | 0/1 [00:00<?, ?ba/s]

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
The model is quantized. To train this model you need to add additional modules inside the model such as adapters using `peft` library and freeze the model weights. Please check the exam

fold = 0 probs = [0.999 0.998 0.999]
Finish prediction in  167.3 seconds
[0.999, 0.998, 0.999]


In [10]:
sub = pd.DataFrame()
sub['id'] = test_df['id']
sub['generated'] = probs
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,generated
0,0000aaaa,0.999023
1,1111bbbb,0.998047
2,2222cccc,0.999023


In [11]:
# !ls -alh /kaggle/working/
# !zip -r result.zip /kaggle/working
# from IPython.display import FileLink
# FileLink(r'result.zip')