# Instruction-tuning LLMs for Native Language Identification on VESPA

This notebook contains the code for instruction-tuning open-source large language models on the task of Native Language Identification. The code is heavily inspired by https://github.com/unslothai/unsloth and their example notebooks on fine-tuning LLMs. We will be using the Unsloth library to perform 4-bit QLoRA fine-tuning on a VESPA training set. We then evaluate the fine-tuned LLM on a small VESPA test set. We recommend running this notebook in Google Colaboratory to speed up the fine-tuning process.



In [None]:
# Install packages
# NOTE(review): the Unsloth extra is tagged "cu121-ampere-torch230"; presumably this must match the
# runtime's CUDA / GPU generation / PyTorch build — confirm before running on a different machine.
# NOTE(review): none of the three installs is version-pinned.
!pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"
!pip install tiktoken
!pip install imbalanced-learn

In [None]:
# Mount Google Drive if using Google Colab
# The dataset and result paths used later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from collections import Counter, defaultdict
from typing import Literal

# Keep unsloth ahead of transformers/trl, as in the original import order
from unsloth import FastLanguageModel
from datasets import load_dataset, Dataset
from pydantic import BaseModel, ValidationError, Field
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, accuracy_score
import pandas as pd
import numpy as np
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
import tiktoken
from imblearn.under_sampling import RandomUnderSampler

# Sequence-length limit handed to FastLanguageModel.from_pretrained in the "Load model" cell.
# NOTE: the training cell later rebinds this global to 8100.
max_seq_length = 8000
# Passed as the `dtype` argument when loading the model.
dtype = torch.bfloat16
load_in_4bit = True # we use 4bit quantization

# 4bit pre quantized models supported by Unsloth
# (reference list only; the checkpoint actually loaded is selected through `model_name`)
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct"
]


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Load model
# `model_name` selects the base checkpoint; `run_name` is the folder name under which the
# fine-tuned adapter and tokenizer are saved after training and reloaded for evaluation.
model_name = "unsloth/mistral-7b-bnb-4bit"
run_name = 'finetuned_mistral_undersampled'


# The tokenizer returned here is the global `tokenizer` that the token-counting,
# truncation and prompt-formatting helpers below rely on.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
  )

# Add LoRA adapters
# NOTE: the training cell loads the model and attaches adapters again before training.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

==((====))==  Unsloth 2024.8: Fast Mistral patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
def num_tokens_from_string(string: str) -> int:
  """
  Count the tokens the loaded tokenizer produces for a string.
  :param string: text to tokenize
  :type string: str
  :returns: number of token ids, including any special tokens the tokenizer adds
  """
  # Bug fix: tokenize the `string` argument. The previous body read a global named `text`,
  # which only worked when the caller's variable happened to have that name.
  # Plain id lists are enough for counting, so no tensor / GPU transfer is needed.
  return len(tokenizer(string).input_ids)

def truncate_text(text, max_seq_length):
  """
  Truncate a text to at most `max_seq_length` tokens of the loaded tokenizer.
  :param text: text to truncate
  :param max_seq_length: maximum number of tokens to keep
  :type text: str
  :type max_seq_length: int
  :returns: the kept tokens decoded back to a string, with special tokens removed
  """
  # Tokenize with truncation. Plain id lists are sufficient here, so the helper no longer
  # builds tensors or moves them to the GPU.
  token_ids = tokenizer(text, max_length = max_seq_length, truncation=True).input_ids
  # Decode the kept ids back to text
  return tokenizer.decode(token_ids, skip_special_tokens=True)

# Data Prep
We use the VESPA training set to perform instruction-tuning with the prompts used in the closed-set experiments, in which we provide the set of possible L1s in the prompt.

In [None]:
# Training prompt template with two `{}` slots: the learner text (under "### Input:")
# and the gold label (under "### Response:").
# NOTE(review): "Valid output formats" lists four classes and omits Class: "FRE", although FRE is
# one of the five labels — confirm whether that is intentional.
# NOTE(review): the `\n\n` inside this non-raw string becomes two real newlines, and the task
# sentence says "the text above" although the input follows it.
alpaca_prompt = '''
### Instruction:
You are a forensic linguistics expert that reads English texts written by non-native authors to classify the native language of the author as one of:
"DUT": Dutch
"FRE": French
"NOR": Norwegian
"SPA": Spanish
"SWE": Swedish
Use clues such as spelling errors, word choice, syntactic patterns, and grammatical errors to decide on the native language of the author.\n\n

DO NOT USE ANY OTHER CLASS.
IMPORTANT: Do not classify any input as "ENG" (English). English is an invalid choice.

Valid output formats:
Class: "DUT"
Class: "SWE"
Class: "NOR"
Class: "SPA"

Classify the text above as one of DUT, FRE, NOR, SPA, or SWE. Do not output any other class - do NOT choose "ENG" (English). What is the closest native language of the author of this English text from the given list?

### Input:
{}

### Response:
{}'''



EOS_TOKEN = tokenizer.eos_token # appended to every training example
def formatting_prompts_func(examples):
    """
    Render a batch of examples into full training prompts.
    :param examples: batch with a "text" column (learner texts) and a "language" column (gold labels)
    :returns: dict with a single "text" key holding one rendered prompt per example
    """
    # The trailing EOS_TOKEN is required, otherwise generation will go on forever.
    rendered = [
        alpaca_prompt.format(essay, gold_label) + EOS_TOKEN
        for essay, gold_label in zip(examples["text"], examples["language"])
    ]
    return { "text" : rendered, }

# Path of the training CSV; the code below reads its 'text' and 'language' columns.
vespa_train = "/content/drive/MyDrive/thesis_NLI/VESPA/VESPA-train.csv"
# Token budget for the learner text alone (the prompt template and label come on top of this).
# NOTE(review): the training cell uses max_seq_length = 8100; confirm that template + a
# 7900-token text + label still fit, otherwise the end of the example could be cut off.
text_max_seq_length = 7900

# random undersampling of training data
# ("not minority" resamples every class except the smallest one; random_state fixes the draw)
rus = RandomUnderSampler(sampling_strategy="not minority", random_state=0)
df = pd.read_csv(vespa_train)
X = df.drop('language',axis=1)
y = df['language'].tolist()
X_resampled, y_resampled = rus.fit_resample(X, y)
print(len(y_resampled))
# print(X_resampled['text'][0])
# Truncate every sampled text to the token budget and record its token count.
truncated_texts = []
lengths = []
for text in X_resampled['text'].tolist():
  text = truncate_text(text, text_max_seq_length)
  truncated_texts.append(text)
  length = num_tokens_from_string(text)
  lengths.append(length)
sample_df = pd.DataFrame({'text': truncated_texts, 'language': y_resampled})
# Longest text in tokens after truncation
print(max(lengths))

dataset = Dataset.from_pandas(sample_df)
# Class distribution after undersampling
print(Counter(y_resampled))

# uncomment the following section for full dataset
# from datasets import load_dataset
# dataset = load_dataset("csv", data_files=vespa_train, split='train')
# print(f'Number of samples: {len(dataset)}')
# # print(dataset[0])
# texts = dataset['text']
# truncated_texts = []
# for text in texts:
#   text = truncate_text(text, text_max_seq_length)
#   truncated_texts.append(text)
# df = pd.DataFrame({'text': truncated_texts, 'language': dataset['language']})
# dataset = Dataset.from_pandas(df)

# NOTE(review): `texts` and `labels` are not read again in the visible part of the notebook.
texts = truncated_texts
labels = dataset ['language']
# formatting_prompts_func returns a "text" column, so after this map the column holds the
# full rendered prompt (instruction + input + label + EOS) instead of the bare learner text.
dataset = dataset.map(formatting_prompts_func, batched = True,)
print(dataset[:1])


165
7900
Counter({'DUT': 33, 'FRE': 33, 'NOR': 33, 'SPA': 33, 'SWE': 33})


Map:   0%|          | 0/165 [00:00<?, ? examples/s]

{'text': ['\n### Instruction:\nYou are a forensic linguistics expert that reads English texts written by non-native authors to classify the native language of the author as one of:\n"DUT": Dutch\n"FRE": French\n"NOR": Norwegian\n"SPA": Spanish\n"SWE": Swedish\nUse clues such as spelling errors, word choice, syntactic patterns, and grammatical errors to decide on the native language of the author.\n\n\n\nDO NOT USE ANY OTHER CLASS.\nIMPORTANT: Do not classify any input as "ENG" (English). English is an invalid choice.\n\nValid output formats:\nClass: "DUT"\nClass: "SWE"\nClass: "NOR"\nClass: "SPA"\n\nClassify the text above as one of DUT, FRE, NOR, SPA, or SWE. Do not output any other class - do NOT choose "ENG" (English). What is the closest native language of the author of this English text from the given list?\n\n### Input:\nUndergraduate writing has recently become an essential topic in the field of English for Academic Purposes. An important aspect of undergraduate writing is the s

# Train LLM on training set

In [None]:
#@title Show current memory stats
# Report the GPU's name and total memory, and the peak memory reserved so far, in GB (3 decimals).
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
4.5 GB of memory reserved.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

max_seq_length = 8100

# Seeds used for the LoRA adapter initialisation; add more values (e.g. three) for repeated runs.
lora_seeds = [100]
results_dir = "/content/drive/MyDrive/thesis_NLI/VESPA_results"

for ind, rs in enumerate(lora_seeds):
  # Fix: reload the base checkpoint chosen via `model_name`. The cell previously hard-coded
  # "unsloth/gemma-7b-bnb-4bit", which contradicted `run_name` and the tokenizer whose
  # EOS token was used to build `dataset`.
  model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
  )
  # Attach fresh LoRA adapters; `random_state` is what varies between runs.
  model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = rs,
    use_rslora = False,
    loftq_config = None,
  )
  trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 4, # The batch size per GPU for training
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs=3,
        learning_rate = 1e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        # NOTE(review): the trainer seed is fixed; only the LoRA `random_state` changes per run.
        seed = 3407,
        output_dir = "outputs",
    ),
  )

  # Fix: train and save inside the loop so that every seed yields its own saved adapter.
  # Previously these statements ran once after the loop, i.e. only for the last model.
  trainer_stats = trainer.train()
  # A single run keeps the plain `run_name` folder (the evaluation cell loads it by that name);
  # additional seeds get a suffix so they do not overwrite each other.
  save_dir = f"{results_dir}/{run_name}" if len(lora_seeds) == 1 else f"{results_dir}/{run_name}_seed{rs}"
  model.save_pretrained(save_dir) # Local saving
  tokenizer.save_pretrained(save_dir)


Map (num_proc=2):   0%|          | 0/165 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 30
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.0583
2,2.1368
3,2.1191
4,2.0874
5,1.9655
6,2.0556
7,1.9769
8,2.0043
9,1.9614
10,1.9033


('/content/drive/MyDrive/thesis_NLI/VESPA_results/finetuned_mistral_undersampled/tokenizer_config.json',
 '/content/drive/MyDrive/thesis_NLI/VESPA_results/finetuned_mistral_undersampled/special_tokens_map.json',
 '/content/drive/MyDrive/thesis_NLI/VESPA_results/finetuned_mistral_undersampled/tokenizer.model',
 '/content/drive/MyDrive/thesis_NLI/VESPA_results/finetuned_mistral_undersampled/added_tokens.json',
 '/content/drive/MyDrive/thesis_NLI/VESPA_results/finetuned_mistral_undersampled/tokenizer.json')

# Evaluate on test set

After fine-tuning, we evaluate the fine-tuned model on a small VESPA test set. We prompt the model to predict the native language for L2 texts in the VESPA test set.

## Defining the functions for running inference

In [None]:
def generate_text(prompt):
  """
  Run the loaded model on a prompt and return the decoded sequence.
  Callers strip the prompt part from the returned string with clean_output.
  :param prompt: input prompt
  :type prompt: str
  :returns: decoded text of the first returned sequence, special tokens removed
  """
  # Encode the prompt and move the ids to the GPU
  encoded = tokenizer(prompt, max_length = max_seq_length, return_tensors="pt")
  prompt_ids = encoded.input_ids.to("cuda")
  # At most 10 new tokens are generated; no temperature is set here
  generated = model.generate(
      prompt_ids,
      max_new_tokens=10,
      pad_token_id=tokenizer.eos_token_id,
  )
  # Decode the first (only) sequence
  return tokenizer.decode(generated[0], skip_special_tokens=True)

def clean_output(output, eos_token, output_only=False):
  """
  Extract the model's answer from a decoded generation and normalise it.
  The answer is everything after the last occurrence of `eos_token` in `output`.
  Unless `output_only` is set, the answer is then reduced to a JSON string of the form
  {"native_lang":"<LABEL>"} when a label can be recognised.
  :param output: model generated output (prompt followed by the generated text)
  :param eos_token: marker string to split the model output on
  :param output_only, default False: if True, return only the stripped answer without any normalisation. mostly used for debugging
  :type output: str
  :type eos_token: str
  :returns: the normalised answer; '{"native_lang": "unknown"}' for answers longer than 50 characters; the raw answer if nothing is recognised
  """
  final_output = output.split(eos_token)[-1].strip()
  if output_only:
    return final_output

  # Language names / alternative codes mapped to the three-letter class labels.
  language_class_dict = {'arabic': 'ARA',
                          'bulgarian': 'BUL',
                           'chinese': 'CHI',
                           'czech': 'CZE',
                           "french": "FRE",
                           "german": "GER",
                           "hindi": "HIN",
                           "italian": "ITA",
                           "japanese": "JPN",
                           "korean": 'KOR',
                           "spanish": "SPA",
                           "telugu": "TEL",
                           "turkish": "TUR",
                           "russian": "RUS",
                           "english": "ENG",
                          'sp': 'SPA',
                          'itl': 'ITA',
                         'deu': 'GER',
                         'norwegian': 'NOR',
                         'dutch': 'DUT',
                         'swedish': 'SWE'
                           }
  # JSON-like answer: take the value after the last ':' of each piece mentioning native_lang
  # (the last such piece wins).
  # Bug fix: only the extracted answer is inspected. Splitting the full `output` let a
  # {"native_lang": ...} example inside the prompt be parsed as if it were the prediction.
  if '}' in final_output:
    for piece in final_output.split("}"):
      if 'native_lang' in piece:
        label = piece.split(":")[-1].strip().replace('"', '').replace('\n', '')
        final_output = '{"native_lang":"' + label + '"}'
  # 'Class: "XXX"' style answer: keep what follows the last 'Class:' without quotes, newlines, dots.
  if 'Class:' in final_output:
    label = final_output.split("Class:")[-1].strip()
    final_output = label.replace('"', '').replace('\n', '').replace('.', '')
  if len(final_output)>50:
    # Too long to be a bare label
    final_output = '{"native_lang": "unknown"}'
  else:
    # Map any recognised language name or code to its label. `final_output` is updated inside
    # the loop on purpose (unchanged behaviour): later entries are checked against the new value.
    for lang, label in language_class_dict.items():
      if lang in final_output.lower() or label in final_output:
        final_output = '{"native_lang":"' + label + '"}'
  return final_output

def classify(texts, goldlabels, filter_token):
  '''
  Predict the native language for every text, re-prompting when the answer is invalid.
  For each text up to 5 generations are attempted: the first with the plain prompt, later
  ones with a retry hint added. If no valid non-English label is obtained, 'other' is recorded.
  The answer after cleaning and a running accuracy are printed for every text.
  :param texts: list of texts
  :param goldlabels: list of gold labels (only used for the running accuracy)
  :param filter_token: token to get cleaned output
  :type texts: list
  :type goldlabels: list
  :returns predictions: a list of model predictions
  '''
  sys_prompt = prompt_VESPA
  prompt_retry = prompt_retry_VESPA
  NLI_prediction = NLI_prediction_VESPA
  main_task_prompt = main_task_prompt_VESPA
  max_attempts = 5 # generations per text before giving up with 'other'

  def build_prompt(text, extra_instruction=None):
    # Same layout as before; an optional retry hint is placed right after the task sentence.
    task = main_task_prompt if extra_instruction is None else main_task_prompt + '\n' + extra_instruction
    return "Instruction: " + sys_prompt + '\n\n' + task + '\n\nInput: ' + text + "\nResponse:"

  # NOTE(review): this inference prompt layout differs from the `alpaca_prompt` template used
  # for fine-tuning — confirm that the mismatch is intended.
  predictions = []
  for count, (text, _gold) in enumerate(zip(texts, goldlabels), start=1):
    text = truncate_text(text, 7900)
    # Bug fix: the initial prompt is built once, before the retry loop. It used to be rebuilt at
    # the top of every attempt, which overwrote the retry prompts so they were never sent.
    fullprompt = build_prompt(text)
    predicted_native_lang = 'other'
    for _attempt in range(max_attempts):
      output = generate_text(fullprompt)
      output_only = clean_output(output, filter_token, output_only=True)
      print(output_only)
      final_output = clean_output(output, filter_token)
      try:
        validated_response = NLI_prediction.model_validate_json(final_output) # use class to validate json string
      except ValidationError:
        # Answer is not one of the allowed labels: retry with the generic hint
        fullprompt = build_prompt(text, prompt_retry)
        continue
      candidate = validated_response.model_dump()['native_lang']
      if candidate == "ENG":
        # Bug fix: actually re-prompt on an English prediction. The old code fell through and
        # recorded "ENG" as the prediction right away.
        fullprompt = build_prompt(text, prompt_retry_eng)
        continue
      predicted_native_lang = candidate
      break
    response_dict = {'native_lang': predicted_native_lang}
    predictions.append(predicted_native_lang)
    print(count, response_dict)
    print('Accuracy score:', "{:.2f}".format(accuracy_score(goldlabels[0:count], predictions)))
  return predictions

## Defining the prompts

In [None]:
# Pydantic schema that classify() uses to validate the model's JSON answer.
# 'ENG' is accepted here so that classify() can detect an English prediction explicitly.
class NLI_prediction_VESPA(BaseModel):
  native_lang: Literal['DUT', 'FRE', 'NOR', 'SWE', 'SPA', 'ENG']

# The five target classes; also used as the label order of the confusion matrix.
all_labels_VESPA = ['DUT', 'FRE', 'NOR', 'SWE', 'SPA']

# Test set: texts and their gold labels.
# NOTE: this rebinds the global `dataset` (until now the training Dataset) to a pandas DataFrame.
vespa_dataset = f"/content/drive/MyDrive/thesis_NLI/VESPA/VESPA-test.csv"
dataset = pd.read_csv(vespa_dataset)
test_texts = dataset['text'].tolist()
test_labels = dataset ['language'].tolist()
# System prompt placed after "Instruction: " by classify().
# NOTE(review): "Valid output formats" omits Class: "FRE", and the prompt asks for both
# 'Class: "XXX"' and JSON output — confirm which format is intended.
prompt_VESPA = '''
  You are a forensic linguistics expert that reads English texts written by non-native authors to classify the native language of the author as one of:
  "DUT": Dutch
  "FRE": French
  "NOR": Norwegian
  "SPA": Spanish
  "SWE": Swedish
  Use clues such as spelling errors, word choice, syntactic patterns, and grammatical errors to decide on the native language of the author.\n\n

  DO NOT USE ANY OTHER CLASS.
  IMPORTANT: Do not classify any input as "ENG" (English). English is an invalid choice.

  Valid output formats:
  Class: "DUT"
  Class: "SWE"
  Class: "NOR"
  Class: "SPA"

  You ONLY respond in JSON files.
  The expected output from you has to be:"json {"native_lang": The chosen class, DUT, FRE, NOR, SPA, or SWE}"
'''

# Task sentence that classify() appends after the system prompt.
main_task_prompt_VESPA = '''Classify the text above as one of DUT, FRE, NOR, SPA, or SWE. Do not output any other class - do NOT choose "ENG" (English). What is the closest native language of the author of this English text from the given list?
'''

# Retry hint for the case that the model predicted "ENG".
prompt_retry_eng = '''
  You previously mistakenly predicted this text as "ENG" (English). The class is NOT English.
  Please classify the native language of the author of the text again.
  '''

# Retry hint for the case that the answer failed validation.
prompt_retry_VESPA = '''
  Your classification is not in the list of possible languages.
  Please try again and choose only one of the following classes:
  DUT, FRE, NOR, SPA, or SWE
'''

## NLI classification using fine-tuned LLMs

In [None]:
# Names of the fine-tuned runs.
# Fix: 'finetuned_gemma_7b' and 'finetuned_phi3' were fused into one string by a missing quote pair.
all_models = ['finetuned_llama2_7b', 'finetuned_llama3_8b', 'finetuned_mistral_7b', 'finetuned_gemma_7b', 'finetuned_phi3']

In [None]:
from unsloth import FastLanguageModel

accuracies = []
%cd /content/drive/MyDrive/thesis_NLI/VESPA_results
runs = 1
# for count in range(runs):
count=0
model, tokenizer = FastLanguageModel.from_pretrained(
      model_name = run_name, # YOUR MODEL YOU USED FOR TRAINING
      max_seq_length = max_seq_length,
      dtype = dtype,
      load_in_4bit = load_in_4bit,
  )
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

eos_token = 'Response:'
predictions = classify(test_texts, test_labels, eos_token)
accuracy = accuracy_score(test_labels, predictions)
accuracy2 = "{:.2f}".format(accuracy*100)

print(f'-------------Run: {count}')
print(f'-------------Accuracy: {accuracy2}')
accuracy=float(accuracy*100)
accuracies.append(accuracy)
cm = confusion_matrix(test_labels, predictions, labels=all_labels_VESPA)
cm_display = ConfusionMatrixDisplay(cm, display_labels=all_labels_VESPA).plot()
cm_display.figure_.savefig(f'/content/drive/MyDrive/thesis_NLI/VESPA_results/{run_name}.png')
df = pd.read_csv(vespa_dataset)
column_name = f'{run_name}'
if column_name in df.columns.values.tolist():
  df.pop(column_name)
num_columns = len(df.columns)
df.insert(num_columns, column_name, predictions)
    # df.head()
df.to_csv(vespa_dataset, index=False)

avg_acc=sum(accuracies)/runs
print(accuracies)
print(f"Average acc: {avg_acc}")
print(f"Standard deviation: {np.std(accuracies)}") #