# Running Open-Source LLMs for Native Language Identification



### Loading Required Libraries

In [None]:
import os
from random import randrange
from functools import partial
import torch
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
import pandas as pd
from pydantic import BaseModel, ValidationError, Field # used for validation of the output
from typing import Literal
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from unsloth import FastLanguageModel
import numpy as np
# from openai import OpenAI

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!huggingface-cli login # make use of your own huggingface token here
# Llama models require you to fill out an agreement form


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: read).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your term

In [None]:
def load_model(model_name, bnb_config):
    """
    Load a quantized causal language model together with its tokenizer.

    :param model_name: Hugging Face model identifier
    :param bnb_config: Bitsandbytes quantization configuration
    :returns: tuple ``(model, tokenizer)``
    """
    # Same memory ceiling for every visible GPU (empty mapping when there is none)
    per_gpu_budget = f'{40960}MB'
    memory_map = dict.fromkeys(range(torch.cuda.device_count()), per_gpu_budget)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # let the loader place the layers on the available devices
        max_memory=memory_map,
        torch_dtype=torch.float16,
    )

    # The tokenizer is fetched with the stored Hugging Face login token.
    # NOTE(review): newer transformers releases prefer `token=` over
    # `use_auth_token=` -- confirm against the installed version.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_auth_token=True,
        trust_remote_code=True,
    )

    # Reuse the end-of-sequence token for padding
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

### Loading Dataset

In [None]:
# Filepaths to datasets.
# BASEPATH already ends with "/", so the relative parts must not start with one
# (the VESPA path previously contained a double slash).
BASEPATH = "/content/drive/MyDrive/thesis_NLI/"
toefl_dataset = f"{BASEPATH}TOEFL11/toefl_results.csv"
icle_dataset = f"{BASEPATH}ICLE-NLI-results.csv"
vespa_dataset = f"{BASEPATH}VESPA/VESPA-test.csv"

In [None]:
# Read the evaluation corpus; point this at icle_dataset / toefl_dataset to
# evaluate one of the other corpora instead of VESPA.
dataset = pd.read_csv(vespa_dataset)
print(f'Number of samples: {len(dataset)}')
# Essays and their gold native-language labels, in file order
texts, labels = (dataset[column].tolist() for column in ('text', 'language'))

Number of samples: 50


In [None]:
# Validation schemas for the closed-class setting: one pydantic model per corpus.
# Each schema accepts the corpus' own label set plus 'ENG'. 'ENG' is absent from
# the matching all_labels_* list, so a model can validate as 'ENG' without it
# being one of the corpus classes.
class NLI_prediction_TOEFL(BaseModel):
  # predicted native language for a TOEFL text
  native_lang: Literal['ARA', 'CHI', 'FRE', 'GER', 'HIN', 'ITA', 'JPN', 'KOR', 'SPA', 'TEL', 'TUR', 'ENG']

# Gold label set of the TOEFL corpus (no 'ENG')
all_labels_TOEFL = ['ARA', 'CHI', 'FRE', 'GER', 'HIN', 'ITA', 'JPN', 'KOR', 'SPA', 'TEL', 'TUR']

class NLI_prediction_ICLE(BaseModel):
  # predicted native language for an ICLE text
  native_lang: Literal['BUL', 'CHI', 'CZE', 'FRE', 'JPN', 'RUS', 'SPA', 'ENG']

# Gold label set of the ICLE corpus (no 'ENG')
all_labels_ICLE = ['BUL', 'CHI', 'CZE', 'FRE', 'JPN', 'RUS', 'SPA']

class NLI_prediction_VESPA(BaseModel):
  # predicted native language for a VESPA text
  native_lang: Literal['DUT', 'FRE', 'NOR', 'SWE', 'SPA', 'ENG']

# Gold label set of the VESPA corpus (no 'ENG')
all_labels_VESPA = ['DUT', 'FRE', 'NOR', 'SWE', 'SPA']

# Label sets for the open-class setting: the corpus labels plus 'ENG' and a
# catch-all 'other' bucket.
all_labels_TOEFL_open = ['ARA', 'CHI', 'FRE', 'GER', 'HIN', 'ITA', 'JPN', 'KOR', 'SPA', 'TEL', 'TUR', 'ENG', 'other']
all_labels_ICLE_open = ['BUL', 'CHI', 'CZE', 'FRE', 'JPN', 'RUS', 'SPA', 'ENG', 'other']
all_labels_VESPA_open = ['DUT', 'FRE', 'NOR', 'SWE', 'SPA', 'ENG', 'other']

# Prompting LLMs

## Defining the prompts for closed-class

In [None]:
# Closed-class system prompts, one per corpus. Each lists the allowed classes,
# forbids "ENG" and asks for a JSON answer with a "native_lang" key.
# The strings are not raw, so the `\n` sequences inside them are real newlines.
# NOTE(review): the "Valid output formats" section shows `Class: "..."` lines
# while the last instruction demands JSON -- confirm this mix is intended.

# System prompt for the 11 TOEFL classes
prompt_TOEFL = '''
  You are a forensic linguistics expert that reads English texts written by non-native authors to classify the native language of the author as one of:
  "ARA": Arabic
  "CHI": Chinese
  "FRE": French
  "GER": German
  "HIN": Hindi
  "ITA": Italian
  "JPN": Japanese
  "KOR": Korean
  "SPA": Spanish
  "TEL": Telugu
  "TUR": Turkish
  Use clues such as spelling errors, word choice, syntactic patterns, and grammatical errors to decide on the native language of the author.\n

  DO NOT USE ANY OTHER CLASS.
  IMPORTANT: Do not classify any input as "ENG" (English). English is an invalid choice.

  Valid output formats:
  Class: "ARA"
  Class: "CHI"
  Class: "FRE"
  Class: "GER"

  You ONLY respond in JSON files.
  The expected output from you is:"json {"native_lang": The chosen class, ARA, CHI, FRE, GER, HIN, ITA, JPN, KOR, SPA, TEL, or TUR}"
  '''

# System prompt for the 7 ICLE classes
prompt_ICLE = '''
  You are a forensic linguistics expert that reads English texts written by non-native authors to classify the native language of the author as one of:
  "BUL": Bulgarian
  "CHI": Chinese
  "CZE": Czech
  "FRE": French
  "JPN": Japanese
  "RUS": Russian
  "SPA": Spanish
  Use clues such as spelling errors, word choice, syntactic patterns, and grammatical errors to decide on the native language of the author.\n\n

  DO NOT USE ANY OTHER CLASS.
  IMPORTANT: Do not classify any input as "ENG" (English). English is an invalid choice.

  Valid output formats:
  Class: "BUL"
  Class: "CHI"
  Class: "CZE"
  Class: "SPA"

  You ONLY respond in JSON files.
  The expected output from you has to be:"json {"native_lang": The chosen class, BUL, CHI, CZE, FRE, JPN, RUS, or SPA}"
  '''

# System prompt for the 5 VESPA classes
prompt_VESPA = '''
  You are a forensic linguistics expert that reads English texts written by non-native authors to classify the native language of the author as one of:
  "DUT": Dutch
  "FRE": French
  "NOR": Norwegian
  "SPA": Spanish
  "SWE": Swedish
  Use clues such as spelling errors, word choice, syntactic patterns, and grammatical errors to decide on the native language of the author.\n\n

  DO NOT USE ANY OTHER CLASS.
  IMPORTANT: Do not classify any input as "ENG" (English). English is an invalid choice.

  Valid output formats:
  Class: "DUT"
  Class: "SWE"
  Class: "NOR"
  Class: "SPA"

  You ONLY respond in JSON files.
  The expected output from you has to be:"json {"native_lang": The chosen class, DUT, FRE, NOR, SPA, or SWE}"
'''

# Per-corpus task instructions; classify() puts the essay text after them.
# Only the VESPA variant additionally asks for exactly one language.
main_task_prompt_TOEFL = '''Classify the text below as one of ARA, CHI, FRE, GER, HIN, ITA, JPN, KOR, SPA, TEL, or TUR. Do not output any other class - do NOT choose "ENG" (English). What is the closest native language of the author of this English text from the given list?
'''

main_task_prompt_ICLE = '''Classify the text below as one of BUL, CHI, CZE, FRE, JPN, RUS, or SPA. Do not output any other class - do NOT choose "ENG" (English). What is the closest native language of the author of this English text from the given list?
'''

main_task_prompt_VESPA = '''Classify the text below as one of DUT, FRE, NOR, SPA, or SWE. Do not output any other class - do NOT choose "ENG" (English). What is the closest native language of the author of this English text from the given list? Provide ONE language even if you are unsure.
'''


# Follow-up user message sent when a closed-class answer validated as "ENG"
prompt_retry_eng = '''
  You previously mistakenly predicted this text as "ENG" (English). The class is NOT English.
  Please classify the native language of the author of the text again.
  '''

# Follow-up user messages (one per corpus) that restate the allowed classes
prompt_retry_TOEFL = '''
  Your classification is not in the list of possible languages.
  Please try again and choose only one of the following classes:
  ARA, CHI, FRE, GER, HIN, ITA, JPN, KOR, SPA, TEL, or TUR
  '''

prompt_retry_ICLE = '''
  Your classification is not in the list of possible languages.
  Please try again and choose only one of the following classes:
  BUL, CHI, CZE, FRE, JPN, RUS, or SPA
  '''

prompt_retry_VESPA = '''
  Your classification is not in the list of possible languages.
  Please try again and choose only one of the following classes:
  DUT, FRE, NOR, SPA, or SWE
'''

## Defining the prompts for open class

In [None]:
# Open-class validation schema: any string is accepted as the predicted language,
# so validation only fails when the answer is not parseable JSON of this shape.
class NLI_prediction_open(BaseModel):
  # free-form language name or code produced by the model
  native_lang: str

# JSON schema of the open-class answer, as a dict and as a pretty-printed string
nli_prediction_dict = NLI_prediction_open.model_json_schema()
nli_prediction_json = json.dumps(nli_prediction_dict, indent=2)

# Open-class system prompt: no list of candidate languages is given
prompt_open = '''
You are a forensic linguistics expert that reads texts written by non-native authors in order to identify their native language.

Analyze each text and identify the native language of the author.

Use clues such as spelling errors, word choice, syntactic patterns, and grammatical errors to decide.

You ONLY respond in JSON files.
The expected output from you has to be:

json {"native_lang": ""}
'''

# Open-class retry message: restates the required JSON format
prompt_retry_open = '''
Your previous classification was not in the correct format. Please only respond in the following JSON format:
json {"native_lang": ""}
'''

## Defining functions for prompting

In [None]:
def generate_text(prompt, max_new_tokens=60):
  """
  Generate a completion for the input prompt with the globally loaded model.

  Relies on the module-level `model` and `tokenizer` created by load_model().

  :param prompt: fully formatted input prompt (chat template already applied)
  :param max_new_tokens: upper bound on the number of generated tokens
  :type prompt: str
  :type max_new_tokens: int
  :returns: decoded prompt + completion (special tokens removed); callers use
            clean_output() to cut the prompt off again
  """
  # Tokenize the prompt (truncated to 8000 tokens) and move it to the model's
  # device instead of assuming "cuda"
  encoded = tokenizer(prompt, max_length = 8000, truncation = True, return_tensors="pt").to(model.device)
  # Pass the attention mask explicitly: pad and EOS token are identical here, so
  # the mask cannot be inferred reliably from the input ids
  outputs = model.generate(input_ids=encoded["input_ids"],
                           attention_mask=encoded["attention_mask"],
                           pad_token_id=tokenizer.eos_token_id,
                           max_new_tokens=max_new_tokens,
                           )
  # Decode the full sequence (prompt included)
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)

  return response

In [None]:
def clean_output(output, eos_token, output_only=False):
  """
  Strip the prompt from a decoded LLM output and, unless `output_only` is True,
  normalise the answer into a JSON string of the form {"native_lang":"XXX"}.

  Only the text after the last occurrence of `eos_token` is treated as the
  model's answer. All parsing is done on that answer; the prompt is never
  inspected, so labels that merely appear in the prompt cannot leak into the
  prediction.

  :param output: decoded LLM output (prompt + completion)
  :param eos_token: marker string that separates the prompt from the completion
  :param output_only: default False; if True, return the raw completion without
                      any label normalisation
  :type output: str
  :type eos_token: str
  :type output_only: bool
  :returns: the raw completion, or a normalised JSON string when a label could
            be recognised (otherwise the unmodified completion)
  """
  pure_output = output.split(eos_token)[-1].strip()
  final_output = pure_output
  if output_only==False: # whether to extract only json-formatted string in the output or not
    # Substring -> class code. Matching is a heuristic: keys are searched as
    # plain substrings, and the insertion order decides which one wins.
    language_class_dict = {'arabic': 'ARA',
                           'bulgarian': 'BUL',
                           'chinese': 'CHI',
                           'czech': 'CZE',
                           "french": "FRE",
                           "german": "GER",
                           "hindi": "HIN",
                           "italian": "ITA",
                           "japanese": "JPN",
                           "korean": 'KOR',
                           "spanish": "SPA",
                           "telugu": "TEL",
                           "turkish": "TUR",
                           "russian": "RUS",
                           #  "english": "ENG",
                           "itl": "ITA",
                           "deu": "GER",
                           "fra": 'FRE',
                           'dutch': 'DUT',
                           'norwegian': 'NOR',
                           'swedish': 'SWE',
                           }
    if '}' in pure_output:
      # JSON-like answer: take the value of the last "native_lang" entry
      for piece in pure_output.split("}"):
        if 'native_lang' in piece:
          label = piece.split(":")[-1].strip()
          label = label.replace('"', '').replace('\n', '')
          final_output = '{"native_lang":"' + label + '"}'
    if 'Class:**' in final_output:
      # Markdown-style answer ("**Class:** XXX"): keep what follows the marker
      label = pure_output.split("Class:**")[-1].strip()
      for unwanted in ('"', '\n', '.', '*'):
        label = label.replace(unwanted, '')
      final_output = label
    # Map language names / codes found in the answer to the class code.
    # final_output is rewritten on a hit, so later keys are tested against the
    # already normalised string.
    for lang, label in language_class_dict.items():
      if lang in final_output.lower():
        final_output = '{"native_lang":"' + label + '"}'
      elif label in final_output:
        final_output = '{"native_lang":"' + label + '"}'

  return final_output

## Classification functions

In [None]:
def classify(texts, goldlabels, dataset, filter_token, closedopen_setting="closed", openai_api=False):
  '''
  Prompt the loaded LLM once per text and collect the predicted native languages.

  Every answer is validated against a pydantic schema. If the answer cannot be
  validated, or is "ENG" in the closed setting, the model is re-prompted within
  the same conversation. After five failed attempts the prediction is 'other'.

  Relies on module-level state: `model`/`tokenizer` (or `client` and `model`
  when `openai_api` is True) and the prompt/schema constants defined above.

  :param texts: list of texts
  :param goldlabels: list of gold labels (only used for the running accuracy)
  :param dataset: "TOEFL", "ICLE" or "VESPA"; selects prompts and schema in the
                  closed setting
  :param filter_token: marker that separates prompt and completion in the
                       decoded output (passed to clean_output)
  :param closedopen_setting: "closed" (default) or "open"
  :param openai_api: if True, query `client.chat.completions` instead of the
                     local model
  :type texts: list
  :type goldlabels: list
  :type dataset: str
  :type filter_token: str
  :type closedopen_setting: str
  :type openai_api: bool
  :returns predictions: a list of model predictions
  :raises ValueError: for an unknown dataset in the closed setting
  '''
  if closedopen_setting == 'open':
    # Open-class runs use one generic prompt and the matching format-retry prompt
    sys_prompt = prompt_open
    main_task_prompt = ''
    prompt_retry = prompt_retry_open
    NLI_prediction = NLI_prediction_open
  elif dataset == "TOEFL": # the prompts and classes are different for each dataset
    sys_prompt = prompt_TOEFL
    main_task_prompt = main_task_prompt_TOEFL
    prompt_retry = prompt_retry_TOEFL
    NLI_prediction = NLI_prediction_TOEFL
  elif dataset == "ICLE":
    sys_prompt = prompt_ICLE
    main_task_prompt = main_task_prompt_ICLE
    prompt_retry = prompt_retry_ICLE
    NLI_prediction = NLI_prediction_ICLE
  elif dataset == 'VESPA':
    sys_prompt = prompt_VESPA
    main_task_prompt = main_task_prompt_VESPA
    prompt_retry = prompt_retry_VESPA
    NLI_prediction = NLI_prediction_VESPA
  else:
    raise ValueError(f"Unknown dataset {dataset!r}; expected 'TOEFL', 'ICLE' or 'VESPA'")

  predictions = []
  count = 1
  for text, gold in zip(texts, goldlabels):
    promptcounter = 0
    main_task = main_task_prompt + '\n\n' + text
    # System prompt and task go into one user turn (no separate system role)
    messages = [{'role': "user", "content": sys_prompt + "\n" +main_task }]
    while True:
      if openai_api==True:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            seed=7)
        output_only = response.choices[0].message.content
        # The API returns the completion only, so it is validated as-is
        final_output = output_only
      else:
        fullprompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        output = generate_text(fullprompt) # generate text per input text
        output_only = clean_output(output, filter_token, output_only=True)
        print(output_only)
        final_output = clean_output(output, filter_token)

      try:
        validated_response = NLI_prediction.model_validate_json(final_output) # use class to validate json string
      except ValidationError: # answer not parseable / not an allowed class
        retry_message = prompt_retry
      else:
        response_dict = validated_response.model_dump() # dump validated response into dict
        predicted_native_lang = response_dict['native_lang'] # get the predicted native language
        if predicted_native_lang == "ENG" and closedopen_setting=="closed":
          retry_message = prompt_retry_eng # reiterate prompt if model predicts english
        else:
          predictions.append(predicted_native_lang) # append it to list of predictions
          break

      # Continue the conversation: show the model its own answer and the correction.
      # NOTE(review): generate_text truncates the prompt to a fixed token limit,
      # so for very long essays the appended retry turns may be cut off -- confirm.
      messages.append({'role': 'assistant', 'content': output_only})
      messages.append({'role': "user", "content": retry_message})
      promptcounter += 1
      if promptcounter > 4: # five failed attempts in total: give up and record 'other'
        response_dict = {'native_lang': 'other'}
        predictions.append('other')
        break
    print(count, response_dict)
    print('Accuracy:', "{:.2f}".format(accuracy_score(goldlabels[0:count], predictions)*100))
    count +=1
  return predictions

In [None]:
def add_results_to_csv(filepath, column_name, predictions):
  """
  Store model predictions as the right-most column of an existing CSV file
  (the file that holds the texts and gold labels of the dataset).

  A column with the same name is replaced. When fewer predictions than rows
  are given, the remaining rows are filled with empty strings.

  :param filepath: the path to existing CSV file
  :param column_name: name of the new column (something like preds_zero_llama7b)
  :param predictions: model predictions
  :type filepath: str
  :type column_name: str
  :type predictions: list
  """
  frame = pd.read_csv(filepath)
  # Replace rather than duplicate a column left over from an earlier run
  if column_name in frame.columns:
    frame = frame.drop(columns=[column_name])
  # Pad a partial run so that the column length matches the number of rows
  shortfall = len(frame) - len(predictions)
  if shortfall != 0:
    predictions = predictions + [""] * shortfall
  frame.insert(frame.shape[1], column_name, predictions)
  frame.to_csv(filepath, index=False)

In [None]:
def evaluate(filepath, labels, pred_column, save_figure=True):
  """
  Plot the confusion matrix and print macro F1 and accuracy for model predictions.
  Gold labels are read from the 'language' column of the CSV file, predictions
  from the column given by `pred_column`.

  The confusion matrix is restricted to `labels`. F1 and accuracy are computed
  without a label restriction.
  NOTE(review): macro F1 is therefore presumably averaged over every class that
  occurs in gold or predictions (e.g. 'other') -- confirm this is intended.

  :param filepath: the path to existing CSV file
  :param labels: a list of all labels/classes (rows/columns of the confusion matrix)
  :param pred_column: the name of the column that contains the predictions
  :param save_figure: if True, save the confusion matrix as
                      <corpus>_results/<pred_column>.png on Google Drive; the
                      corpus is inferred from `filepath`
  :type filepath: str
  :type labels: list
  :type pred_column: str
  :type save_figure: bool
  """
  df = pd.read_csv(filepath, on_bad_lines='warn')
  gold_labels = df['language'].tolist()
  predictions = df[pred_column].tolist()
  cm = confusion_matrix(gold_labels, predictions, labels=labels)
  cm_display = ConfusionMatrixDisplay(cm, display_labels=labels).plot(cmap='Blues')
  if save_figure == True:
    # The first corpus name found in the path decides the output folder
    ds = next((name for name in ('ICLE', 'TOEFL', 'VESPA') if name in filepath), None)
    if ds is None:
      # Previously this crashed on an unbound variable; keep the metrics and skip the file
      print(f'Could not infer the corpus from {filepath!r}; confusion matrix not saved.')
    else:
      cm_display.figure_.savefig(f'/content/drive/MyDrive/thesis_NLI/{ds}_results/{pred_column}.png')
  f1score = "{:.4f}".format(f1_score(gold_labels, predictions, average="macro"))
  accuracy = accuracy_score(gold_labels, predictions)
  accuracy = "{:.2f}".format(accuracy*100)
  print(pred_column)
  print('F1: ' + str(f1score) + '\nAccuracy: ' + str(accuracy))

# Running inference

In [None]:
# Hugging Face identifiers of the instruction-tuned models listed for this notebook
all_models = ["meta-llama/Meta-Llama-3-8B-Instruct",
              "meta-llama/Llama-2-7b-chat-hf",
              "mistralai/Mistral-7B-Instruct-v0.2",
              "microsoft/Phi-3-mini-4k-instruct",
              "google/gemma-7b-it"
              ]

## Llama for NLI

### Loading and prompting Llama models

In [None]:
# Llama-3 in 4-bit precision; three closed-class runs over the VESPA texts, each
# stored in its own prediction column and evaluated right away.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"  # alternative: "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model, tokenizer = load_model(model_name, bnb_config)

# Marker that precedes the completion in the decoded output (an earlier version used '[/INST]')
eos_token_llama = 'assistant'
for count in range(3):
  results_column = f"preds_llama3_{count}"
  predictions = classify(texts, labels, "VESPA", eos_token_llama)
  add_results_to_csv(vespa_dataset, results_column, predictions)
  evaluate(vespa_dataset, all_labels_VESPA, results_column, save_figure=True)

### Error analysis

In [None]:
# Pairs of TOEFL classes to focus the error analysis on
commonly_confusedpairs = [("JPN", "KOR"),
                          ("SPA", "ITA"),
                          ("ITA", "FRE"),
                          ("SPA", "FRE"),
                          ("HIN", "TEL"),
                          ('ARA', 'TUR'),
                          ("KOR", "CHI")]
results_TOEFL = "/content/drive/MyDrive/thesis_NLI/TOEFL11/toefl_results.csv"
# Target path for the generated reasonings; not passed to the call below
new_fp_reasonings = "/content/drive/MyDrive/thesis_NLI/TOEFL11/erroranalysis_llama-7b-zero-closed.csv"
df = pd.read_csv(results_TOEFL)
# NOTE(review): error_analysis_LLM is not defined in the visible part of this
# notebook -- confirm it exists before running this cell on a fresh kernel.
reasonings = error_analysis_LLM(df, "TOEFL", "preds_zero_llama7b", commonly_confusedpairs)

As a forensic linguistics expert, I have analyzed the text provided and based on the clues mentioned in the prompt, I have classified the native language of the author as "TUR" (Turkish). Here are some examples from the text that support my judgment:

1. Spelling errors: The author makes a spelling error in the word "MOTHER EARTH" (instead of "MOTHER EARTH"). This is a common feature of non-native English speakers, particularly those from Turkish. In Turkish, the phrase is "Anne Dünya" which means "Mother Earth".
2. Word choice: The author uses the word "rennaissance" (instead of "renaissance"). This is another common feature of non-native English speakers, particularly those from Turkish. In Turkish, the word for "renaissance" is "içtiri" which is a loanword from French.
3. Syntactic patterns: The author uses a syntactic pattern that is common in Turkish but less common in English. For example, in the sentence "Going by the designs that are introduced in the market...", the use of "go

## Gemma-7b

In [None]:
# Gemma-7b in 4-bit precision; three closed-class runs over the VESPA texts, each
# stored in its own prediction column and evaluated right away.
model_name = "google/gemma-7b-it"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model, tokenizer = load_model(model_name, bnb_config)

# Marker that precedes the completion in the decoded output
eos_token = 'model'
for count in range(3):
  results_column = f"preds_gemma_{count}"
  predictions = classify(texts, labels, "VESPA", eos_token)
  add_results_to_csv(vespa_dataset, results_column, predictions)
  evaluate(vespa_dataset, all_labels_VESPA, results_column, save_figure=True)

## Mistral-7b-instruct for NLI

### Loading model and tokenizer

In [None]:
# Load model with model name and bitsandbytes configuration.
# The model is loaded once, outside the loop: reloading it for every run only
# costs time and GPU memory.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_compute_dtype=torch.float16
                                )

model, tokenizer = load_model(model_name, bnb_config)
eos_token = '[/INST]'
# Defined here so that the cell does not depend on a later cell having run
results_ICLE = "/content/drive/MyDrive/thesis_NLI/ICLE-NLI-results.csv"

# NOTE: `texts` and `labels` must have been loaded from the ICLE file (see the
# "Loading Dataset" cell); predictions are written back to that file row by row.
for count in range(1,3):
  results_column = f"preds_zero_mistral_7b_open_{count}"
  # classify() takes no results path / column arguments; passing them positionally
  # collided with closedopen_setting and raised a TypeError
  predictions = classify(texts, labels, "ICLE", eos_token, closedopen_setting='open')
  add_results_to_csv(results_ICLE, results_column, predictions)
  evaluate(results_ICLE, all_labels_ICLE_open, results_column, save_figure=True)

## Phi-3

### Loading model and tokenizer

In [None]:
# Load Phi-3 explicitly (once) so that this cell does not silently reuse whichever
# model the previous cell left in the kernel.
model_name = "microsoft/Phi-3-mini-4k-instruct"

bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_compute_dtype=torch.float16)

model, tokenizer = load_model(model_name, bnb_config)
eos_token='|'
results_ICLE = "/content/drive/MyDrive/thesis_NLI/ICLE-NLI-results.csv"
results_TOEFL = "/content/drive/MyDrive/thesis_NLI/TOEFL11/toefl_results.csv"

# NOTE: `texts` and `labels` must have been loaded from the ICLE file (see the
# "Loading Dataset" cell); predictions are written back to that file row by row.
for count in range(1,3):
  results_column = f"preds_zero_phi3_open_{count}"
  # classify() takes no results path / column arguments; passing them positionally
  # collided with closedopen_setting and raised a TypeError
  predictions = classify(texts, labels, "ICLE", eos_token, closedopen_setting='open')
  add_results_to_csv(results_ICLE, results_column, predictions)
  evaluate(results_ICLE, all_labels_ICLE_open, results_column, save_figure=True)

# Error analysis

## Defining prompts

In [None]:
# System prompts for the error analysis: same class lists and "no ENG" rule as
# the classification prompts, but without any output-format instruction.
# The strings are not raw, so the `\n` sequences inside them are real newlines.

# Error-analysis system prompt for the TOEFL classes
sysprompt_erroranalysis_TOEFL = '''
  You are a forensic linguistics expert that reads English texts written by non-native authors to classify the native language of the author as one of:
  "ARA": Arabic
  "CHI": Chinese
  "FRE": French
  "GER": German
  "HIN": Hindi
  "ITA": Italian
  "JPN": Japanese
  "KOR": Korean
  "SPA": Spanish
  "TEL": Telugu
  "TUR": Turkish
  Use clues such as spelling errors, word choice, syntactic patterns, and grammatical errors to decide on the native language of the author.\n

  DO NOT USE ANY OTHER CLASS.
  IMPORTANT: Do not classify any input as "ENG" (English). English is an invalid choice.'''

# Error-analysis system prompt for the ICLE classes
sysprompt_erroranalysis_ICLE = '''
  You are a forensic linguistics expert that reads English texts written by non-native authors to classify the native language of the author as one of:
  "BUL": Bulgarian
  "CHI": Chinese
  "CZE": Czech
  "FRE": French
  "JPN": Japanese
  "RUS": Russian
  "SPA": Spanish
  Use clues such as spelling errors, word choice, syntactic patterns, and grammatical errors to decide on the native language of the author.\n\n

  DO NOT USE ANY OTHER CLASS.
  IMPORTANT: Do not classify any input as "ENG" (English). English is an invalid choice.
'''

In [None]:
def get_sample(results_fp, preds_column, list_classpairs, seed=None, output_dir=None):
  '''
  Create a small sample with wrongly and correctly classified texts and write it
  to `explainability_sample.csv`.

  The sample consists of
    * up to 20 misclassified texts selected via `list_classpairs`,
    * for TOEFL, the files listed in `selected_files`,
    * up to 15 correctly classified texts.
  Rows are shuffled first; duplicates are removed before writing. The CSV is
  written without the index column.

  :param results_fp: filepath to results; must contain "toefl" or "ICLE"
  :param preds_column: column that contains previous predictions
  :param list_classpairs: list with pairs of classes
  :param seed: optional random state for the shuffle (None = not reproducible)
  :param output_dir: optional folder for the sample CSV; defaults to the
                     <corpus>_results folder on Google Drive
  :type results_fp: str
  :type preds_column: str
  :type list_classpairs: list of 2-tuples
  :raises ValueError: if the corpus cannot be inferred from `results_fp`
  '''
  if "toefl" in results_fp: # determine which dataset it is
    dataset='TOEFL'
  elif "ICLE" in results_fp:
    dataset='ICLE'
  else:
    raise ValueError(f'Cannot infer the corpus from {results_fp!r}: expected "toefl" or "ICLE" in the path')
  if output_dir is None:
    output_dir = f'/content/drive/MyDrive/thesis_NLI/{dataset}_results'

  df = pd.read_csv(results_fp).sample(frac=1, random_state=seed)
  headers = ['filename', 'text', 'language', 'prediction']
  rows = [dict(zip(headers, values))
          for values in zip(df['filename'].tolist(), df['text'].tolist(),
                            df['language'].tolist(), df[preds_column].tolist())]

  def _is_selected_error(label, pred):
    # Decide whether a misclassified (gold, prediction) pair enters the sample.
    # NOTE(review): as written, the test is true for nearly every pair; if the
    # intent was "gold/prediction IS one of the listed pairs", the `!=` should
    # be `==` -- confirm before relying on the selection. Logic kept as found.
    if label == 'TUR':
      return False
    gold, guess = str(label), str(pred)
    return any((gold != first and guess != second) or (gold != second and guess != first)
               for first, second in list_classpairs)

  # Each misclassified row is taken at most once. Appending it once per matching
  # pair used to fill the 20-row cap with copies of the same few rows.
  wrong_rows = [row for row in rows
                if row['language'] != row['prediction']
                and _is_selected_error(row['language'], row['prediction'])]
  sample_df1 = pd.DataFrame(wrong_rows, columns=headers)[:20]
  print(len(sample_df1))

  selected_files = ['1049746', '1017679', '1072659', '1097242', '1108141', '1160675'] # files that were used by Zhang & Salle
  correct_predictions = 0
  kept_rows = []
  for row in rows:
    # str() so that numeric filenames read from the CSV can be matched as well
    if dataset == 'TOEFL' and any(f in str(row['filename']) for f in selected_files):
      kept_rows.append(row)
    if row['language'] == row['prediction'] and correct_predictions < 15:
      kept_rows.append(row)
      correct_predictions += 1
  sample_df2 = pd.DataFrame(kept_rows, columns=headers)

  sample_df = pd.concat([sample_df1, sample_df2], ignore_index=True).drop_duplicates()
  print(len(sample_df2))
  print(len(sample_df))
  sample_df.to_csv(os.path.join(output_dir, 'explainability_sample.csv'), index=False)

In [None]:
# Build the explainability sample for ICLE from the GPT-4 prediction column.
# Note that this reassigns results_ICLE to the "-final" results file.
results_ICLE = "/content/drive/MyDrive/thesis_NLI/ICLE-NLI-results-final.csv"
# Class pairs handed to get_sample() for selecting misclassified texts
list_classpairs = [("BUL", "RUS"),
                   ("CZE", "RUS"),
                   ("SPA", "FRE"),
                   ("CZE", "BUL"),
                   ("SPA", "BUL"),
                   ("RUS", "FRE"),
                   ("JPN", "CHI"),
                   ("CZE", "SPA")]
get_sample(results_ICLE, 'preds_gpt4', list_classpairs)

19
40
59


In [None]:
def add_reasonings(sample_fp):
  """
  Ask the loaded model for a native-language guess plus reasoning for every text
  in the sample CSV and store the answers in the column 'explanations_llama3'.

  Relies on module-level state: `model`/`tokenizer` (through generate_text) and
  `filter_token`, the marker that separates prompt and completion.
  generate_text is called with its default generation budget (60 new tokens in
  this notebook), so long explanations are cut off.

  :param sample_fp: path to the sample CSV (needs a 'text' column); must contain
                    'TOEFL' or 'ICLE' to select the system prompt
  :type sample_fp: str
  :returns: list with one generated explanation per row
  :raises ValueError: if the corpus cannot be inferred from `sample_fp`
  """
  if 'TOEFL' in sample_fp:
    sys_prompt = sysprompt_erroranalysis_TOEFL
  elif 'ICLE' in sample_fp:
    sys_prompt = sysprompt_erroranalysis_ICLE
  else:
    raise ValueError(f"Cannot infer the corpus from {sample_fp!r}: expected 'TOEFL' or 'ICLE' in the path")
  sample_df = pd.read_csv(sample_fp)
  reasonings = []
  for count, text in enumerate(sample_df['text'], start=1):
    # Blank line between essay and instruction; they used to be glued together
    main_task = f'''{text}\n\nYou must provide a guess. Output two named sections: (1) "Native Language" with the name of the language, and (2) "Reasoning" with a detailed explanation of your judgement with examples from the text.'''
    messages = [
        {'role': "system", "content": sys_prompt},
        {'role': "user", "content": main_task}]
    fullprompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    output = generate_text(fullprompt) # generate an explanation for this text
    output_only = clean_output(output, filter_token, output_only=True)
    print(count)
    print(output_only)
    reasonings.append(output_only)
  column_name = 'explanations_llama3'
  # Replace a column from an earlier run; insert() would otherwise raise after
  # all generations have already been paid for
  if column_name in sample_df.columns:
    sample_df = sample_df.drop(column_name, axis=1)
  sample_df.insert(len(sample_df.columns), column_name, reasonings)
  sample_df.to_csv(sample_fp, index=False)
  return reasonings

In [None]:
# Load Llama-3 in 4-bit precision for the explanation run
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_compute_dtype=torch.float16
                                )

model, tokenizer = load_model(model_name, bnb_config)
# Global read by add_reasonings(): marker that precedes the completion
filter_token = 'assistant'
results_icle = "/content/drive/MyDrive/thesis_NLI/ICLE_results/icle_explainability.csv"
# Note: from here on results_TOEFL points to the explainability sample, not to
# the TOEFL results file it was bound to in earlier cells.
results_TOEFL = "/content/drive/MyDrive/thesis_NLI/TOEFL_results/explainability_sample.csv"

# Generates the explanations and writes them back into the TOEFL sample file
llama_toefl_explanations = add_reasonings(results_TOEFL)

# Postprocessing
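The following code was used for post-processing, to make sure that all open-set predictions are mapped to a consistent label. For example, predictions like 'fr' or 'Français' are identified as 'FRE', i.e., French. Predictions that cannot be mapped are printed per column so that they can be reviewed manually.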

In [None]:
# Exact-match rewrites for open-class predictions that are not already a known label
OPEN_LABEL_ALIASES = {
    'es': 'SPA',
    'en': 'ENG', 'EN': 'ENG', 'en-GB': 'ENG', 'eng': 'ENG', 'Enlish': 'ENG',
    'Unknown': 'other', 'nan': 'other', 'familiar': 'other', 'local': 'other',
    'nowadays': 'other', 'advertisements': 'other',
    'pt': 'Portuguese',
    'fr': 'FRE', 'Français': 'FRE',
    'ar': 'ARA',
    'zh-CN': 'CHI', 'zh-Hans': 'CHI', '中文': 'CHI',
    'it': 'ITA',
    'de': 'GER',
    'pl': 'Polish', ' Polish': 'Polish', 'Polska': 'Polish',
    'cs': 'CZE',
    'ja': 'JPN',
    'ru': 'RUS', 'rus': 'RUS',
    'nl': 'Dutch',
    'und': 'other', 'non-': 'other',
}


def normalize_open_prediction(pred, known_labels):
  """
  Map one raw open-class prediction to a normalised label.

  :param pred: raw prediction as read from the CSV (any type; converted to str)
  :param known_labels: labels that are kept unchanged
  :returns: tuple ``(label, recognised)``; `recognised` is False when no rule
            applied and the prediction is returned unchanged
  """
  pred = str(pred)
  if pred in known_labels:
    return pred, True
  if pred in OPEN_LABEL_ALIASES:
    return OPEN_LABEL_ALIASES[pred], True
  # NOTE(review): any prediction containing the digit 0 is counted as English;
  # rule kept exactly as found -- confirm which raw outputs it is meant to catch.
  if '0' in pred:
    return 'ENG', True
  # Whole sentences, leftover JSON fragments and empty/one-character strings
  if len(pred) > 30 or '{' in pred or len(pred) <= 1:
    return 'other', True
  return pred, False


# Use the TOEFL results file explicitly: by this point results_TOEFL has been
# re-bound to the explainability sample by an earlier cell.
open_results_fp = toefl_dataset
df = pd.read_csv(open_results_fp)
all_labels = all_labels_TOEFL_open
for c in [column for column in df.columns if 'open' in column]:
  new_preds = []
  unknown_labels = set()
  for raw_pred in df[c].tolist():
    pred, recognised = normalize_open_prediction(raw_pred, all_labels)
    if not recognised:
      unknown_labels.add(pred)
    new_preds.append(pred)
  df[c] = new_preds
  # Report what could not be mapped so that it can be checked by hand
  print(c)
  for label in sorted(unknown_labels):
    print(label)
  print(len(new_preds))
  print('\n\n')
# index=False: otherwise every run adds another unnamed index column to the file
df.to_csv(open_results_fp, index=False)

preds_zero_llama3_8b_openclass
Persian (Farsi)
RUS
Indonesian
Portuguese
Vietnamese
Persian/Farsi
Indian
Malay
Farsi/Persian
urdu
Persian
1100



preds_zero_llama2_7b_openclass
Brazilian Portuguese
RUS
Tamil
Indonesian
1100



preds_zero_gemma_7b_openclass
Taiwanese
Indonesian
Malaysian
Egyptian-American
1100



preds_zero_mistral_7b_open
India
Portuguese
Persian
1100



preds_zero_phi3_open
RUS
Indonesian
Portuguese
Taiwanese
BUL
Malaysian
Dutch
Malay
Bangla
Persian
1100



preds_zero_llama3_8b_open_1
Persian (Farsi)
RUS
Indonesian
Portuguese
Vietnamese
Thai
Persian/Farsi
Indian
Malay
Urdu
Persian
1100



preds_zero_llama3_8b_open_2
Persian (Farsi)
RUS
Indonesian
Portuguese
Vietnamese
Thai
Persian/Farsi
Indian
Malay
Farsi/Persian
Persian
1100



preds_zero_llama2_7b_open_1
Taiwanese Hokkien
Brazilian Portuguese
RUS
Tamil
1100



preds_zero_llama2_7b_open_2
Brazilian Portuguese
RUS
Tamil
1100



preds_zero_gemma_7b_open_1
Taiwanese
Indonesian
Malaysian
Egyptian-American
1100



preds_z