In [4]:
# import libraries
import os
import json
import numpy as np
import pandas as pd
from datasets import load_dataset
# Report the working directory: the relative 'dataset/' and 'save_results/' paths
# used in this notebook are resolved against it.
# (Changing directory to the value of os.getcwd() was a no-op, so it is gone.)
current_path = os.getcwd()
print(current_path)

b:\Ernst\Italian-Ladin Translation\SA MCQA


# 1. Sentiment Analysis

#### Load the SA dataset from Hugging Face

In [None]:
# Authenticate with the Hugging Face Hub without writing the token into the notebook:
# the token is read from the HF_TOKEN environment variable, and when that variable
# is unset login() receives None and asks for the token interactively.
import huggingface_hub
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

In [6]:
# Fetch the sentiment-analysis corpus and expose its 'train' split as a DataFrame
SA_dataset = load_dataset(path="ulinnuha/sentiment_analysis_ladin_italian")
SA_df = pd.DataFrame(data=SA_dataset["train"])
# Preview the first rows
SA_df.head()

Generating train split: 100%|██████████| 12511/12511 [00:00<00:00, 70653.90 examples/s]


Unnamed: 0,italian,ladin,label
0,Siamo stati qui per 1 notte prima della nostra...,i sun stá chiló por 1 nöt dan na nosta partida...,pos
1,Abbiamo soggiornato per due notti alla fine de...,i sun stá döes nes ala fin de nosta croaziera....,pos
2,Ho soggiornato nell’hotel Acca Palace per una ...,i sun sté te hotel Acca Palace por ma na nöt c...,pos
3,"Prima volta in questo hotel, è stata un’esperi...",la pröma iada te chësc hotel é stada na esperi...,pos
4,Abbiamo soggiornato in questo hotel in passato...,i sun sté te chësc hotel denant y i ne se aspe...,pos


In [7]:
# Integer encoding of the sentiment labels: 'pos' -> 0, 'neg' -> 1
label_map = {'pos': 0, 'neg': 1}

# Encode the 'label' column in place. Values that are not keys of label_map
# (for example the 0/1 left behind by an earlier run of this cell) pass through
# unchanged, so re-running the cell does not turn the whole column into NaN.
SA_df['label'] = SA_df['label'].map(lambda value: label_map.get(value, value))

#### Split the data into training and test sets (Ladin text)
To run the same split on the Italian text, set `language = 'italian'` in the next cell.

In [None]:
from sklearn.model_selection import train_test_split

# Text column to work on ('ladin' here)
language = 'ladin'

# Hold out 20% of the rows as the test set. Stratifying on the label keeps the
# class ratio the same in both parts, and the fixed seed makes the split repeatable.
sa_texts = SA_df[language]
sa_labels = SA_df['label']
X_train, X_test, y_train, y_test = train_test_split(
    sa_texts,
    sa_labels,
    test_size=0.20,
    random_state=42,
    stratify=sa_labels,
)

In [9]:
# Training frame: the text under 'review' next to its integer label
data_train_SA = pd.DataFrame({'review': X_train, 'label': y_train})
# Class balance of the training split
data_train_SA['label'].value_counts()

label
0    7873
1    2135
Name: count, dtype: int64

In [10]:
# Test frame: the text under 'review' next to its integer label
data_test_SA = pd.DataFrame({'review': X_test, 'label': y_test})
# Class balance of the test split
data_test_SA['label'].value_counts()

label
0    1969
1     534
Name: count, dtype: int64

In [11]:
# Row counts of the (train, test) splits
data_train_SA.shape[0], data_test_SA.shape[0]

(10008, 2503)

In [None]:
# Save the SA train and test splits to the 'dataset' directory.
# The file names carry the SA task tag: the SA evaluation in this notebook reads
# dataset/data_test_{task}_{language}.csv with task = 'SA', and the MCQA section
# writes data_*_MCQA_* files, so the two tasks must not share file names.
os.makedirs('dataset', exist_ok=True)  # to_csv does not create missing directories
data_train_SA.to_csv(f'dataset/data_train_SA_{language}.csv', index=False)
data_test_SA.to_csv(f'dataset/data_test_SA_{language}.csv', index=False)

## Perform Few-shot Learning using an LLM for Sentiment Analysis

#### Run the main file

In [None]:
# Launch fsl_main.py in a subprocess for the SA task on the Ladin data, with the
# llama_31_70b model tag, a batch size of 10, ./dataset as the dataset directory
# and ./save_results as the save directory.
# NOTE(review): presumably this writes the per-batch JSON responses that the
# evaluation section reads from save_results/ — confirm against fsl_main.py.
!python fsl_main.py \
  --task SA \
  --language ladin \
  --model_name llama_31_70b \
  --dataset_dir ./dataset \
  --batch_size 10 \
  --save_dir ./save_results

### Evaluate the prediction results 

In [84]:
import re
from sklearn.metrics import balanced_accuracy_score, f1_score

# Get the prediction results
def get_json_files(task, llm_model, batch_size, language=None,
                   save_dir='save_results', dataset_dir='dataset'):
    """Score each saved LLM response batch against the matching slice of test labels.

    Response files are looked up in `save_dir` under the name
    '{task}_{language}_{llm_model}_size of_{batch_size}_batch_{i}.json'. Each file
    is expected to hold a payload whose answer text sits at
    data["choices"][0]["message"]["content"] and is a comma-separated list of
    integer labels, optionally wrapped in square brackets, e.g. "[0, 1, 0]".
    Batch i is compared with rows [i * batch_size, (i + 1) * batch_size) of
    '{dataset_dir}/data_test_{task}_{language}.csv', column 'label'.

    Args:
        task: Task tag used in the file names (e.g. 'SA').
        llm_model: Model tag used in the file names.
        batch_size: Number of test rows covered by one response file.
        language: Language tag used in the file names. When None, the
            notebook-level variable `language` is used, as in earlier versions.
        save_dir: Directory holding the response JSON files.
        dataset_dir: Directory holding the ground-truth CSV.

    Returns:
        A list with one {'ACC': balanced accuracy, 'F1': weighted F1} dict per
        batch that could be scored. Scores are per batch, not over the whole test
        set. Batches with no response, an unreadable or unparsable response, or a
        prediction count different from the label count are reported and skipped.
    """
    if language is None:
        # Backward compatible with the old three-argument call
        language = globals()['language']

    # Define the file name prefix shared by all batches of this run
    file_prefix = f'{task}_{language}_{llm_model}_size of_{batch_size}_batch_'

    # Take the batch indices from the file names themselves, so that a missing
    # batch or a stray file with the same prefix cannot shift or break the loop
    name_pattern = re.compile(re.escape(file_prefix) + r'(\d+)\.json')
    batch_ids = sorted(
        int(match.group(1))
        for match in map(name_pattern.fullmatch, os.listdir(save_dir))
        if match
    )
    print(f"Found {len(batch_ids)} files.")

    # Open real data (Ground Truth)
    ref_data = pd.read_csv(os.path.join(dataset_dir, f'data_test_{task}_{language}.csv'))
    num_no_resp = 0
    all_scores = []
    for batch_id in batch_ids:
        # Rows of the ground truth covered by this batch
        batch_start = batch_id * batch_size
        real_data = ref_data.iloc[batch_start:batch_start + batch_size]
        print(f"Processing batch {batch_id + 1}, starting at index {batch_start}")

        file_loc = os.path.join(save_dir, f'{file_prefix}{batch_id}.json')
        print("load the json file", file_loc)
        try:
            with open(file_loc, encoding='utf8') as f:
                data = json.load(f)
            # The payload may have been saved as a JSON string holding JSON
            if isinstance(data, str):
                data = json.loads(data)
        except json.JSONDecodeError as ex:
            print(f"An error occurred while processing the reviews: {ex}")
            continue

        # Ensure 'choices' exists and contains data
        choices = data.get("choices") if isinstance(data, dict) else None
        if not choices:
            num_no_resp += 1
            print("No choices found in the response.")
            continue

        data_output = (choices[0].get("message", {}).get("content") or "").strip()
        if not data_output:
            num_no_resp += 1
            print(f"Empty response content for batch {batch_id + 1}.")
            continue

        # Drop the surrounding brackets only when they are really there
        if data_output.startswith('[') and data_output.endswith(']'):
            data_output = data_output[1:-1]
        try:
            # int() ignores surrounding whitespace and newlines, so "0,1" and
            # "0,\n1" parse the same way as "0, 1"
            predictions = [int(token) for token in data_output.split(',')]
        except ValueError as ex:
            print(f"An error occurred while processing the reviews: {ex}")
            continue

        labels = real_data['label'].tolist()
        if len(predictions) != len(labels):
            print(f"Length mismatch for batch {batch_id + 1}: {len(predictions)} != {len(labels)}")
            continue

        # Calculate Balanced Accuracy and F1 Score for this batch
        all_scores.append({
            'ACC': balanced_accuracy_score(labels, predictions),
            'F1': f1_score(labels, predictions, average='weighted'),
        })

    print('The number of batches without response of LLM', num_no_resp)
    return all_scores

#### Get evaluation metric scores

In [85]:
# Evaluation settings for the SA run; they are combined into the prefix of the
# response file names and into the ground-truth CSV name
task = 'SA'
batch_size = 10
llm_model = 'llama_31_70b'  # other tags noted by the author: gpt / mt5

In [None]:
# Score every saved SA batch; each list entry holds the 'ACC' and 'F1' of one batch
translation_result = get_json_files(task=task, llm_model=llm_model, batch_size=batch_size)
# Number of batches that were scored
print(len(translation_result))

In [88]:
# Keep the entries that have at least one value other than an empty list
filtered_data = [
    entry
    for entry in translation_result
    if any(not (isinstance(v, list) and len(v) == 0) for v in entry.values())
]
# One row per batch, one column per metric
fr = pd.DataFrame(filtered_data)
print(len(filtered_data))
# Average each metric over the batches
mean_scores = fr.mean()
# Report the averaged metrics
print("The Accuracy score for the SA tasks is", mean_scores['ACC'])
print("The F1 score for the SA tasks is", mean_scores['F1'])


250
The Accuracy score for the SA tasks is 0.9791468253968255
The F1 score for the SA tasks is 0.9817533302851259


# 2. MCQA

#### Get the dataset from Hugging Face

In [None]:
# Authenticate with the Hugging Face Hub without writing the token into the notebook:
# the token is read from the HF_TOKEN environment variable, and when that variable
# is unset login() receives None and asks for the token interactively.
import huggingface_hub
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

In [8]:
# Fetch the multiple-choice QA corpus and expose its 'train' split as a DataFrame
MCQA_dataset = load_dataset(path="ulinnuha/mcqa_ladin_italian")
df_mcqa = pd.DataFrame(data=MCQA_dataset["train"])
# Preview the first rows
df_mcqa.head()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 764 examples [00:00, 9260.77 examples/s]


Unnamed: 0,question_italian,question_ladin,choices_all_italian,choices_all_ladin,max_choices,answer
0,Lo stato giuridico ed economico del personale ...,le stat juridich y economich dl personal dles ...,['secondo i principi generali e comuni del rap...,['aladô di prinzips generai y comuni dl raport...,3,2
1,Il miocardio è il muscolo:\n\n,le miocardium é le muscul:,"['del polmone', 'del cuore', 'no, non è un mus...","['dl pulmon', 'de cör', 'no, al né nia n muscul']",3,1
2,Il mantenimento del pediatra di famiglia fino ...,le mantignimënt dl pediater de familia cina 16...,['se il compimento del sedicesimo anno di età ...,['sce limplenida dl sëdesim ann deté vëgn dant...,3,2
3,Cosa sono i batteri?\n\n,ci é pa i bacteri?,['Parassiti intracellulari che per potersi rip...,['parassic intracellulars che por podëi se rep...,3,1
4,Lo scioglimento della Croce Rossa Italiana può...,la desliagna dla Crusc Röda Italiana pó ester ...,"['con D.C.P.M. o con D.P.R.', 'solo con refere...","['cun D.C.P.M. o cun D.P.R.', 'ma cun referend...",3,2


### Set the training and testing data

In [57]:
# Language tag used in the MCQA file names and in the evaluation
language = "ladin"

In [39]:
# Stratified 80/20 split of the MCQA data on the 'max_choices' column, so that
# both splits keep the same mix of 'max_choices' values.
TRAIN_FRACTION = 0.80  # share of each class that goes to the training set
SPLIT_SEED = 42        # seed of the within-class shuffle, for a repeatable split

# Share of each 'max_choices' value in the full data; its index order
# (most frequent first) is the order in which the classes are processed
class_proportions = df_mcqa['max_choices'].value_counts(normalize=True)

# Collect the per-class pieces and concatenate once at the end, instead of
# growing two DataFrames from empty frames inside the loop
train_parts = []
test_parts = []
for label in class_proportions.index:
    # Get all rows for the current class
    label_df = df_mcqa[df_mcqa['max_choices'] == label]

    # Rows of this class that go to training; the remainder goes to testing
    train_size = int(TRAIN_FRACTION * len(label_df))

    # Shuffle the rows within this class, then cut at the train size
    label_df_shuffled = label_df.sample(frac=1, random_state=SPLIT_SEED)
    train_parts.append(label_df_shuffled.iloc[:train_size])
    test_parts.append(label_df_shuffled.iloc[train_size:])

# Stack the pieces and reset indices for better handling
df_train = pd.concat(train_parts, axis=0).reset_index(drop=True)
testing_data = pd.concat(test_parts, axis=0).reset_index(drop=True)

# Display the class distribution in both train and test sets
print("Class Distribution in Training Set:")
print(df_train['max_choices'].value_counts(normalize=True))

print("\nClass Distribution in Testing Set:")
print(testing_data['max_choices'].value_counts(normalize=True))

Class Distribution in Training Set:
max_choices
3    0.398361
5    0.345902
4    0.255738
Name: proportion, dtype: float64

Class Distribution in Testing Set:
max_choices
3    0.396104
5    0.344156
4    0.259740
Name: proportion, dtype: float64


In [40]:
# Write both MCQA splits as CSV files (without the index column) to 'dataset'
mcqa_train_csv = f'dataset/data_train_MCQA_{language}.csv'
mcqa_test_csv = f'dataset/data_test_MCQA_{language}.csv'
df_train.to_csv(mcqa_train_csv, index=False)
testing_data.to_csv(mcqa_test_csv, index=False)

# Run LLM Inference using Few-shot Learning

In [None]:
# Launch fsl_main.py in a subprocess for the MCQA task on the Ladin data, with the
# llama_31_70b model tag, a batch size of 10, ./dataset as the dataset directory
# and ./save_results as the save directory.
# NOTE(review): presumably this writes the per-batch JSON responses that the
# evaluation section reads from save_results/ — confirm against fsl_main.py.
!python fsl_main.py \
  --task MCQA \
  --language ladin \
  --model_name llama_31_70b \
  --dataset_dir ./dataset \
  --batch_size 10 \
  --save_dir ./save_results

### Evaluate the prediction results 

In [70]:
import re
from sklearn.metrics import balanced_accuracy_score, f1_score
def get_json_files_mcqa(task, llm_model, batch_size, language,
                        save_dir='save_results', dataset_dir='dataset'):
    """Score each saved LLM response batch against the matching slice of test answers.

    Response files are looked up in `save_dir` under the name
    '{task}_{language}_{llm_model}_size of_{batch_size}_batch_{i}.json'. Each file
    is expected to hold a payload whose answer text sits at
    data["choices"][0]["message"]["content"] and is a comma-separated list of
    integer answers, optionally wrapped in square brackets, e.g. "[2, 1, 0]".
    Batch i is compared with rows [i * batch_size, (i + 1) * batch_size) of
    '{dataset_dir}/data_test_{task}_{language}.csv', column 'answer'.

    Args:
        task: Task tag used in the file names (e.g. 'MCQA').
        llm_model: Model tag used in the file names.
        batch_size: Number of test rows covered by one response file.
        language: Language tag used in the file names.
        save_dir: Directory holding the response JSON files.
        dataset_dir: Directory holding the ground-truth CSV.

    Returns:
        A list with one {'ACC': balanced accuracy, 'F1': weighted F1} dict per
        batch that could be scored. Scores are per batch, not over the whole test
        set. Batches with no response, an unreadable or unparsable response, or a
        prediction count different from the answer count are reported and skipped.
    """
    # Define the file name prefix shared by all batches of this run
    file_prefix = f'{task}_{language}_{llm_model}_size of_{batch_size}_batch_'

    # Take the batch indices from the file names themselves, so that a missing
    # batch or a stray file with the same prefix cannot shift or break the loop
    name_pattern = re.compile(re.escape(file_prefix) + r'(\d+)\.json')
    batch_ids = sorted(
        int(match.group(1))
        for match in map(name_pattern.fullmatch, os.listdir(save_dir))
        if match
    )
    print(f"Found {len(batch_ids)} files.")

    # Open real data (Ground Truth)
    ref_data = pd.read_csv(os.path.join(dataset_dir, f'data_test_{task}_{language}.csv'))
    num_no_resp = 0
    all_scores = []
    for batch_id in batch_ids:
        # Rows of the ground truth covered by this batch
        batch_start = batch_id * batch_size
        real_data = ref_data.iloc[batch_start:batch_start + batch_size]
        print(f"Processing batch {batch_id + 1}, starting at index {batch_start}")

        file_loc = os.path.join(save_dir, f'{file_prefix}{batch_id}.json')
        print("load the json file", file_loc)
        try:
            with open(file_loc, encoding='utf8') as f:
                data = json.load(f)
            # The payload may have been saved as a JSON string holding JSON
            if isinstance(data, str):
                data = json.loads(data)
        except json.JSONDecodeError as ex:
            print(f"An error occurred while processing the reviews: {ex}")
            continue

        # Ensure 'choices' exists and contains data
        choices = data.get("choices") if isinstance(data, dict) else None
        if not choices:
            num_no_resp += 1
            print("No choices found in the response.")
            continue

        data_output = (choices[0].get("message", {}).get("content") or "").strip()
        if not data_output:
            num_no_resp += 1
            print(f"Empty response content for batch {batch_id + 1}.")
            continue

        # Drop the surrounding brackets only when they are really there
        if data_output.startswith('[') and data_output.endswith(']'):
            data_output = data_output[1:-1]
        try:
            # int() ignores surrounding whitespace and newlines, so "0,1" and
            # "0,\n1" parse the same way as "0, 1"
            predictions = [int(token) for token in data_output.split(',')]
        except ValueError as ex:
            print(f"An error occurred while processing the reviews: {ex}")
            continue

        labels = real_data['answer'].tolist()
        if len(predictions) != len(labels):
            print(f"Length mismatch for batch {batch_id + 1}: {len(predictions)} != {len(labels)}")
            continue

        # Calculate Balanced Accuracy and F1 Score for this batch
        all_scores.append({
            'ACC': balanced_accuracy_score(labels, predictions),
            'F1': f1_score(labels, predictions, average='weighted'),
        })

    print('The number of batches without response of LLM', num_no_resp)
    return all_scores

#### Get evaluation metric scores

In [71]:
# Evaluation settings for the MCQA run; they are combined into the prefix of the
# response file names and into the ground-truth CSV name
task = 'MCQA'
batch_size = 10
llm_model = 'llama_31_70b'  # other tags noted by the author: gpt / mt5

In [None]:
# Score every saved MCQA batch; each list entry holds the 'ACC' and 'F1' of one batch
translation_result_mcqa = get_json_files_mcqa(
    task=task, llm_model=llm_model, batch_size=batch_size, language=language
)
# Number of batches that were scored
print(len(translation_result_mcqa))

In [None]:
# Keep the entries that have at least one value other than an empty list
filtered_data_mcqa = [
    entry
    for entry in translation_result_mcqa
    if any(not (isinstance(v, list) and len(v) == 0) for v in entry.values())
]
# One row per batch, one column per metric
fr_mcqa = pd.DataFrame(filtered_data_mcqa)
print(len(filtered_data_mcqa))
# Average each metric over the batches
mean_scores_mcqa = fr_mcqa.mean()
# Report the averaged metrics
print("The Accuracy score for the MCQA tasks is", mean_scores_mcqa['ACC'])
print("The F1 score for the MCQA tasks is", mean_scores_mcqa['F1'])
