# Instruction tuning dataset construction
First, we need to construct the IMHI instruction-tuning dataset in the format required by LLaMAFactory, consisting of a training set and a validation set.
Note that the SAD dataset constructed in this block is used only for perplexity analysis.
The SAD dataset used for instruction tuning is constructed in the next block.

## Training Set Construction

In [1]:
import pandas as pd
import json
import os
from nltk.tokenize import word_tokenize

# Names of the IMHI sub-datasets; each one is passed to write_train_dataset.
dataset_list = ['DR', 'dreaddit', 'Irf', 'MultiWD', 'SAD']
# Folder whose entries are scanned for one sub-folder per sub-dataset.
directory = "mental_dataset//IMHI//train_data"

def write_train_dataset(dataset_name='DR'):
    """Export one sub-dataset's train/val CSVs as instruction-tuning JSON.

    Looks for an entry called ``dataset_name`` inside the module-level
    ``directory``, reads its ``train.csv`` and ``val.csv`` and turns every
    row into ``{"instruct": <query column>, "output": <gpt-3.5-turbo column>}``.
    The two record lists are written to
    ``mental_dataset/mental_istct_train_<name>.json`` and
    ``mental_dataset/mental_istct_val_<name>.json``, and a summary line with
    the record count is printed for each.  When ``directory`` has no matching
    entry, both files receive an empty JSON list.

    Args:
        dataset_name: Name of the sub-folder (and output-file suffix).
    """
    def to_records(frame):
        # One record per CSV row, kept in file order.
        return [
            {"instruct": row["query"], "output": row["gpt-3.5-turbo"]}
            for _, row in frame.iterrows()
        ]

    train_records, val_records = [], []
    if dataset_name in os.listdir(directory):
        source_dir = os.path.join(directory, dataset_name)
        # Load both splits before converting either of them.
        train_frame = pd.read_csv(os.path.join(source_dir, "train.csv"))
        val_frame = pd.read_csv(os.path.join(source_dir, "val.csv"))
        train_records = to_records(train_frame)
        val_records = to_records(val_frame)

    train_json = json.dumps(train_records, indent=4)
    val_json = json.dumps(val_records, indent=4)

    train_target = ".//mental_dataset//mental_istct_train_{}.json".format(dataset_name)
    with open(train_target, "w") as sink:
        sink.write(train_json)
    print("Data has been saved to mental_istct_train_{}.json, length:{}".format(dataset_name, len(train_records)))

    val_target = ".//mental_dataset//mental_istct_val_{}.json".format(dataset_name)
    with open(val_target, "w") as sink:
        sink.write(val_json)
    print("Data has been saved to mental_istct_val_{}.json, length:{}".format(dataset_name, len(val_records)))


# Export the train/val JSON files for every sub-dataset in turn.
for subset in dataset_list:
    write_train_dataset(subset)

Data has been saved to mental_istct_train_DR.json, length:1003
Data has been saved to mental_istct_val_DR.json, length:430
Data has been saved to mental_istct_train_dreaddit.json, length:2837
Data has been saved to mental_istct_val_dreaddit.json, length:300
Data has been saved to mental_istct_train_Irf.json, length:3943
Data has been saved to mental_istct_val_Irf.json, length:985
Data has been saved to mental_istct_train_MultiWD.json, length:15743
Data has been saved to mental_istct_val_MultiWD.json, length:1500
Data has been saved to mental_istct_train_SAD.json, length:5547
Data has been saved to mental_istct_val_SAD.json, length:616


Remove the label from the output sentence of the SAD dataset and construct the instruction-tuning dataset for SAD.

In [2]:
import pandas as pd
import json
import os


# Folder holding the IMHI training CSVs; write_SAD_dataset looks for its 'SAD' entry here.
directory = ".//mental_dataset//IMHI//train_data"


def write_SAD_dataset(file_name=''):
    """Build label-conditioned instruction records from one SAD CSV split.

    Reads ``<directory>/SAD/<file_name>`` (``directory`` is the module-level
    path).  For every row, the ``gpt-3.5-turbo`` text is split at the first
    ``"Reasoning:"`` marker: the part before it is searched for a known
    stress cause, the part after it becomes the record's ``output``.  The
    record's ``instruct`` is the ``query`` text up to its first
    ``"Question:"`` followed by a question that already names the stress
    cause, so the label no longer appears in the output.

    Args:
        file_name: CSV file inside the ``SAD`` folder, e.g. ``'train.csv'``.

    Returns:
        A list of ``{"instruct": ..., "output": ...}`` dicts, one per CSV
        row; an empty list when ``directory`` has no ``SAD`` entry.

    Raises:
        ValueError: If a row's output lacks the ``"Reasoning:"`` marker, or
            if its answer part mentions none of the known stress causes.
            Failing here avoids silently reusing the label of the previous
            row for an unrecognised answer.
    """
    # Checked in this order as lower-case substrings of the answer; the first
    # hit wins (so 'school' takes precedence over 'work', for example).
    # NOTE(review): confirm these spellings against the phrases actually used
    # in the CSV (e.g. 'emotion turmoil' vs. 'emotional turmoil') - a phrase
    # that never matches now raises instead of producing a wrong label.
    stress_causes = (
        'school',
        'financial problem',
        'family issues',
        'social relationships',
        'work',
        'health issues',
        'emotion turmoil',
        'everyday decision making',
        'other stress causes',
    )
    question_template = "Question: This post shows the stress cause related to {}, explain the reasoning of it step by step"

    data_list = []
    if 'SAD' not in os.listdir(directory):
        return data_list

    file_path = os.path.join(directory, 'SAD', file_name)
    df = pd.read_csv(file_path)
    for index, row in df.iterrows():
        instruct = row["query"]
        output = row["gpt-3.5-turbo"]

        # partition keeps everything after the FIRST marker, so a reasoning
        # text that itself contains "Reasoning:" is not truncated.
        output_an, marker, reasoning = output.partition("Reasoning:")
        if not marker:
            raise ValueError(
                "Row {} of {}: no 'Reasoning:' marker in the output".format(index, file_path)
            )

        answer = output_an.lower()
        label = next((cause for cause in stress_causes if cause in answer), None)
        if label is None:
            raise ValueError(
                "Row {} of {}: no known stress cause in answer {!r}".format(
                    index, file_path, output_an[:80]
                )
            )

        post = instruct.split("Question:")[0]
        data_list.append(
            {"instruct": post + question_template.format(label), "output": reasoning}
        )

    return data_list


# Build the label-free SAD records for both splits and report their sizes.
data_list_train = write_SAD_dataset('train.csv')
print(len(data_list_train))
data_list_eval = write_SAD_dataset('val.csv')
print(len(data_list_eval))


# Write the training records as an indented JSON list.
with open("./mental_dataset/mental_istct_train_SAD_without_label.json", "w") as json_file:
    json_data = json.dumps(data_list_train, indent=4)
    json_file.write(json_data)
    print('Data has been saved to mental_istct_train_SAD_without_label.json')

# Same for the validation records.
with open("./mental_dataset/mental_istct_val_SAD_without_label.json", "w") as json_file:
    json_data = json.dumps(data_list_eval, indent=4)
    json_file.write(json_data)
    print('Data has been saved to mental_istct_val_SAD_without_label.json')

5547
616
Data has been saved to mental_istct_train_SAD_without_label.json
Data has been saved to mental_istct_val_SAD_without_label.json


The next step is to construct the test set.
First, we can visualize the input_length and output_length after vectorization.

## Test Set Construction

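Construct the test set for the 'DR', 'dreaddit', 'Irf', and 'MultiWD' sub-datasets. Because 'SAD' is also in `dataset_list`, the loop below writes the labelled SAD test set as well; the label-free SAD variant is built in the following cell.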

In [7]:
import pandas as pd
import json
import os


# Names of the sub-datasets; write_test_dataset reads "<name>.csv" for each.
dataset_list = ['DR', 'dreaddit', 'Irf', 'MultiWD', 'SAD']
# Folder containing one test CSV per sub-dataset.
directory = "mental_dataset//IMHI//test_data"

def write_test_dataset(dataset_name):
    """Convert ``<dataset_name>.csv`` from the test folder into a JSON file.

    Each CSV row becomes ``{"instruct": <query column>, "output":
    <gpt-3.5-turbo column>}``.  The records are written as an indented JSON
    list to ``mental_dataset/mental_istct_test_<dataset_name>.json``; a
    confirmation line and the number of records are printed afterwards.

    Args:
        dataset_name: Base name of the CSV inside the module-level
            ``directory`` and suffix of the output file.
    """
    source_csv = os.path.join(directory, dataset_name + '.csv')
    frame = pd.read_csv(source_csv)

    # One record per row, in file order.
    records = [
        {"instruct": row["query"], "output": row["gpt-3.5-turbo"]}
        for _, row in frame.iterrows()
    ]
    serialized = json.dumps(records, indent=4)

    target = ".//mental_dataset//mental_istct_test_{}.json".format(dataset_name)
    with open(target, "w") as sink:
        sink.write(serialized)

    print("mental_istct_test_{}.json has been saved".format(dataset_name))
    print(len(records))


# Export the test JSON file for every sub-dataset in turn.
for subset in dataset_list:
    write_test_dataset(subset)

mental_istct_test_DR.json has been saved
405
mental_istct_test_dreaddit.json has been saved
414
mental_istct_test_Irf.json has been saved
2113
mental_istct_test_MultiWD.json has been saved
2441
mental_istct_test_SAD.json has been saved
684


Construct the test set for the 'SAD' subdataset.

In [6]:
import pandas as pd
import json
import os

# Build the label-free SAD test set: the stress cause is moved from the model
# answer into the question, and only the reasoning text is kept as the output.

# Checked in this order as lower-case substrings of the answer; first hit wins.
# NOTE(review): confirm these spellings against the phrases actually used in
# the CSV (e.g. 'emotion turmoil' vs. 'emotional turmoil') - a phrase that
# never matches now raises instead of producing a wrong label.
stress_causes = (
    'school',
    'financial problem',
    'family issues',
    'social relationships',
    'work',
    'health issues',
    'emotion turmoil',
    'everyday decision making',
    'other stress causes',
)
question_template = "Question: This post shows the stress cause related to {}, explain the reasoning of it step by step"

data_list = []
test_dataset = 'SAD.csv'

directory = ".//mental_dataset//IMHI//test_data"

for file_name in os.listdir(directory):
    if file_name != test_dataset:
        continue
    file_path = os.path.join(directory, file_name)
    df = pd.read_csv(file_path)

    # Iterate over every row of the DataFrame and convert it to a dict.
    for index, row in df.iterrows():
        instruct = row["query"]
        output = row["gpt-3.5-turbo"]

        # partition keeps everything after the FIRST marker, so a reasoning
        # text that itself contains "Reasoning:" is not truncated.
        output_an, marker, reasoning = output.partition("Reasoning:")
        if not marker:
            raise ValueError(
                "Row {} of {}: no 'Reasoning:' marker in the output".format(index, file_path)
            )

        answer = output_an.lower()
        label = next((cause for cause in stress_causes if cause in answer), None)
        if label is None:
            # Previously such a row silently reused the previous row's label.
            raise ValueError(
                "Row {} of {}: no known stress cause in answer {!r}".format(
                    index, file_path, output_an[:80]
                )
            )

        post = instruct.split("Question:")[0]
        question = question_template.format(label)
        data_dict = {"instruct": post+question, "output": reasoning}
        data_list.append(data_dict)

json_data = json.dumps(data_list, indent=4)

with open(".//mental_dataset//mental_istct_test_SAD_without_label.json", "w") as json_file:
    json_file.write(json_data)

print("mental_istct_test_SAD_without_label.json has been saved")
print(len(data_list))

mental_istct_test_SAD_without_label.json has been saved
684


We can try to evaluate the results on a single dataset.

In [8]:
import json
from functions import *

dataset_list = ['DR', 'dreaddit', 'Irf', 'MultiWD', 'SAD']

# Evaluate a single sub-dataset; index 4 selects 'SAD'.
dataset_name = dataset_list[4]
generated = {dataset_name: []}
golden = {dataset_name: []}
path = 'D://Mental-checkpoint//' + dataset_name + '.jsonl'

# Each non-empty line is one JSON object; its 'predict' and 'label' fields
# feed the generated and golden lists respectively.
with open(path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank line would make json.loads raise; skip it instead.
            continue
        data = json.loads(line)
        generated[dataset_name].append(data['predict'])
        golden[dataset_name].append(data['label'])

# calculate_f1 is not defined in this notebook; presumably it comes from the
# star import of `functions` - verify there for the expected input format.
calculate_f1(generated, golden)

{'SAD': {'avg_accuracy': 63.89,
  'weighted_f1': 61.41,
  'micro_f1': 63.89,
  'macro_f1': 59.39}}

We can also try to evaluate the results on all the sub-datasets.

In [6]:
import json
from functions import *

# Split the generated results back into one slice per sub-dataset.  The slice
# bounds are running totals of the per-dataset record counts, which assumes
# the predictions file holds the sub-datasets back to back in dataset_list
# order - TODO confirm against the run that produced the file.
dataset_list = ['DR', 'dreaddit', 'Irf', 'MultiWD', 'SAD']
dataset_sizes = [405, 414, 2113, 2441, 684]

start_index = []
end_index = []
offset = 0
for size in dataset_sizes:
    start_index.append(offset)
    offset += size
    end_index.append(offset)

generated = {}
golden = {}
dataset_name = "generated_predictions"

path = 'E://python_prj_D//MentalLLaMA//output//bf16//' + dataset_name + '.jsonl'

# One JSON object per non-empty line; blank lines are skipped so they neither
# crash json.loads nor shift the slice bounds.
with open(path, 'r', encoding='utf-8') as file:
    all_data = [json.loads(line) for line in file if line.strip()]

# With a different record count the slices below would be misaligned or
# silently truncated, and the metrics would be computed on the wrong rows.
if len(all_data) != offset:
    raise ValueError(
        "Expected {} predictions in {}, found {}".format(offset, path, len(all_data))
    )

for start, end, name in zip(start_index, end_index, dataset_list):
    data = all_data[start:end]
    generated[name] = [record['predict'] for record in data]
    golden[name] = [record['label'] for record in data]

# calculate_f1 is not defined in this notebook; presumably it comes from the
# star import of `functions` - verify there for the expected input format.
calculate_f1(generated, golden)

Dataset: DR, average acc:74.32, weighted F1 74.23, micro F1 74.32, macro F1 64.75, OOD count: 0

Dataset: dreaddit, average acc:78.02, weighted F1 77.97, micro F1 78.02, macro F1 77.95, OOD count: 2

Dataset: Irf, average acc:70.18, weighted F1 69.62, micro F1 70.18, macro F1 66.23, OOD count: 8

Dataset: MultiWD, average acc:65.18, weighted F1 65.82, micro F1 65.18, macro F1 64.72, OOD count: 4

Dataset: SAD, average acc:62.87, weighted F1 61.1, micro F1 62.87, macro F1 58.96, OOD count: 81

