In [3]:
import os
import json
import numpy as np
from itertools import combinations
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
from main import calculate_pearson, calculatePRF_MLabel
import pandas as pd

In [12]:
def regression_ensemble_sweep(split=None, task='empathy', post_process=True, reduction='mean', choices='all', ):
    """Rank mean-ensembles of saved regression predictions on the dev set.

    Every file in ``./{task}/dev/`` whose name contains ``"pearson"`` is read
    as one JSON list per line. The predictions of each candidate combination
    of files are averaged element-wise and scored with ``calculate_pearson``
    against the gold ``task`` values loaded from the dev JSON file.

    Args:
        split: Optional list of row indices. When non-empty, both the gold
            values and the averaged predictions are restricted to these rows
            before scoring.
        task: Key read from every gold record; also the prediction directory.
        post_process: When true, clip every prediction to ``[1.0, 7.0]``.
        reduction: Only ``'mean'`` is implemented.
        choices: ``'sweep'`` scores every non-empty subset of the files;
            ``'all'`` scores the single ensemble of all files; ``'MT'`` and
            ``'base'`` score the single ensemble of the files whose name
            contains that tag. Any other value selects no file.

    Returns:
        A list of dicts with keys ``ensemble_pred_result`` (the averaged
        predictions for all rows, not only ``split``), ``ensemble_file_names``
        and ``ensemble_metric``, sorted by ``ensemble_metric`` descending.
        The list is empty when no prediction file was selected.

    Raises:
        NotImplementedError: If ``reduction`` is not ``'mean'``.
    """
    if reduction != 'mean':
        # Fail loudly instead of silently returning an empty ranking.
        raise NotImplementedError(f"unsupported reduction: {reduction!r}")

    dir_path = f"./{task}/dev/"
    gold_dev_file_path = "/users10/zjli/workspace/WASSA/new_data/2023/dev.json"
    with open(gold_dev_file_path) as f:
        gold_dev_results = json.load(f)
    gold = [item[task] for item in gold_dev_results]
    if split:
        gold = [gold[i] for i in split]

    all_pred_results = []
    for file_name in sorted(os.listdir(dir_path)):
        if "pearson" not in file_name:
            continue
        keep_every_file = choices in ('sweep', 'all')
        keep_tagged_file = choices in ('MT', 'base') and choices in file_name
        if not (keep_every_file or keep_tagged_file):
            continue
        pre_result = []
        with open(os.path.join(dir_path, file_name)) as f:
            for line in f:
                values = json.loads(line)
                if post_process:
                    # Clip every value that gets appended, not only the first one.
                    # NOTE(review): lines presumably hold a single value each - confirm.
                    values = [min(max(value, 1.0), 7.0) for value in values]
                pre_result.extend(values)
        all_pred_results.append({'pre_result': pre_result, 'file_name': file_name})

    # 'sweep' tries every subset size; otherwise only the full set is scored.
    # Size 0 is always excluded: the mean of an empty ensemble is undefined.
    total = len(all_pred_results)
    smallest_size = 1 if choices == 'sweep' else max(total, 1)
    all_combined_results = [
        combination
        for size in range(smallest_size, total + 1)
        for combination in combinations(all_pred_results, size)
    ]

    final_results = []
    for combined_result in tqdm(all_combined_results):
        stacked = np.array([pred_result['pre_result'] for pred_result in combined_result])
        ensemble_result_array = np.mean(stacked, axis=0)
        ensemble_file_names = [pred_result['file_name'] for pred_result in combined_result]
        pred = [ensemble_result_array[i] for i in split] if split else ensemble_result_array
        final_results.append({
            "ensemble_pred_result": ensemble_result_array,
            "ensemble_file_names": ensemble_file_names,
            "ensemble_metric": calculate_pearson(gold, pred),
        })

    return sorted(final_results, key=lambda k: k['ensemble_metric'], reverse=True)

In [None]:
# Best (first-ranked) distress ensemble with the default settings.
best_distress_ensemble = regression_ensemble_sweep(task='distress')[0]
print(best_distress_ensemble['ensemble_metric'])

In [None]:
# distress: top metric for all files, the multi-task (MT) files, and the roberta-base files
for choice in ('all', 'MT', 'base'):
    ranked = regression_ensemble_sweep(task='distress', choices=choice)
    print(ranked[0]['ensemble_metric'])


In [None]:
# empathy: top metric for all files, the multi-task (MT) files, and the roberta-base files
for choice in ('all', 'MT', 'base'):
    ranked = regression_ensemble_sweep(task='empathy', choices=choice)
    print(ranked[0]['ensemble_metric'])


In [19]:
def classification_ensemble_sweep(split=None, task='emotion', post_process=True, reduction='label_mean', choices='all'):
    """Rank ensembles of saved multi-label classification predictions on the dev set.

    Every file in ``./{task}/dev`` whose name contains ``"macro_F"`` is read as
    one JSON object per line with the keys ``prob`` and ``p_label``. Each
    candidate combination of files is reduced to one label array and scored
    with ``calculatePRF_MLabel`` against the gold ``task`` labels.

    Args:
        split: Optional list of row indices. When non-empty, gold labels and
            ensemble labels are restricted to these rows before scoring.
        task: Key read from every gold record; also the prediction directory.
        post_process: When true, any row whose ensemble labels sum to zero gets
            a 1 at the position with the highest mean probability.
        reduction: ``'label_mean'`` takes, per position, the most frequent
            label value across the files (ties go to the smaller value);
            ``'prob_mean'`` thresholds the mean probability at ``> 0.5``.
        choices: ``'sweep'`` scores every non-empty subset of the files; any
            other value scores only the ensemble of all files.

    Returns:
        A list of dicts with keys ``ensemble_pred_result`` (labels for all
        rows, not only ``split``), ``ensemble_file_names`` and
        ``ensemble_metric``, sorted by ``ensemble_metric`` descending. The
        list is empty when no prediction file was found.

    Raises:
        NotImplementedError: If ``reduction`` is not one of the two above.
    """
    if reduction not in ('label_mean', 'prob_mean'):
        # Checked up front so a typo fails before any file is read.
        raise NotImplementedError(f"unsupported reduction: {reduction!r}")

    dir_path = f"./{task}/dev"
    gold_dev_file_path = "/users10/zjli/workspace/WASSA/new_data/2023/dev.json"
    with open(gold_dev_file_path) as f:
        gold_dev_results = json.load(f)
    gold = [item[task] for item in gold_dev_results]
    if split:
        gold = [gold[i] for i in split]

    all_pred_results = []
    for file_name in sorted(os.listdir(dir_path)):
        if "macro_F" not in file_name:
            continue
        pred_prob = []
        pred_label = []
        with open(os.path.join(dir_path, file_name)) as f:
            for line in f:
                record = json.loads(line)
                pred_prob.append(record['prob'])
                pred_label.append(record['p_label'])
        all_pred_results.append({'pred_prob': pred_prob, 'pred_label': pred_label, 'file_name': file_name})

    # 'sweep' tries every subset size; otherwise only the full set is scored.
    # Size 0 is always excluded: an empty ensemble has nothing to reduce.
    total = len(all_pred_results)
    smallest_size = 1 if choices == 'sweep' else max(total, 1)
    all_combined_results = [
        combination
        for size in range(smallest_size, total + 1)
        for combination in combinations(all_pred_results, size)
    ]

    final_results = []
    for combined_result in tqdm(all_combined_results):
        # Mean probability across the files of this combination.
        ensemble_prob_result_array = np.mean(
            np.array([pred_result['pred_prob'] for pred_result in combined_result]), axis=0)

        if reduction == 'label_mean':
            stacked_labels = np.array([pred_result['pred_label'] for pred_result in combined_result])
            # Majority vote per position across the files.
            ensemble_label_result_array = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x)), axis=0, arr=stacked_labels)
        else:  # 'prob_mean'
            ensemble_label_result_array = np.where(ensemble_prob_result_array > 0.5, 1, 0)

        if post_process:
            # Rows without any predicted label fall back to the most probable one.
            # Assumes the arrays are (n_rows, n_labels), as the row-wise indexing requires.
            empty_rows = ensemble_label_result_array.sum(axis=1) == 0
            fallback_label = np.argmax(ensemble_prob_result_array[empty_rows], axis=1)
            ensemble_label_result_array[empty_rows, fallback_label] = 1

        ensemble_file_names = [pred_result['file_name'] for pred_result in combined_result]
        pred = [ensemble_label_result_array[i] for i in split] if split else ensemble_label_result_array
        final_results.append({
            "ensemble_pred_result": ensemble_label_result_array,
            "ensemble_file_names": ensemble_file_names,
            "ensemble_metric": calculatePRF_MLabel(gold, pred),
        })

    return sorted(final_results, key=lambda k: k['ensemble_metric'], reverse=True)

In [None]:
# `split` must be a list of row indices; a string such as 'dev' would be used
# to index the gold list and raise TypeError. Omit it to score the full dev set.
# label_mean: ensemble of all files on the full dev set
print(classification_ensemble_sweep(task='emotion', choices='all', reduction='label_mean')[0]['ensemble_metric'])
# prob_mean: ensemble of all files on the full dev set
print(classification_ensemble_sweep(task='emotion', choices='all', reduction='prob_mean')[0]['ensemble_metric'])

# 划分验证集评估以验证不同ensemble的鲁棒性

期望：在不同的划分上，模型的效果近似，而不是偏向于某一种划分

In [5]:
# Essay-level dev table (tab-separated, first row is the header).
dev_tsv_path = "/users10/zjli/workspace/WASSA/new_data/2023/WASSA23_essay_level_dev.tsv"
df = pd.read_csv(dev_tsv_path, sep='\t', header=0)

In [8]:
# For each column, print the sorted values found at 1/4, 2/4 and 3/4 of the
# table length, with a separator line between columns.
separator = '##########################################'
for position, column in enumerate(['article_id', 'gender', 'age', 'education', 'income']):
    if position > 0:
        print(separator)
    ordered_values = sorted(list(df[column]))
    for quarter in (1, 2, 3):
        print(ordered_values[len(df) * quarter // 4])


73
163
292
##########################################
1
1
2
##########################################
25
29
33
##########################################
3
4
6
##########################################
25000
30000
55000


In [11]:
def split_dev(df, key, start, end):
    """Return the index labels of the entries of ``df[key]`` lying in ``[start, end]``.

    Both bounds are inclusive. ``df[key]`` only needs an ``items()`` method
    yielding ``(index, value)`` pairs; the result keeps that iteration order.
    """
    return [label for label, value in df[key].items() if start <= value <= end]

In [20]:
def dev_split_metric(pred, gold, split_indexs, task):
    """Score the predictions on a subset of rows with the metric of ``task``.

    Args:
        pred: Predictions, indexable by the values in ``split_indexs``.
        gold: Gold values, indexable the same way.
        split_indexs: Indices of the rows to evaluate.
        task: ``'emotion'`` is scored with ``calculatePRF_MLabel``;
            ``'empathy'`` and ``'distress'`` with ``calculate_pearson``.

    Returns:
        Whatever the selected metric function returns for ``(gold, pred)``
        restricted to ``split_indexs``.

    Raises:
        ValueError: If ``task`` is not one of the three supported tasks
            (previously this surfaced as an UnboundLocalError on ``metric``).
    """
    if task == 'emotion':
        score = calculatePRF_MLabel
    elif task in ('empathy', 'distress'):
        score = calculate_pearson
    else:
        raise ValueError(f"unsupported task: {task!r}")
    s_pred = [pred[idx] for idx in split_indexs]
    s_gold = [gold[idx] for idx in split_indexs]
    return score(s_gold, s_pred)

In [12]:
import os
gold_dev_file_path = "/users10/zjli/workspace/WASSA/new_data/2023/dev.json"

# Inclusive (start, end) bounds of four buckets per column. The cut points are
# the values printed by the quartile cell; the last bucket is open-ended.
key_se = {"article_id":[(0,73),(74,163),(164,292),(293,1000000000000)], 
          "age":[(0,25),(26,29),(30,33),(34,10000000000000)],
        "education":[(0,3),(4,4),(5,6),(7,1000000000000)],
        "income":[(0,25000),(25001,30000),(30001,55000),(55001,1000000000000)]}

# The rows of each bucket depend only on `df`, so compute them once instead of
# once per prediction file and task. Keys double as output column names.
split_indexs_by_column = {}
for key in ["article_id", "age", "education", "income"]:
    for idx, (start, end) in enumerate(key_se[key]):
        split_indexs_by_column[f'{key}_{idx}'] = split_dev(df, key, start, end)

# The gold file is the same for every task; read it once.
with open(gold_dev_file_path) as f:
    gold_dev_results = json.load(f)

for task in ['emotion', 'empathy', 'distress']:
    gold = [item[task] for item in gold_dev_results]
    file_names = sorted(os.listdir(f'./{task}/dev/'))
    metrics = []
    for file_name in file_names:
        if "macro_F" in file_name or 'pearson' in file_name:
            pred_label = []
            with open(os.path.join(f'./{task}/dev/', file_name)) as f:
                for line in f:
                    line = json.loads(line)
                    if task == 'emotion':
                        if sum(line['p_label']) == 0:
                            # No label predicted: fall back to the most probable one.
                            max_index = np.argmax(line['prob'])
                            tmp = [0] * len(line['p_label'])
                            tmp[max_index] = 1
                        else:
                            tmp = line['p_label']
                        pred_label.append(tmp)
                    elif task in ['empathy', 'distress']:
                        # Clip the regression output to the [1, 7] range.
                        if line[0] < 1:
                            val = 1.0
                        elif line[0] > 7:
                            val = 7.0
                        else:
                            val = line[0]
                        pred_label.append(val)
            # One row per prediction file, one column per demographic bucket.
            row_metric = {'file_name':file_name}
            for column_name, split_indexs in split_indexs_by_column.items():
                row_metric[column_name] = dev_split_metric(pred_label, gold, split_indexs, task)
            metrics.append(row_metric)
    wdf = pd.DataFrame(metrics)
    # Write the DataFrame to a TSV file
    wdf.to_csv(f'./{task}/contrast_result.tsv', sep='\t', index=False)
                            

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

## 按照 article_id、gender、age、education、income 划分

In [9]:
# Split the dev rows in two for each column: *_set_1 holds the index labels
# that satisfy the condition, *_set_2 all remaining ones. After each split the
# size difference (set_2 - set_1) is printed.
article_id_set_1 = [idx for idx, item in df['article_id'].items() if item <= 163]
article_id_set_2 = [idx for idx, item in df['article_id'].items() if not item <= 163]
print(len(article_id_set_2) - len(article_id_set_1))

gender_set_1 = [idx for idx, item in df['gender'].items() if item == 1]
gender_set_2 = [idx for idx, item in df['gender'].items() if not item == 1]
print(len(gender_set_2) - len(gender_set_1))

age_set_1 = [idx for idx, item in df['age'].items() if item <= 29]
age_set_2 = [idx for idx, item in df['age'].items() if not item <= 29]
print(len(age_set_2) - len(age_set_1))

education_set_1 = [idx for idx, item in df['education'].items() if item <= 4]
education_set_2 = [idx for idx, item in df['education'].items() if not item <= 4]
print(len(education_set_2) - len(education_set_1))

income_set_1 = [idx for idx, item in df['income'].items() if item <= 30000]
income_set_2 = [idx for idx, item in df['income'].items() if not item <= 30000]
print(len(income_set_2) - len(income_set_1))

-4
-12
-40
-34
-2


In [10]:
# Name -> list of dev row indices, for the two halves of every split.
dev_set_splits = dict(
    article_id_set_1=article_id_set_1,
    article_id_set_2=article_id_set_2,
    gender_set_1=gender_set_1,
    gender_set_2=gender_set_2,
    age_set_1=age_set_1,
    age_set_2=age_set_2,
    education_set_1=education_set_1,
    education_set_2=education_set_2,
    income_set_1=income_set_1,
    income_set_2=income_set_2,
)

In [20]:
# For every dev split, run the full subset sweep for each task and keep the
# file-name lists of the 100 best-ranked ensembles.
emp_results = {}
dis_results = {}
emo_results = {}
for split_name, split_set in dev_set_splits.items():
    empathy_result = regression_ensemble_sweep(split=split_set, choices='sweep', task='empathy')
    distress_result = regression_ensemble_sweep(split=split_set, choices='sweep', task='distress')
    emotion_result = classification_ensemble_sweep(split=split_set, choices='sweep')
    for store, ranked in ((emp_results, empathy_result),
                          (dis_results, distress_result),
                          (emo_results, emotion_result)):
        store[split_name] = [entry['ensemble_file_names'] for entry in ranked[:100]]


  0%|          | 0/524287 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [18]:
# Persist the per-split top-100 ensemble file-name lists, one JSON file per task.
# open(..., 'w') does not create missing directories, so make sure it exists.
os.makedirs("./result", exist_ok=True)
for task_name, task_results in (("empathy", emp_results),
                                ("distress", dis_results),
                                ("emotion", emo_results)):
    with open(f"./result/{task_name}_100.json", 'w') as f:
        json.dump(task_results, f)

In [5]:
# Three independent, initially empty result containers (one per task).
emotion_results = {}
empathy_results = {}
distress_results = {}

In [None]:
# The sweep takes ONE list of row indices and returns ONE ranked list, so it
# is called once per half of the article_id split.
final_result1 = classification_ensemble_sweep(split=article_id_set_1, choices='sweep')
final_result2 = classification_ensemble_sweep(split=article_id_set_2, choices='sweep')

In [None]:
# The sweep takes ONE list of row indices and returns ONE ranked list, so it
# is called once per half of the article_id split.
final_result1 = regression_ensemble_sweep(split=article_id_set_1, choices='sweep', task='empathy')
final_result2 = regression_ensemble_sweep(split=article_id_set_2, choices='sweep', task='empathy')

In [None]:
# The sweep takes ONE list of row indices and returns ONE ranked list, so it
# is called once per half of the article_id split.
final_result1 = regression_ensemble_sweep(split=article_id_set_1, choices='sweep', task='distress')
final_result2 = regression_ensemble_sweep(split=article_id_set_2, choices='sweep', task='distress')

In [15]:
import json
from pprint import pprint
tasks = ['distress', 'empathy', 'emotion']

# For each task: collect every ensemble that reached the saved top-100 of at
# least one dev split, record its rank in every split (100000 when it is
# absent from that split's list), and show the five with the lowest rank sum.
for task in tasks:
    with open(f'./result/{task}_100.json') as f:
        data = json.load(f)

    # Tuples are hashable, so they can key a dict directly; this avoids the
    # str()/eval() round trip on file-name lists.
    ranked_groups_per_split = [
        [tuple(file_name_group) for file_name_group in file_name_groups]
        for file_name_groups in data.values()  # one ranked list per dev split
    ]

    # A dict keeps first-seen order, so ties in the final sort are broken the
    # same way on every run (iterating a set of strings is not stable).
    candidate_file_name_weights = {}
    for ranked_groups in ranked_groups_per_split:
        for group in ranked_groups:
            candidate_file_name_weights.setdefault(group, [])
    print(f"{task}: {len(candidate_file_name_weights)}")

    for ranked_groups in ranked_groups_per_split:
        rank_of = {}
        for rank, group in enumerate(ranked_groups):
            rank_of.setdefault(group, rank)  # first occurrence, like list.index
        for group, ranks in candidate_file_name_weights.items():
            ranks.append(rank_of.get(group, 100000))

    candidate_file_name_weights = sorted(candidate_file_name_weights.items(), key=lambda k: sum(k[1]))
    pprint([(list(item[0]), item[1]) for item in candidate_file_name_weights[:5]])

distress: 678
[(['MT_1_pearson_0.626.json', 'roberta-base_0_pearson_0.6346.json'],
  [100000, 90, 4, 100000, 42, 42, 100000, 1, 100000, 0]),
 (['MT_1_pearson_0.626.json',
   'roberta-base_0_pearson_0.6346.json',
   'roberta-base_9_pearson_0.606.json'],
  [100000, 100000, 7, 100000, 100000, 9, 100000, 4, 100000, 2]),
 (['MT_0_pearson_0.6059.json',
   'MT_3_pearson_0.5949.json',
   'roberta-base_0_pearson_0.6346.json',
   'roberta-base_7_pearson_0.6047.json'],
  [13, 100000, 100000, 100000, 3, 100000, 9, 100000, 6, 100000]),
 (['MT_0_pearson_0.6059.json',
   'MT_2_pearson_0.6052.json',
   'MT_3_pearson_0.5949.json',
   'roberta-base_0_pearson_0.6346.json'],
  [100000, 100000, 100000, 15, 7, 100000, 28, 100000, 2, 100000]),
 (['MT_0_pearson_0.6059.json',
   'MT_3_pearson_0.5949.json',
   'roberta-base_0_pearson_0.6346.json'],
  [100000, 100000, 100000, 54, 0, 100000, 4, 100000, 1, 100000])]
empathy: 930
[(['MT_3_pearson_0.6322.json',
   'roberta-base_14_pearson_0.6367.json',
   'roberta-b

# 按照label划分

In [22]:
import os
gold_dev_file_path = "/users10/zjli/workspace/WASSA/new_data/2023/dev.json"

from sklearn.metrics import classification_report

# Per-label F1 of every saved emotion prediction file, written as one TSV row
# per file.
num_labels = 8  # label columns read back from the classification report

for task in ['emotion']:
    with open(gold_dev_file_path) as f:
        gold_dev_results = json.load(f)
        gold = [item[task] for item in gold_dev_results]
    file_names = sorted(os.listdir(f'./{task}/dev/'))
    metrics = []
    for file_name in file_names:
        if "macro_F" in file_name or 'pearson' in file_name:
            pred_label = []
            with open(os.path.join(f'./{task}/dev/', file_name)) as f:
                for line in f:
                    line = json.loads(line)
                    if sum(line['p_label']) == 0:
                        # No label predicted: fall back to the most probable one.
                        max_index = np.argmax(line['prob'])
                        tmp = [0] * num_labels
                        tmp[max_index] = 1
                    else:
                        tmp = line['p_label']
                    pred_label.append(tmp)
            row_metric = {'file_name':file_name}
            report = classification_report(gold, pred_label, output_dict=True)
            for i in range(num_labels):
                # Name the columns after the task. The previous `key` was not
                # defined in this cell and only existed as a leftover loop
                # variable from an earlier cell.
                row_metric[f'{task}_{i}'] = report[str(i)]['f1-score']
            metrics.append(row_metric)
    wdf = pd.DataFrame(metrics)
    # Write the DataFrame to a TSV file
    wdf.to_csv(f'./{task}/emo_contrast_result.tsv', sep='\t', index=False)
                            

In [18]:
def split_dev(df, emo_idx):
    """Return the positions of the items in ``df`` whose entry ``emo_idx`` equals 1.

    ``df`` is any iterable of indexable items; positions are 0-based
    enumeration indices. NOTE: this redefines the earlier
    ``split_dev(df, key, start, end)`` of this notebook under the same name.
    """
    return [position for position, labels in enumerate(df) if labels[emo_idx] == 1]