In [252]:
import pandas as pd
import json
from statsmodels.stats.inter_rater import fleiss_kappa,aggregate_raters
import numpy as np
from collections import Counter
from scipy import stats
import krippendorff
import os


In [253]:
# Directory that is scanned for the '.json' annotation files, relative to this notebook.
# NOTE(review): the name `dir` shadows the Python builtin of the same name; it is
# kept unchanged because later cells read it.
dir = '../phase2_annotations_part1'

In [None]:
# os.listdir returns entries in arbitrary order; sorting keeps the batch order
# (and therefore every positional lookup into the per-test batch lists in later
# cells) stable from run to run.
annotation_files = sorted(file for file in os.listdir(dir) if file.endswith('.json'))
annotation_files

In [None]:
test_types = set()
for file in annotation_files:
    task = file.split('-')[1]
    task = task.replace('.json', '')
    # print(task)
    test_types.add(task)
len(test_types)

In [256]:
# Load every annotation file into one dict keyed by the file name up to '.json'.
annotations_all = {}
for file in annotation_files:
    with open(os.path.join(dir, file), 'rb') as f:
        annotations_all[file.split('.json')[0]] = json.load(f)

In [None]:
len(annotations_all)

In [258]:
# Presumably the names of the downstream tasks — TODO confirm; not read by any
# cell visible in this notebook.
# NOTE(review): the dialogue task is spelled 'dialogue_contradiction_detection'
# here, while the label-lookup cells further down compare against
# 'dialog_contradiction_detection' — confirm which spelling the batch names use.
tasks_name = ['conversational_qa', 'coreference_resolution', 'dialogue_contradiction_detection', 'named_entity_recognition', 'sentiment_analysis']

In [259]:
# Test names recognised by construct_data: a batch is kept only when the first
# '-'-separated field of its name equals one of these entries.
modification_name = ['active_to_passive', 'capitalization', 'casual','compound_word', 'concept_replacement', 'coordinating_conjunction', 'derivation', 'dialectal', 'discourse', 'geographical_bias', 'grammatical_role', 'length_bias', 'negation', 'punctuation', 'sentiment', 'temporal_bias', 'typo_bias']

In [260]:
# Tests whose non-control rows keep a dedicated subtest label in construct_data;
# for every other test the subtest is overwritten with the test name itself.
have_subtest = ['concept_replacement', 'dialectal', 'discourse', 'negation']

In [None]:
len(annotations_all['grammatical_role-phaseTwo1103-coreference_resolution-batch2']['5c16a14c5ca98a00018d30c4']['answers'])

In [None]:
len(annotations_all['grammatical_role-phaseTwo1103-coreference_resolution-batch2']['5fcfedeec803e31be21a9483']['answers'])

In [263]:
def construct_data(filter_list, filter = False):
    annotation_dict = {}
    for task in modification_name:
        
        annotation_dict[task] = []
        # test_name = task
        
    for batch_name in annotations_all.keys():
        # print(batch_name)
        task = batch_name.split('-')[0]
        bigtask_name = batch_name.split('-')[2]
        test_name = task 
        release = batch_name.split('-')[1]
        
        for task_name in modification_name:
            if task_name == task:
                
                df_task = []
                for annotator, value in annotations_all[batch_name].items():
                    if filter:
                        if annotator in filter_list:
                            continue
                    # for sample in annotator['answers']:
                    # print(annotator)
                    # print(value['answers'])
                    df = pd.DataFrame(value['answers'])
                    subtests = []
                    tests = [test_name]*len(df)
                    tasks = [bigtask_name]*len(df)
                    new_row = {}
                    for i, row in df.iterrows():
                        if row['is_control'] == True:
                            subtest = 'control'
                        else:
                            if release == 'release0808':
                                if task_name == 'concept_replacement':
                                    subtest = 'synonym'
                                elif task_name == 'discourse':
                                    subtest = 'reverse'
                                elif task_name == 'negation':
                                    subtest = 'verbal'
                                else:
                                    subtest = ''
                            else:
                                subtest = row['original_question'].get('type','')
                            if test_name not in have_subtest:
                                subtest = test_name
                        subtests.append(subtest)
                    if len(df) == 22:
                        worker_id = list(df['worker_id'])[0]
                        if df['is_control'].value_counts().get(True) < 3:
                            is_control = True
                            answer = 1
                            subtest = 'control'
                        else:
                            is_control = False
                            answer = 0
                        # if bigtask_name == 'active_to_passive':
                            # print(df)
                        new_row = {'time': None, 'instance_index': len(df)+1, 'worker_id': worker_id,	'explanation': None, 'answer': str(answer),	'original_question': 0,	'is_control':is_control, 'subtest': subtest, 'test': test_name, 'task': bigtask_name}
                        
                    elif len(df) < 16:
                        print(task, annotator) #some batches with less than 12 questions
                        continue
                    df['test'] = tests
                    df['task'] = tasks
                    df['subtest'] = subtests
                    if len(df) == 22:
                        new_row_df = pd.DataFrame([new_row])
                        df = pd.concat([df,new_row_df], ignore_index = True)
                    df_task.append(df)
                if len(df_task) <= 1:
                    continue
                df_task = pd.concat(df_task)    
                annotation_dict[task_name].append(df_task)
        # break
    return annotation_dict


In [None]:
annotations = construct_data(filter_list = None)

In [None]:
annotations['grammatical_role'][1]

In [None]:
annotations['grammatical_role'][2]['subtest'].value_counts()

In [267]:
def find_modes_and_counts(arr):
    """Return the most frequent value of every column and how often it occurs.

    ``arr`` is an iterable of equally long rows; it is converted to a 2-D numpy
    array. When several values tie for the top count in a column, the first one
    in ``np.unique`` order (the smallest) is reported.

    Returns:
        (modes, counts): two 1-D numpy arrays with one entry per column.
    """
    matrix = np.array(list(arr))
    _n_rows, n_cols = matrix.shape

    modes, counts = [], []
    for col in range(n_cols):
        values, occurrences = np.unique(matrix[:, col], return_counts=True)
        # argmax picks the first maximum, i.e. the smallest of the tied values
        top = occurrences.argmax()
        modes.append(values[top])
        counts.append(occurrences[top])

    return np.array(modes), np.array(counts)


In [268]:
def string_xor_logical(arr1, arr2):
    """Element-wise logical XOR of two array-likes.

    Both inputs are passed through ``np.asarray`` and handed to
    ``np.logical_xor`` as they are; no string-to-boolean conversion is done here.
    """
    left, right = np.asarray(arr1), np.asarray(arr2)
    return np.logical_xor(left, right)


In [269]:
def calculate_krippendof_for_each_batch(df_batch):
    """Krippendorff's alpha (nominal) for one batch.

    The batch is grouped by 'worker_id'; every worker's 'answer' column, cast to
    str, becomes one row of the matrix handed to ``krippendorff.alpha``.
    """
    per_worker = [
        np.array(frame['answer']).astype(str)
        for _, frame in df_batch.groupby('worker_id')
    ]
    ratings = np.array(per_worker)
    return krippendorff.alpha(ratings, level_of_measurement='nominal')

In [270]:
def calculate_fleiss_kappa_for_each_batch(df_batch):
    """Fleiss' kappa (method='randolph') for one batch.

    The batch is grouped by 'worker_id'; every worker's 'answer' column, cast to
    str, becomes one row. The matrix is transposed before it is passed to
    ``aggregate_raters``, whose table feeds ``fleiss_kappa``.
    """
    per_worker = [
        np.array(frame['answer']).astype(str)
        for _, frame in df_batch.groupby('worker_id')
    ]
    by_item = np.array(per_worker).T
    table, _categories = aggregate_raters(by_item)
    return fleiss_kappa(table, method='randolph')

In [271]:
def get_majority(batch, df_performance = None):
    """Position-wise majority answer over the annotators of one batch.

    Args:
        batch: DataFrame with at least 'worker_id', 'answer' and 'is_control'.
        df_performance: optional DataFrame with 'annotator_id' and
            'success_rate_control_only'. When it is given and non-empty, items
            whose top answer is shared by at most half of the annotators take
            the answer of the annotator with the highest score instead.
            ``None`` is treated like an empty frame.

    Returns:
        (modes, modes_no_control, modes_control_only, counts): modes over all
        answers, over non-control answers, over control answers, and the vote
        count behind each entry of ``modes`` (counted before any tie-break).
        Only ``modes`` receives the tie-break.
    """
    # The declared default (None) used to crash on `.empty`.
    has_performance = df_performance is not None and not df_performance.empty
    groups = list(batch.groupby('worker_id'))

    annotation_data = []
    annotation_data_no_control = []
    annotation_data_control_only = []
    best_score = 0
    best_preds = None
    for worker_id, frame in groups:
        preds = list(frame['answer'])
        if has_performance:
            scores = df_performance[df_performance['annotator_id'] == worker_id]['success_rate_control_only'].values
            # Annotators missing from the table are simply not candidates.
            if len(scores) > 0 and scores[0] > best_score:
                best_score = scores[0]
                best_preds = preds

        annotation_data.append(preds)
        annotation_data_no_control.append(frame[frame['is_control'] != True]['answer'])
        annotation_data_control_only.append(frame[frame['is_control'] == True]['answer'])

    modes, counts = find_modes_and_counts(annotation_data)
    modes_no_control, _ = find_modes_and_counts(annotation_data_no_control)

    # best_preds stays None when no annotator has a score above 0; the plain
    # mode is kept in that case instead of indexing None.
    if has_performance and best_preds is not None:
        half = len(groups) // 2
        for i, count in enumerate(counts):
            if count <= half:
                modes[i] = best_preds[i]

    modes_control_only, _ = find_modes_and_counts(annotation_data_control_only)
    return modes, modes_no_control, modes_control_only, counts

In [272]:
def calculate_success_rate_majority(batch, df_performance = None):
    """Score every annotator of a batch against the majority answers.

    The majority answers come from ``get_majority(batch, df_performance)``. For
    each 'worker_id' group the fraction of answers equal to the majority is
    computed three times: over all answers, over non-control answers, and over
    control answers only.

    Returns a 9-tuple:
        mean success rate (all answers), mean success rate (non-control),
        mean success rate (control only), mean of the 'gpt4' rate (fraction of
        non-zero entries in the non-control modes), majority_rate,
        retain_samples (list of dicts for non-control rows), total_count,
        retain_count, and the list of per-annotator result dicts.
    """
    rates = []
    rates_no_control = []
    rates_control_only = []
    rates_gpt4 = []
    modes, modes_no_control, modes_control_only, majority_counts = get_majority(batch, df_performance)
    
    df_annotator = []
    # An item counts as having a majority when more than half the annotators agree.
    majority = len(batch.groupby('worker_id')) // 2
    retain_count = 0
    total_count = 0
    # One slot per item; slots of control / synthetic rows stay None.
    retain_samples = [None]*len(modes)
    
    for j,group in enumerate(batch.groupby('worker_id')):
        # `i` is the row's index label and is used as a position into `modes`
        # and `retain_samples`.
        # NOTE(review): this assumes each annotator's rows are labelled 0..n-1 — confirm.
        for i,sample in group[1].iterrows():
            if sample['is_control'] == False:
                item = {}
                # Rows whose original_question is an int are skipped.
                if type(sample['original_question']) == int:
                        continue
                item['original_question'] = sample['original_question']                       
                item['task'] = sample['task']
                item['test'] = sample['test']
                item['subtest'] = sample['subtest']
                # The stored label is the majority answer, not this annotator's answer.
                item['label'] = modes[i]
                item['explanation'] = sample['explanation']
                # Later annotators overwrite the slot written by earlier ones.
                retain_samples[i] = item
        retain = True
        answers = np.array(list(group[1]['answer']))
        answers_no_control = np.array(group[1][group[1]['is_control'] != True]['answer']) 
        answers_control_only = np.array(group[1][group[1]['is_control'] == True]['answer'])

        
        correct_answer = answers == modes
        success_rate = np.count_nonzero(correct_answer) / len(answers)

        correct_answer_no_control =  answers_no_control == modes_no_control

        success_rate_no_control = np.count_nonzero(correct_answer_no_control) / len(answers_no_control)

        correct_answer_control_only = answers_control_only == modes_control_only
        success_rate_control_only = np.count_nonzero(correct_answer_control_only) / len(answers_control_only)

        # Does not depend on the annotator: the same value is appended once per group.
        success_rate_gpt4 = np.count_nonzero(modes_no_control) / len(modes_no_control)
        # NOTE(review): compares against the integer 0 — confirm the answers are
        # numeric, otherwise this count is always 0.
        negative_count = np.count_nonzero(answers_no_control == 0) 
        df_annotator_row = {}
        df_annotator_row['annotator_id'] = group[0]
        df_annotator_row['success_rate_with_control'] = success_rate
        df_annotator_row['success_rate_without_control'] = success_rate_no_control
        df_annotator_row['success_rate_control_only'] = success_rate_control_only
        df_annotator_row['negative_count'] = negative_count
        df_annotator.append(df_annotator_row)

        rates.append(success_rate)
        rates_no_control.append(success_rate_no_control)
        rates_control_only.append(success_rate_control_only)
        rates_gpt4.append(success_rate_gpt4)

    # `answers` and `answers_no_control` below are the values left over from the
    # last annotator of the loop above.
    majority_rate = (majority_counts>majority).sum()/ len(answers)
    total_count = len(answers_no_control)
    # NOTE(review): taken before the None slots are filtered out, so this equals
    # len(modes) rather than the number of retained samples — confirm intent.
    retain_count = len(retain_samples)
    retain_samples = [x for x in retain_samples if x is not None]

    return np.mean(rates), np.mean(rates_no_control), np.mean(rates_control_only), np.mean(rates_gpt4), majority_rate, retain_samples, total_count, retain_count, df_annotator


In [273]:
def calculate_all(filter_list, filter, hit_function, df_performance = None):
    """Compute agreement and success statistics for every test.

    Args:
        filter_list: annotator ids forwarded to ``construct_data``.
        filter: whether ``construct_data`` should drop those annotators.
        hit_function: scoring strategy; only 'majority' is implemented.
        df_performance: optional per-annotator performance table forwarded to
            ``calculate_success_rate_majority``; ``None`` means "no table".

    Returns:
        (df_task, df_annotator, df_retain_task, df_subtask) as DataFrames:
        per-test means, per-annotator-per-batch rates, retained samples, and
        per-batch counts.

    Raises:
        ValueError: if ``hit_function`` is not 'majority'.
    """
    # Any other value used to fall through to unbound local names.
    if hit_function != 'majority':
        raise ValueError(f"unsupported hit_function {hit_function!r}; only 'majority' is implemented")
    # Downstream code tests `.empty`, so normalise the declared None default.
    if df_performance is None:
        df_performance = pd.DataFrame()

    task_rows = []
    annotator_rows = []
    subtask_rows = []
    retained_rows = []

    annotation_dict = construct_data(filter_list, filter)
    for test_name, batches in annotation_dict.items():
        kappa_task = []
        krippendorff_task = []
        success_rate_task = []
        success_rate_task_no_control = []
        success_rate_task_control_only = []
        success_rate_gpt4_task = []
        majority_rate_task = []

        for batch in batches:
            task_name = list(batch['task'])[0]
            subtests = list(batch['subtest'])
            # First non-control subtest; the whole list is kept if there is none.
            subtest_name = next((name for name in subtests if name != 'control'), subtests)
            print(task_name, test_name, subtest_name)

            kappa_task.append(calculate_fleiss_kappa_for_each_batch(batch))
            krippendorff_task.append(calculate_krippendof_for_each_batch(batch))

            (success_rate_batch,
             success_rate_batch_no_control,
             success_rate_batch_control_only,
             success_rate_gpt4_batch,
             majority_rate_batch,
             retain_samples_batch,
             total_count_batch,
             retain_count_batch,
             batch_annotator_rows) = calculate_success_rate_majority(batch, df_performance)

            success_rate_task.append(success_rate_batch)
            success_rate_task_no_control.append(success_rate_batch_no_control)
            success_rate_task_control_only.append(success_rate_batch_control_only)
            success_rate_gpt4_task.append(success_rate_gpt4_batch)
            majority_rate_task.append(majority_rate_batch)

            annotator_rows.extend(batch_annotator_rows)
            retained_rows.extend(retain_samples_batch)
            subtask_rows.append({
                'task': task_name,
                'test': test_name,
                'subtest': subtest_name,
                'total': total_count_batch,
                'retain': retain_count_batch,
            })

        # Tests without any batch yield NaN means here.
        task_rows.append({
            'task': test_name,
            'kappa': np.mean(kappa_task),
            'krippendorff': np.mean(krippendorff_task),
            'success_rate_with_control': np.mean(success_rate_task),
            'success_rate_without_control': np.mean(success_rate_task_no_control),
            'success_rate_control_only': np.mean(success_rate_task_control_only),
            'success_rate_gpt4': np.mean(success_rate_gpt4_task),
            'majority_rate': np.mean(majority_rate_task),
        })

    df_task = pd.DataFrame(data = task_rows)
    df_subtask = pd.DataFrame(data = subtask_rows)
    df_annotator = pd.DataFrame(data = annotator_rows)
    df_retain_task = pd.DataFrame(data = retained_rows)
    return df_task, df_annotator, df_retain_task, df_subtask

In [None]:
df_task, df_annotator, df_retain_task, df_subtask = calculate_all(filter_list = None, filter = False, hit_function = 'majority', df_performance = pd.DataFrame())

In [275]:
df_task = df_task.dropna()

In [276]:
# errors='ignore' keeps the cell re-runnable: once the column is gone a second
# execution would otherwise raise KeyError.
df_task = df_task.drop(columns=['success_rate_gpt4'], errors='ignore')

In [None]:
df_task

In [None]:
df_annotator_group = df_annotator.groupby('annotator_id').mean().reset_index()
df_annotator_group['success_rate_control_only'].value_counts()

In [None]:
df_retain_task['original_question'][0]

In [None]:
df_retain_task

In [281]:
# Integer label -> label text, used below to translate the labels stored in the
# original data files (dialogue entries and sentiment entries respectively).
dialog_label_mapping = {0:'No', 1: 'Yes', 2: 'Hard to say'}
sentiment_label_mapping = {0: 'Negative', 1: 'Positive'}

In [282]:
# Attach the label of the original (pre-modification) sample to every retained row.
# Both spellings of the dialogue task are accepted: `tasks_name` uses
# 'dialogue_...' while this lookup originally only knew 'dialog_...'.
task_dirs = {
    'coreference_resolution': 'coref',
    'dialog_contradiction_detection': 'dialogue',
    'dialogue_contradiction_detection': 'dialogue',
    'sentiment_analysis': 'sa',
}
original_frames = {}  # file path -> loaded DataFrame, so each file is read once

rows = []
for i, row in df_retain_task.iterrows():
    task = row['task']
    test = row['test']
    # An unknown task used to reuse the directory of the previous row silently.
    if task not in task_dirs:
        raise KeyError(f'no data directory known for task {task!r}')
    task_dir = task_dirs[task]
    file_to_load = '../data_for_phase2/' + task_dir + '/' + test + '.json'
    if file_to_load not in original_frames:
        original_frames[file_to_load] = pd.read_json(file_to_load)
    df_original = original_frames[file_to_load]

    question = row['original_question']
    if question.get('index_in_phase1_annotated_data') is not None:
        # Positional lookup; the stored label is an index into the candidates.
        index = question['index_in_phase1_annotated_data']
        original_label = df_original.iloc[index]['original_label']
        candidates = df_original.iloc[index]['original_candidates']
        original_label = candidates[original_label]
    elif question.get('index_in_original_testset') is not None:
        # Column 0 holds the test-set index, column 1 the record with its 'label'.
        index = question['index_in_original_testset']
        original_label = df_original.loc[df_original[0] == index][1].values[0]['label']
        original_label = dialog_label_mapping[original_label]
    else:
        index = question['idx']
        original_label = df_original.loc[df_original['idx'] == index]['label'].values[0]
        original_label = sentiment_label_mapping[original_label]

    row['original_label'] = original_label
    rows.append(row)


In [283]:
df_phase2  = pd.DataFrame(data = rows)

In [None]:
df_phase2[df_phase2['task'] == 'coreference_resolution']

In [285]:
df_phase2['label_change'] = (df_phase2['label'] != df_phase2['original_label']).astype(int)

In [None]:
df_phase2

In [None]:
df_phase2[df_phase2['task'] == 'coreference_resolution']

In [None]:
# Share of samples whose label changed, per (task, test), as a percentage
# rounded to two decimals.
mean_label_change = df_phase2.groupby(['task', 'test'])['label_change'].mean()
label_change_rate = mean_label_change.multiply(100).round(2).reset_index()

# Display the results
print("Label Change Rate (%) by Task and Test:")
print(label_change_rate)

In [None]:
# Per (task, test): number of samples, number with a changed label, and the
# change rate (rounded to 4 decimals, then expressed as a percentage), sorted by
# rate in descending order.
label_change_stats = (
    df_phase2.groupby(['task', 'test'])['label_change']
    .agg(total_samples='size', samples_with_label_change='sum', change_rate='mean')
    .round(4)
    .reset_index()
    .assign(change_rate=lambda stats: stats['change_rate'] * 100)
    .sort_values('change_rate', ascending=False)
)

print("Label Change Statistics by Task and Test:")
print(label_change_stats)

In [None]:
label_change_stats

In [291]:
filter_annotators = list(df_annotator_group.loc[df_annotator_group['success_rate_control_only'] < 0.5]['annotator_id'])

In [None]:
len(filter_annotators)

In [None]:
df_task_filter, df_annotator_filter, df_retain_task_filter, df_subtask_filter = calculate_all(filter_list = filter_annotators, filter = True, hit_function = 'majority', df_performance= df_annotator_group)

In [None]:
# Same lookup as for the unfiltered samples, run on the filtered ones, and
# additionally collecting the original records (plus the new label and the test
# name) per task family.
# Both spellings of the dialogue task are accepted: `tasks_name` uses
# 'dialogue_...' while this lookup originally only knew 'dialog_...'.
task_dirs = {
    'coreference_resolution': 'coref',
    'dialog_contradiction_detection': 'dialogue',
    'dialogue_contradiction_detection': 'dialogue',
    'sentiment_analysis': 'sa',
}
original_frames = {}  # file path -> loaded DataFrame, so each file is read once

rows = []
rows_coref = []
rows_sentiment = []
rows_dialog = []

for i, row in df_retain_task_filter.iterrows():
    task = row['task']
    test = row['test']
    # An unknown task used to reuse the directory of the previous row silently.
    if task not in task_dirs:
        raise KeyError(f'no data directory known for task {task!r}')
    task_dir = task_dirs[task]
    file_to_load = '../data_for_phase2/' + task_dir + '/' + test + '.json'
    if file_to_load not in original_frames:
        original_frames[file_to_load] = pd.read_json(file_to_load)
    df_original = original_frames[file_to_load]

    question = row['original_question']
    if question.get('index_in_phase1_annotated_data') is not None:
        index = question['index_in_phase1_annotated_data']
        # Copies keep the cached frame untouched when the extra fields are added.
        original_row = df_original.iloc[index].copy()
        original_label = original_row['original_candidates'][original_row['original_label']]
        original_row['modified_label'] = row['label']
        original_row['test'] = row['test']
        rows_coref.append(original_row)
    elif question.get('index_in_original_testset') is not None:
        index = question['index_in_original_testset']
        # Column 1 stores a dict-like record; copy it before extending it.
        original_row = dict(df_original.loc[df_original[0] == index][1].values[0])
        original_label = dialog_label_mapping[original_row['label']]
        original_row['modified_label'] = row['label']
        original_row['test'] = row['test']
        rows_dialog.append(original_row)
    else:
        index = question['idx']
        original_row = df_original.loc[df_original['idx'] == index].copy()
        original_label = sentiment_label_mapping[original_row['label'].values[0]]
        original_row['modified_label'] = row['label']
        original_row['test'] = row['test']
        # Every matching record is kept (the per-record debug print was dropped).
        for _, item in original_row.iterrows():
            rows_sentiment.append(item)

    row['original_label'] = original_label
    rows.append(row)


In [None]:
df_coref = pd.DataFrame(data = rows_coref)
df_coref

In [None]:
df_sentiment = pd.DataFrame(data = rows_sentiment)
df_sentiment

In [None]:
df_dialog = pd.DataFrame(data = rows_dialog)
df_dialog

In [295]:
df_phase2  = pd.DataFrame(data = rows)
df_phase2['label_change'] = (df_phase2['label'] != df_phase2['original_label']).astype(int)

In [None]:
# Per (task, test): number of samples, number with a changed label, and the
# change rate (rounded to 4 decimals, then expressed as a percentage), sorted by
# rate in descending order.
label_change_stats = (
    df_phase2.groupby(['task', 'test'])['label_change']
    .agg(total_samples='size', samples_with_label_change='sum', change_rate='mean')
    .round(4)
    .reset_index()
    .assign(change_rate=lambda stats: stats['change_rate'] * 100)
    .sort_values('change_rate', ascending=False)
)

print("Label Change Statistics by Task and Test:")
print(label_change_stats)

In [None]:
label_change_stats

In [None]:
df_annotator_group = df_annotator.groupby('annotator_id').mean().reset_index()
df_annotator_group['success_rate_control_only'].value_counts()

In [299]:
filter_annotators = list(df_annotator_group.loc[df_annotator_group['success_rate_control_only'] < 0.5]['annotator_id'])

In [None]:
df_subtask_filter

In [301]:
# NOTE(review): this writes to the same 'df_data_phase2.csv' that a later cell
# overwrites with df_phase2, so this export does not survive a full run —
# confirm whether a separate file name was intended.
df_retain_task.to_csv('df_data_phase2.csv',index = None)

In [None]:
df_phase2['task']

In [303]:
df_phase2.to_csv('df_data_phase2.csv',index = None)

In [304]:
df_phase2_coref = df_phase2[df_phase2['task'] == 'coreference_resolution']

In [None]:
df_phase2_coref['original_question'][2]

In [None]:
# NOTE(review): unfinished stub — the loop only reads row['test'] and never
# appends to `rows`; its only lasting effect is resetting `rows` to an empty list.
rows = []
for i, row in df_phase2_coref.iterrows():
    test = row['test']
    

In [422]:
df_coref_task = df_coref.groupby('test')

In [None]:
# Write one JSON file per test containing that test's coreference records.
out_dir = '../data_after_phase2/coref/'
os.makedirs(out_dir, exist_ok=True)  # the export used to fail if the folder was missing
for group, frame in df_coref_task:
    to_write = out_dir + group + '.json'
    obj = frame.to_dict(orient='records')
    with open(to_write,'w') as f:
        json.dump(obj, f)
    # Report path and record count instead of dumping every record.
    print(to_write, len(obj))

In [None]:
# Write one JSON file per test containing that test's sentiment records.
out_dir = '../data_after_phase2/sa/'
os.makedirs(out_dir, exist_ok=True)  # the export used to fail if the folder was missing
df_sentiment_task = df_sentiment.groupby('test')
for group, frame in df_sentiment_task:
    to_write = out_dir + group + '.json'
    obj = frame.to_dict(orient='records')
    with open(to_write,'w') as f:
        json.dump(obj, f)
    # Report path and record count instead of dumping every record.
    print(to_write, len(obj))

In [None]:
# Write one JSON file per test containing that test's dialogue records.
out_dir = '../data_after_phase2/dialogue/'
os.makedirs(out_dir, exist_ok=True)  # the export used to fail if the folder was missing
df_dialog_task = df_dialog.groupby('test')
for group, frame in df_dialog_task:
    to_write = out_dir + group + '.json'
    obj = frame.to_dict(orient='records')
    with open(to_write,'w') as f:
        json.dump(obj, f)
    # Report path and record count instead of dumping every record.
    print(to_write, len(obj))