In [2]:
import os
os.chdir('/home/rm/ssd2/langcao/workspace/tablemaster')
!pwd

/home/rm/ssd2/langcao/workspace/tablemaster


In [3]:
import os
import sys
import json
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sys.path.append('./')
from evaluate.evaluator import eval_qa, eval_fact

In [25]:
dataset = 'wikitq'
# dataset = 'tabfact'

# files = glob.glob(f'outputs/analysis/number/{dataset}/*.json')
files = glob.glob(f'outputs/analysis/number-4m-self/{dataset}/*.json')
files = sorted(files, key=lambda x: int(x.split('/')[-1].split('.')[0]))
need_calculation = []
for file in files:
    with open(file, 'r') as f:
        D = json.load(f)
        need_calculation.append(D['need_calculation'])

# statistics
print('Count: ', len(need_calculation))
print('True: ', sum(need_calculation))
print('False: ', len(need_calculation) - sum(need_calculation))
print('Percentage: ', sum(need_calculation) / len(need_calculation))

Count:  4344
True:  1373
False:  2971
Percentage:  0.31606813996316757


# Reasoning Method Analysis

In [None]:
models = ['gpt35', 'gpt4m', 'gpt4o']
# models = ['gpt4m']
# models = ['gpt35']

methods = ['pot', 'cot', 'guided_pot']
# methods = ['direct', 'pot', 'cot', 'guided_pot']


for model in models:

    for method in methods:
        # if method == 'guided_pot' and model != 'gpt35':
            # continue
        print(f'Model: {model} - Method: {method}')
        pred_path = f'outputs/baselines/{dataset}/{model}/{method}'

        pred_files = glob.glob(f'{pred_path}/*.json')
        pred_files = sorted(pred_files, key=lambda x: int(x.split('/')[-1].split('.')[0]))

        results = []
        results_of_need_calculation = []
        results_of_not_need_calculation = []

        for i, file in enumerate(pred_files):
            with open(file, 'r') as f:
                D = json.load(f)
                if dataset == 'wikitq':
                    result = eval_qa(D['predicted_answer'], D['answer'])
                elif dataset == 'tabfact':
                    result = eval_fact(D['predicted_answer'], D['answer'])
                results.append(result)

                if need_calculation[i]:
                    results_of_need_calculation.append(result)
                else:
                    results_of_not_need_calculation.append(result)

        results = np.array(results).mean().tolist()
        results_of_need_calculation = np.array(results_of_need_calculation).mean().tolist()
        results_of_not_need_calculation = np.array(results_of_not_need_calculation).mean().tolist()

        print(f'Overall: {results}')
        print(f'Need Calculation: {results_of_need_calculation}')
        print(f'Not Need Calculation: {results_of_not_need_calculation}\n')

    print('-' * 20)

# Adaptive Reasoning Analysis

In [51]:
# adaptive with o1
# model = 'gpt35'
model = 'gpt4m'
# model = 'gpt4o'

dataset = 'wikitq'
# dataset = 'tabfact'


files = glob.glob(f'outputs/analysis/number-4m-self/{dataset}/*.json')
# files = glob.glob(f'outputs/analysis/number/{dataset}/*.json')
files = sorted(files, key=lambda x: int(x.split('/')[-1].split('.')[0]))
need_calculation = []
for file in files:
    with open(file, 'r') as f:
        D = json.load(f)
        need_calculation.append(D['need_calculation'])


# statistics
print('Count: ', len(need_calculation))
print('True: ', sum(need_calculation))
print('False: ', len(need_calculation) - sum(need_calculation))
print('Percentage: ', sum(need_calculation) / len(need_calculation), '\n')


cot_pred_path = f'outputs/baselines/{dataset}/{model}/cot'
# pot_pred_path = f'outputs/baselines/{dataset}/{model}/pot'
pot_pred_path = f'outputs/baselines/{dataset}/{model}/guided_pot'

cot_pred_files = glob.glob(f'{cot_pred_path}/*.json')
cot_pred_files = sorted(cot_pred_files, key=lambda x: int(x.split('/')[-1].split('.')[0]))

pot_pred_files = glob.glob(f'{pot_pred_path}/*.json')
pot_pred_files = sorted(pot_pred_files, key=lambda x: int(x.split('/')[-1].split('.')[0]))


cot_results = []
pot_results = []
combined_results = []
best_results = []
best_results_pot_count = 0

for i, (cot_file, pot_file) in enumerate(zip(cot_pred_files, pot_pred_files)):


    with open(cot_file, 'r') as f:
        cot_D = json.load(f)
        if dataset == 'wikitq':
            cot_result = eval_qa(cot_D['predicted_answer'], cot_D['answer'])
        elif dataset == 'tabfact':
            cot_result = eval_fact(cot_D['predicted_answer'], cot_D['answer'])
        cot_results.append(cot_result)

    with open(pot_file, 'r') as f:
        pot_D = json.load(f)
        if dataset == 'wikitq':
            pot_result = eval_qa(pot_D['predicted_answer'], pot_D['answer'])
        elif dataset == 'tabfact':
            pot_result = eval_fact(pot_D['predicted_answer'], pot_D['answer'])
        best_results.append(max(cot_result, pot_result))

        if pot_result and not cot_result:
            best_results_pot_count += 1
        
        cot_results.append(cot_result)
        pot_results.append(pot_result)

    if i >= len(need_calculation):
        continue
    if need_calculation[i]:
        combined_results.append(pot_result)
    else:
        combined_results.append(cot_result)
    

cot_results = np.array(cot_results).mean().tolist() * 100
pot_results = np.array(pot_results).mean().tolist() * 100
combined_results = np.array(combined_results).mean().tolist() * 100
best_results = np.array(best_results).mean().tolist() * 100

print(f'CoT: {cot_results:.2f}')
print(f'POT: {pot_results:.2f}')
print(f'Combined: {combined_results:.2f}\n')
print(f'Best: {best_results:.2f}')
print(f'Best POT Count: {best_results_pot_count}')

Count:  4344
True:  1590
False:  2754
Percentage:  0.3660220994475138 



CoT: 72.97
POT: 68.83
Combined: 74.08

Best: 85.06
Best POT Count: 525


# TableMaster Adapative Reasoning

In [14]:
# adaptive with o1
model = 'tablemaster-4m'
# model = 'tablemaster-4o'
# model = 'tablemaster-35'

dataset = 'wikitq'
# dataset = 'tabfact'


pred_path = f'outputs/main/{dataset}/{model}'

pred_files = glob.glob(f'{pred_path}/*.json')
pred_files = sorted(pred_files, key=lambda x: int(x.split('/')[-1].split('.')[0]))




results = []
self_strategy = []
o1_strategy = need_calculation

for i, pred_file in enumerate(pred_files):


    with open(pred_file, 'r') as f:
        pred_D = json.load(f)
        if dataset == 'wikitq':
            result = eval_qa(pred_D['predicted_answer'], pred_D['answer'])
        elif dataset == 'tabfact':
            result = eval_fact(pred_D['predicted_answer'], pred_D['answer'])
        
        results.append(result)

        if pred_D['reasoning_process']['symbolic_reasoning_process'] == '':
            self_strategy.append(1)
        else:
            self_strategy.append(0)



print('TableMaster adaptive reasoning performance')
strategy = self_strategy
cot_results = []
pot_results = []

for i, result in enumerate(results):
    if strategy[i]:
        pot_results.append(result)
    else:
        cot_results.append(result)    

mean_cot_results = np.array(cot_results).mean().tolist() * 100
mean_pot_results = np.array(pot_results).mean().tolist() * 100

print(f'CoT: {mean_cot_results:.2f}, number of questions: {len(cot_results)}')
print(f'POT: {mean_pot_results:.2f}, number of questions: {len(pot_results)}')

print('TableMaster performance in two types of questions')
strategy = o1_strategy
cot_results = []
pot_results = []

for i, result in enumerate(results):
    if strategy[i]:
        pot_results.append(result)
    else:
        cot_results.append(result)    

mean_cot_results = np.array(cot_results).mean().tolist() * 100
mean_pot_results = np.array(pot_results).mean().tolist() * 100

print(f'No need calculation: {mean_cot_results:.2f}, number of questions: {len(cot_results)}')
print(f'Need calculation: {mean_pot_results:.2f}, number of questions: {len(pot_results)}')



TableMaster adaptive reasoning performance
CoT: 72.78, number of questions: 2520
POT: 85.53, number of questions: 1824
TableMaster performance in two types of questions
No need calculation: 83.90, number of questions: 1652
Need calculation: 74.59, number of questions: 2692
