In [58]:
import csv
import pandas as pd
from os import listdir
from os.path import isfile, join
from scipy.stats import ttest_rel

# Notebook display setting: let pandas show up to 100 rows before it truncates
# the rendered output of a DataFrame/Series.
pd.set_option('display.max_rows', 100)

In [8]:
# Directory that holds the exported survey CSV files (relative to the
# notebook's working directory).
survey_result_path = '../survey'

# os.listdir returns entries in arbitrary order, so the names are sorted: this
# keeps both `survey_result_file_lst[0]` and any later iteration over the list
# stable across machines and reruns.
survey_result_file_lst = sorted(
    f for f in listdir(survey_result_path)
    if isfile(join(survey_result_path, f)) and f.endswith(".csv")
)

# First survey file, kept as a convenient default for inspecting a single file.
# Raises IndexError when the directory contains no .csv file.
file_of_interest = survey_result_file_lst[0]

In [53]:
def parse_survey_results(file, verbose = 0):
    """Parse one exported survey CSV into a DataFrame of numeric scores.

    Layout the parser relies on (as read by the code below):
      * row 0: column identifiers; every column whose identifier contains
        '_' is treated as a question column;
      * row 1: question text; for question columns the text before the first
        '. ' must be an integer, which becomes the ``Sim_index``;
      * row 2: ignored;
      * rows 3+: one response per row.

    Question columns are assumed to come in consecutive groups of six, in the
    order Actual, gpt2_tuned, xlnet, lstm_attention, gpt2_untuned_bert,
    gpt2_untuned.  The raw answer codes '1', '2', '6', '7', '8' are recoded to
    the scores 1, 2, 3, 4, 5.  Empty answers and any other code are treated as
    missing, so the affected row is dropped from the result.

    Parameters
    ----------
    file : str
        File name, resolved against the module-level ``survey_result_path``.
    verbose : int, optional
        When 1, print the number of distinct question indices, the number of
        collected 'Actual' scores, and the DataFrame before cleaning.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sim_index``, ``Actual``, ``gpt2_tuned``, ``xlnet``,
        ``lstm_attention``, ``gpt2_untuned_bert``, ``gpt2_untuned``; rows that
        contain any missing / non-numeric value are removed.

    Raises
    ------
    ValueError
        If the row-1 text of a question column does not start with an integer.
    """
    # Raw answer code used in the export -> score on the 1..5 scale.
    score_by_code = {'1': 1, '2': 2, '6': 3, '7': 4, '8': 5}

    actual_score_lst = []
    gpt2_tuned_score_lst = []
    xlnet_score_lst = []
    lstm_attention_score_lst = []
    gpt2_untuned_bert_score_lst = []
    gpt2_untuned_score_lst = []
    # Position inside a group of six question columns -> destination list.
    score_lsts = [
        actual_score_lst,
        gpt2_tuned_score_lst,
        xlnet_score_lst,
        lstm_attention_score_lst,
        gpt2_untuned_bert_score_lst,
        gpt2_untuned_score_lst,
    ]

    question_index_lst = []   # ascending column positions of question columns
    input_dict3_lst = []      # Sim_index parsed for every question column
    line_count = 0

    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the reader itself.
    with open(join(survey_result_path, file), newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            if line_count == 0:
                question_index_lst = [i for (i, item) in enumerate(row) if '_' in item]
            elif line_count == 1:
                for i in question_index_lst:
                    if i >= len(row):
                        break  # short row: no further question columns present
                    input_dict3_lst.append(int(row[i].split('. ')[0]))
            elif line_count > 2:
                for (j, i) in enumerate(question_index_lst):
                    if i >= len(row):
                        break  # short row: no further question columns present
                    # Unknown or empty codes become '' (missing).  Previously an
                    # unknown code silently reused the score of the preceding
                    # cell, or raised UnboundLocalError on the very first cell.
                    score_lsts[j % 6].append(score_by_code.get(row[i], ''))
            line_count += 1

    # Every row after the three header rows counts as one response.
    response_count = line_count - 3

    # The six columns of one question carry the same index, so only
    # *consecutive* repeats are collapsed.  An order-preserving set() would be
    # wrong: an index can belong to more than one question (the survey
    # "manual part3 - 20" contains 119 twice).
    # NOTE(review): two adjacent questions sharing an index would still be
    # merged here - confirm this cannot happen in the survey files.
    input_dict3_nodup_lst = []
    prev = None
    for item in input_dict3_lst:
        if item != prev:
            input_dict3_nodup_lst.append(item)
        prev = item

    if verbose == 1:
        print(len(input_dict3_nodup_lst))
        print(len(actual_score_lst))

    # The per-question indices repeat once per response; zip() truncates to
    # the shortest input if the lengths disagree.
    df = pd.DataFrame(list(zip(
            input_dict3_nodup_lst*response_count,
            actual_score_lst,
            gpt2_tuned_score_lst,
            xlnet_score_lst,
            lstm_attention_score_lst,
            gpt2_untuned_bert_score_lst,
            gpt2_untuned_score_lst
            )),
            columns =['Sim_index', 'Actual', 'gpt2_tuned', 'xlnet', 'lstm_attention', 'gpt2_untuned_bert', 'gpt2_untuned'])

    if verbose == 1:
        print(df)

    # '' placeholders turn into NaN, and every row holding a NaN is discarded.
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna()

    return df

In [56]:
# Parse every survey file and stack the per-file results into one DataFrame.
# The frames are collected first and concatenated once: DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0, and appending inside a
# loop re-copies the accumulated rows on every iteration.
survey_frames = []
for file_of_interest in survey_result_file_lst:
    df_of_interest = parse_survey_results(file_of_interest)
    print(df_of_interest.shape)
    survey_frames.append(df_of_interest)

# ignore_index=True renumbers the combined rows 0..n-1.
# pd.concat raises ValueError when no survey file was found.
df_all = pd.concat(survey_frames, ignore_index=True)

print(df_all.shape)

(70, 7)
(50, 7)
(30, 7)
(50, 7)
(200, 7)


In [63]:
# Mean score of every rated source over all pooled rows; Sim_index is an
# identifier rather than a score, so it is left out.
df_all.drop(labels=['Sim_index'], axis='columns').mean()

Actual               3.425
gpt2_tuned           2.905
xlnet                3.215
lstm_attention       1.580
gpt2_untuned_bert    2.795
gpt2_untuned         3.135
dtype: float64

In [73]:
# Paired-sample t-tests on the pooled ratings.
#
# Each entry is (baseline column, columns left out of the comparison); the
# baseline is compared against every remaining column.  The exclusion lists
# grow so that a pair that was already tested is not tested again.
#
# NOTE(review): ttest_rel tests whether the *mean difference* of the paired
# samples is zero, so the "Same/Different distributions" wording printed below
# is looser than what the test establishes.  The messages are kept unchanged
# so the recorded output stays comparable.
# NOTE(review): alpha is applied to each test separately; no correction for
# multiple comparisons is made.
alpha = 0.05

paired_comparisons = [
    ('Actual', ['Sim_index', 'Actual']),
    ('xlnet', ['Sim_index', 'Actual', 'xlnet']),
    ('gpt2_untuned', ['Sim_index', 'Actual', 'xlnet', 'gpt2_untuned']),
]

for baseline, excluded_columns in paired_comparisons:
    for method in df_all.drop(columns=excluded_columns).columns:
        print('--- Compare %s against %s ---' % (baseline, method))
        # compare samples
        stat, p = ttest_rel(df_all[baseline], df_all[method])
        print('t-Statistics=%.3f, 2-tailed p-value=%.3f' % (stat, p))
        # interpret
        if p > alpha:
            print('Same distributions (fail to reject H0)')
        else:
            print('Different distributions (reject H0)')
        print('\n')

--- Compare Actual against gpt2_tuned ---
t-Statistics=4.315, 2-tailed p-value=0.000
Different distributions (reject H0)


--- Compare Actual against xlnet ---
t-Statistics=1.747, 2-tailed p-value=0.082
Same distributions (fail to reject H0)


--- Compare Actual against lstm_attention ---
t-Statistics=18.492, 2-tailed p-value=0.000
Different distributions (reject H0)


--- Compare Actual against gpt2_untuned_bert ---
t-Statistics=5.974, 2-tailed p-value=0.000
Different distributions (reject H0)


--- Compare Actual against gpt2_untuned ---
t-Statistics=2.627, 2-tailed p-value=0.009
Different distributions (reject H0)


--- Compare xlnet against gpt2_tuned ---
t-Statistics=2.764, 2-tailed p-value=0.006
Different distributions (reject H0)


--- Compare xlnet against lstm_attention ---
t-Statistics=16.995, 2-tailed p-value=0.000
Different distributions (reject H0)


--- Compare xlnet against gpt2_untuned_bert ---
t-Statistics=3.846, 2-tailed p-value=0.000
Different distributions (reject H