### DroPTC vs TransSentLog

In [1]:
def get_result(embedding, scenario, loss_fc, class_weight, results, freeze_param='unfreeze'):
    """Select the runs of one experimental configuration, ordered by seed.

    Parameters
    ----------
    embedding, scenario, loss_fc, class_weight
        Values compared for equality against the same-named columns of
        ``results``.
    results : pandas.DataFrame
        Table of runs. Must contain the columns ``scenario``, ``embedding``,
        ``loss_fc``, ``class_weight``, ``freeze_param`` and ``seed``.
    freeze_param : str, default 'unfreeze'
        Value compared against the ``freeze_param`` column. The default keeps
        the previous behaviour, which always selected ``'unfreeze'`` rows.

    Returns
    -------
    pandas.DataFrame
        The matching rows, sorted ascending by ``seed``. An empty frame is
        returned when nothing matches.
    """
    mask = (
        (results['scenario'] == scenario)
        & (results['embedding'] == embedding)
        & (results['loss_fc'] == loss_fc)
        & (results['class_weight'] == class_weight)
        & (results['freeze_param'] == freeze_param)
    )
    # Sorting by seed gives every configuration the same row order.
    return results[mask].sort_values(by='seed')

In [2]:
import os
import pandas as pd

# Read every recorded experiment run from the recap workbook.
recap_path = os.path.join('..', 'experiments', 'recap', 'sentence', 'recap_overall_sentence_new.xlsx')
results = pd.read_excel(recap_path)

In [None]:
# DroPTC runs: 'DroPTC-all-mpnet-base-v2-sentence' embedding with class_weight='inverse'.
droptc_best = get_result(
    embedding='DroPTC-all-mpnet-base-v2-sentence',
    scenario='droptc',
    loss_fc='ce',
    class_weight='inverse',
    results=results,
)

In [4]:
# TransSentLog runs: 'bert-base-uncased' embedding with class_weight='uniform'.
transsentlog = get_result(
    embedding='bert-base-uncased',
    scenario='transsentlog',
    loss_fc='ce',
    class_weight='uniform',
    results=results,
)

In [11]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel, wilcoxon
import scipy.stats as st

def statistical_test(proposed, baseline, metric='f1_score', alpha=0.05):
    """Run paired significance tests on one metric and print a summary.

    The two inputs are compared position by position, so row ``i`` of
    ``proposed`` must belong to the same run/sample as row ``i`` of
    ``baseline``. NOTE(review): this function cannot verify that alignment;
    callers are assumed to pass frames in matching order -- confirm at each
    call site.

    Parameters
    ----------
    proposed, baseline : pandas.DataFrame
        Tables holding a ``metric`` column of equal length.
    metric : str, default 'f1_score'
        Name of the column to compare.
    alpha : float, default 0.05
        Significance level used for the two-sided confidence interval of the
        mean paired difference (0.05 gives a 95% interval).

    Returns
    -------
    None
        Results are printed: paired t-test, Wilcoxon signed-rank test,
        per-group mean and std, Cohen's d of the paired differences, and the
        mean difference with its confidence interval.

    Raises
    ------
    ValueError
        If the two samples do not have the same number of values.
    """
    proposed_scores = proposed[metric].to_numpy()
    baseline_scores = baseline[metric].to_numpy()

    # Paired tests are undefined for samples of different size; fail early
    # with a message that names the metric and both lengths.
    if len(proposed_scores) != len(baseline_scores):
        raise ValueError(
            f"Paired tests need equally sized samples for '{metric}': "
            f"got {len(proposed_scores)} proposed and {len(baseline_scores)} baseline values."
        )

    # Statistical tests
    t_stat, p_t = ttest_rel(proposed_scores, baseline_scores)
    w_stat, p_w = wilcoxon(proposed_scores, baseline_scores)

    # Each line reports the test statistic first, then its p-value.
    print(f"Paired t-test: t = {t_stat:.4f}, p = {p_t:.4f}")
    print(f"Wilcoxon signed-rank: W = {w_stat:.4f}, p = {p_w:.4f}")

    # Per-group summary. ndarray.std() uses ddof=0 here (population std),
    # unlike the paired-difference std below, which uses ddof=1.
    print("Proposed mean ± std:", proposed_scores.mean(), "±", proposed_scores.std())
    print("Baseline mean ± std:", baseline_scores.mean(), "±", baseline_scores.std())

    # Paired differences drive both the effect size and the interval.
    diffs = proposed_scores - baseline_scores
    n = len(diffs)
    mean_diff = diffs.mean()
    std_diff = diffs.std(ddof=1)

    # Cohen's d for paired data: mean difference over the sample std of the differences.
    cohens_d = mean_diff / std_diff
    print("Cohen's d:", cohens_d)

    # Two-sided t interval for the mean difference with n-1 degrees of freedom.
    t_crit = st.t.ppf(1 - alpha / 2, df=n - 1)
    margin = t_crit * (std_diff / np.sqrt(n))

    ci_low, ci_high = mean_diff - margin, mean_diff + margin
    print(f"Mean diff: {mean_diff:.4f}")
    print(f"{(1 - alpha) * 100:g}% CI: [{ci_low:.4f}, {ci_high:.4f}]")


In [6]:
# Compare on the default metric (f1_score): DroPTC as proposed, TransSentLog as baseline.
statistical_test(proposed=droptc_best, baseline=transsentlog)

Paired t-test p-value: 3.1741 - 0.0113
Wilcoxon p-value: 2.0000 - 0.0152
Proposed mean ± std: 0.9305 ± 0.004964876634922545
Baseline mean ± std: 0.9183 ± 0.009550392662084642
Cohen's d: 1.0037384991765754
Mean diff: 0.0122
95% CI: [0.0035, 0.0209]


### Pre-trained vs Fine-tuned DroPTC

In [10]:
# The two lookups differ in embedding and in the class_weight label.
# NOTE(review): 'uniform' vs 'uniformold' -- confirm these labels denote comparable runs.
pretrained = get_result(
    embedding='all-mpnet-base-v2',
    scenario='droptc',
    loss_fc='ce',
    class_weight='uniform',
    results=results,
)
finetuned = get_result(
    embedding='DroPTC-all-mpnet-base-v2-sentence',
    scenario='droptc',
    loss_fc='ce',
    class_weight='uniformold',
    results=results,
)
statistical_test(proposed=finetuned, baseline=pretrained)

Paired t-test p-value: 0.2583 - 0.8020
Wilcoxon p-value: 20.0000 - 0.7669
Proposed mean ± std: 0.9282 ± 0.006808817812219671
Baseline mean ± std: 0.9273000000000001 ± 0.006067124524847011
Cohen's d: 0.08167206777017808
Mean diff: 0.0009
95% CI: [-0.0070, 0.0088]


### Fine-tuned all-mpnet-base-v2 vs all-MiniLM-L6-v2

In [11]:
# Runs for the 'DroPTC-all-mpnet-base-v2-sentence' embedding with class_weight='uniformold'.
mpnet = get_result(
    embedding='DroPTC-all-mpnet-base-v2-sentence',
    scenario='droptc',
    loss_fc='ce',
    class_weight='uniformold',
    results=results,
)

In [12]:
# Runs for the 'DroPTC-all-MiniLM-L6-v2-sentence' embedding with class_weight='uniformold'.
minilm = get_result(
    embedding='DroPTC-all-MiniLM-L6-v2-sentence',
    scenario='droptc',
    loss_fc='ce',
    class_weight='uniformold',
    results=results,
)

In [13]:
# Compare on the default metric (f1_score): mpnet as proposed, MiniLM as baseline.
statistical_test(proposed=mpnet, baseline=minilm)

Paired t-test p-value: 1.1182 - 0.2924
Wilcoxon p-value: 18.0000 - 0.3750
Proposed mean ± std: 0.9282 ± 0.006808817812219671
Baseline mean ± std: 0.924 ± 0.005727128425310546
Cohen's d: 0.35362021856676357
Mean diff: 0.0042
95% CI: [-0.0043, 0.0127]


### Pre-trained bert-base-uncased vs all-MiniLM-L6-v2

In [14]:
# Runs for the 'bert-base-uncased' embedding in the 'droptc' scenario with class_weight='uniform'.
bertbase = get_result(
    embedding='bert-base-uncased',
    scenario='droptc',
    loss_fc='ce',
    class_weight='uniform',
    results=results,
)

In [15]:
# `minilm` is the table selected earlier for 'DroPTC-all-MiniLM-L6-v2-sentence'; bert-base is the baseline.
statistical_test(proposed=minilm, baseline=bertbase)

Paired t-test p-value: 2.4740 - 0.0353
Wilcoxon p-value: 7.0000 - 0.0371
Proposed mean ± std: 0.924 ± 0.005727128425310546
Baseline mean ± std: 0.9173 ± 0.006148983655857289
Cohen's d: 0.7823321735382406
Mean diff: 0.0067
95% CI: [0.0006, 0.0128]


### DroPTC vs DroPTC-WoCW

#### Effectiveness

In [3]:
# DroPTC with class weighting: class_weight='inverse'.
droptc_best = get_result(
    embedding='DroPTC-all-mpnet-base-v2-sentence',
    scenario='droptc',
    loss_fc='ce',
    class_weight='inverse',
    results=results,
)

In [4]:
# Same embedding and loss as `droptc_best`, selected with class_weight='uniformold'.
droptc_wocw = get_result(
    embedding='DroPTC-all-mpnet-base-v2-sentence',
    scenario='droptc',
    loss_fc='ce',
    class_weight='uniformold',
    results=results,
)

In [7]:
# Compare on the default metric (f1_score): DroPTC as proposed, DroPTC-WoCW as baseline.
statistical_test(proposed=droptc_best, baseline=droptc_wocw)

Paired t-test p-value: 1.7464 - 0.1147
Wilcoxon p-value: 8.5000 - 0.0963
Proposed mean ± std: 0.9305 ± 0.004964876634922545
Baseline mean ± std: 0.9282 ± 0.006808817812219671
Cohen's d: 0.5522651509239452
Mean diff: 0.0023
95% CI: [-0.0007, 0.0053]


#### Word importance

In [15]:
# Read the interpretability results workbook.
word_importance_path = os.path.join('..', 'experiments', 'analysis', 'interpretability_detailed_results.xlsx')
word_importance = pd.read_excel(word_importance_path)

In [14]:
# Display the loaded table for a quick visual check of its columns and values.
word_importance

Unnamed: 0,model_name,model_name.1,top_k,precision,avg_precision
0,3,DroPTC-WoCW,1,0.935065,0.935065
1,15,TransSentLog,1,0.922078,0.922078
2,9,DroLoVe,1,0.909091,0.909091
3,4,DroPTC-WoCW,2,0.902597,0.902597
4,12,NeuralLog,1,0.883117,0.883117
5,16,TransSentLog,2,0.863636,0.863636
6,13,NeuralLog,2,0.857143,0.857143
7,10,DroLoVe,2,0.837662,0.837662
8,1,DroPTC,2,0.831169,0.831169
9,5,DroPTC-WoCW,3,0.813084,0.82684


In [22]:
# Split the word-importance table by model, then by top_k within each model.
# NOTE: this rebinds `droptc_best` and `droptc_wocw`, which earlier cells bound to
# tables selected from `results`.
is_droptc = word_importance['model_name'] == 'DroPTC'
is_wocw = word_importance['model_name'] == 'DroPTC-WoCW'
at_k = {k: word_importance['top_k'] == k for k in (1, 2, 3)}

droptc_best = word_importance[is_droptc]
droptc_best_1 = word_importance[is_droptc & at_k[1]]
droptc_best_2 = word_importance[is_droptc & at_k[2]]
droptc_best_3 = word_importance[is_droptc & at_k[3]]

droptc_wocw = word_importance[is_wocw]
droptc_wocw_1 = word_importance[is_wocw & at_k[1]]
droptc_wocw_2 = word_importance[is_wocw & at_k[2]]
droptc_wocw_3 = word_importance[is_wocw & at_k[3]]

In [21]:
# Precision over all top_k values: DroPTC-WoCW as proposed, DroPTC as baseline.
statistical_test(proposed=droptc_wocw, baseline=droptc_best, metric='precision')

Paired t-test p-value: 4.2176 - 0.0000
Wilcoxon p-value: 225.5000 - 0.0001
Proposed mean ± std: 0.8881673881673883 ± 0.23132397832479967
Baseline mean ± std: 0.8080808080808082 ± 0.3093261325667731
Cohen's d: 0.27749595824047735
Mean diff: 0.0801
95% CI: [0.0427, 0.1175]


In [23]:
# Precision restricted to top_k == 1.
statistical_test(proposed=droptc_wocw_1, baseline=droptc_best_1, metric='precision')

Paired t-test p-value: 2.7876 - 0.0067
Wilcoxon p-value: 15.0000 - 0.0075
Proposed mean ± std: 0.935064935064935 ± 0.24641124624688673
Baseline mean ± std: 0.8051948051948052 ± 0.3960506670643352
Cohen's d: 0.317681559606804
Mean diff: 0.1299
95% CI: [0.0371, 0.2227]


In [24]:
# Precision restricted to top_k == 2.
statistical_test(proposed=droptc_wocw_2, baseline=droptc_best_2, metric='precision')

Paired t-test p-value: 2.6177 - 0.0107
Wilcoxon p-value: 40.0000 - 0.0116
Proposed mean ± std: 0.9025974025974026 ± 0.21379321601499127
Baseline mean ± std: 0.8311688311688312 ± 0.2624854926156849
Cohen's d: 0.29831690236873254
Mean diff: 0.0714
95% CI: [0.0171, 0.1258]


In [25]:
# Precision restricted to top_k == 3.
statistical_test(proposed=droptc_wocw_3, baseline=droptc_best_3, metric='precision')

Paired t-test p-value: 2.2390 - 0.0281
Wilcoxon p-value: 30.5000 - 0.0246
Proposed mean ± std: 0.8268398268398269 ± 0.21894664270825617
Baseline mean ± std: 0.7878787878787878 ± 0.24564954041750836
Cohen's d: 0.2551553974162828
Mean diff: 0.0390
95% CI: [0.0043, 0.0736]


### Confidence score

In [None]:
# NOTE(review): unfinished cell -- the joined path stops at '../experiments', which other cells
# use as a directory, and names no workbook, so this read is not expected to succeed as written.
# TODO: append the confidence-score file's sub-path before running.
droptc_best = pd.read_excel(os.path.join('..', 'experiments', ))