In [47]:
from scipy import stats
import typing as t
import numpy as np

class PairedTTest:
    """Paired t-test between two per-user metric mappings (user -> value)."""

    @staticmethod
    def common_users(arr_0: t.Dict[int, float], arr_1:  t.Dict[int, float]):
        """Return, as a list, the keys that appear in both ``arr_0`` and ``arr_1``.

        The list is built from a set intersection, so its order is whatever
        the set iteration order happens to be.
        """
        shared_keys = arr_0.keys() & arr_1.keys()
        return [user for user in shared_keys]

    @staticmethod
    def compare(arr_0: t.Dict[int, float], arr_1:  t.Dict[int, float]):
        """Run ``scipy.stats.ttest_rel`` on the values of the shared users.

        Only users present in both mappings are compared; values are paired
        by user. Returns the second element of the test result (the p-value).
        """
        users = PairedTTest.common_users(arr_0, arr_1)
        # Build both samples from the same user sequence so that index i of
        # each sample refers to the same user.
        sample_0 = [arr_0.get(user) for user in users]
        sample_1 = [arr_1.get(user) for user in users]
        test_result = stats.ttest_rel(sample_0, sample_1)
        return test_result[1]

In [48]:
import pandas as pd
from collections import defaultdict

def compute_avg_recall(recall_at_k):
    """Return the mean of the per-user recall values in ``recall_at_k``.

    Args:
        recall_at_k: mapping user -> recall value.

    Raises:
        ZeroDivisionError: if ``recall_at_k`` is empty.
    """
    total = 0
    for score in recall_at_k.values():
        total += score
    n_users = len(recall_at_k)
    return total / n_users

def compute_recall_at_k(ground_truth_path, predictions_path, k=10, min_rating=3):
    """Compute per-user recall@k from a ground-truth file and a predictions file.

    Both files are read as header-less, tab-separated tables: the ground truth
    with columns (user, item, rating) and the predictions with columns
    (user, item). For each user the first ``k`` prediction rows, in file order,
    are taken as the top-k list, so the predictions file must already be
    sorted by decreasing score within each user.

    Args:
        ground_truth_path: path to the (user, item, rating) TSV file.
        predictions_path: path to the (user, item) TSV file.
        k: number of leading predictions per user to evaluate.
        min_rating: an item is relevant for a user when ``int(rating)`` is at
            least this value (default 3).

    Returns:
        dict mapping user -> recall@k, i.e. the fraction of the user's relevant
        items found among their top-k predictions. Users with no relevant item
        are omitted; users with relevant items but no predictions get 0.0.
    """
    ground_truth = pd.read_csv(ground_truth_path, sep='\t', names=['user', 'item', 'rating'])
    predictions = pd.read_csv(predictions_path, sep='\t', names=['user', 'item'])

    # Iterate column-wise instead of with DataFrame.iterrows(): iterrows builds
    # one Series per row (slow) and upcasts each row to a common dtype, which
    # turned integer user/item ids into floats whenever ratings were floats.
    relevant_items = defaultdict(set)
    for user, item, rating in zip(ground_truth['user'], ground_truth['item'], ground_truth['rating']):
        if int(rating) >= min_rating:
            relevant_items[user].add(item)

    ranked_items = defaultdict(list)
    for user, item in zip(predictions['user'], predictions['item']):
        ranked_items[user].append(item)

    recall_at_k_dict = {}
    for user, true_items in relevant_items.items():
        if user in ranked_items:
            # No re-sorting needed: predictions are expected to be already
            # ranked (RecBole returns them sorted), so the first k are the top-k.
            top_k_items = set(ranked_items[user][0:k])
            hits = len(true_items & top_k_items)
            recall_at_k_dict[user] = hits / len(true_items)
        else:
            # The user has relevant items but received no recommendation.
            recall_at_k_dict[user] = 0.0

    return recall_at_k_dict




In [49]:
import pandas as pd
from collections import defaultdict

def compute_avgpop(pop_at_k):
    """Return the mean of the per-user popularity values in ``pop_at_k``.

    Args:
        pop_at_k: mapping user -> popularity value.

    Raises:
        ZeroDivisionError: if ``pop_at_k`` is empty.
    """
    per_user_values = pop_at_k.values()
    n_users = len(pop_at_k)
    return sum(per_user_values) / n_users

def compute_avgpop_at_k(ground_truth_path, predictions_path, k=10):
    """Compute, per user, the average popularity of their top-k predicted items.

    Item popularity is the number of rows in which the item appears in the file
    at ``ground_truth_path`` (read as a header-less TSV with columns
    user, item, rating). Predictions are read as a header-less TSV with columns
    (user, item); for each user the first ``k`` rows, in file order, are used.
    Items that never appear in the popularity file count as popularity 0.

    Args:
        ground_truth_path: path to the interactions file used to count item
            popularity.
        predictions_path: path to the (user, item) predictions file.
        k: maximum number of leading predictions per user to consider.

    Returns:
        ``defaultdict(float)`` mapping user -> mean popularity of the items
        that were counted for that user. Users with no counted prediction are
        absent.
    """
    ground_truth = pd.read_csv(ground_truth_path, sep='\t', names=['user', 'item', 'rating'])
    predictions = pd.read_csv(predictions_path, sep='\t', names=['user', 'item'])

    item_pop = ground_truth['item'].value_counts().to_dict()

    # Column-wise iteration avoids the per-row Series construction of iterrows().
    pop_total = defaultdict(float)
    n_recommended = defaultdict(int)
    for user, item in zip(predictions['user'], predictions['item']):
        if n_recommended[user] < k:
            n_recommended[user] += 1
            pop_total[user] += item_pop.get(item, 0)

    # Average over the items actually counted rather than over k: a user with
    # fewer than k predictions would otherwise get an understated mean. For
    # users with at least k predictions the divisor is exactly k.
    pop_dict = defaultdict(float)
    for user, total in pop_total.items():
        pop_dict[user] = total / n_recommended[user]

    return pop_dict



In [50]:
# recall
# Per-user recall@10 for every (model, split) pair, followed by its average.
# Every run is scored against the same test file; the three models of a split
# are unpacked in the order LightGCN, DMF, NGCF.
rec_lgcn_2, rec_dmf_2, rec_ngcf_2 = (
    compute_recall_at_k(
        'amazon_books_60core.test.tsv',
        f'preds/amazon_books_60core_split_2_{model}_preds.tsv',
        k=10,
    )
    for model in ('LightGCN', 'DMF', 'NGCF')
)
avg_rec_lgcn_2, avg_rec_dmf_2, avg_rec_ngcf_2 = map(
    compute_avg_recall, (rec_lgcn_2, rec_dmf_2, rec_ngcf_2)
)

rec_lgcn_6, rec_dmf_6, rec_ngcf_6 = (
    compute_recall_at_k(
        'amazon_books_60core.test.tsv',
        f'preds/amazon_books_60core_split_6_{model}_preds.tsv',
        k=10,
    )
    for model in ('LightGCN', 'DMF', 'NGCF')
)
avg_rec_lgcn_6, avg_rec_dmf_6, avg_rec_ngcf_6 = map(
    compute_avg_recall, (rec_lgcn_6, rec_dmf_6, rec_ngcf_6)
)

rec_lgcn_10, rec_dmf_10, rec_ngcf_10 = (
    compute_recall_at_k(
        'amazon_books_60core.test.tsv',
        f'preds/amazon_books_60core_split_10_{model}_preds.tsv',
        k=10,
    )
    for model in ('LightGCN', 'DMF', 'NGCF')
)
avg_rec_lgcn_10, avg_rec_dmf_10, avg_rec_ngcf_10 = map(
    compute_avg_recall, (rec_lgcn_10, rec_dmf_10, rec_ngcf_10)
)


# avg pop
# Per-user average popularity@10; item popularity is counted on the training
# file of the matching split.
pop_lgcn_2, pop_dmf_2, pop_ngcf_2 = (
    compute_avgpop_at_k(
        'amazon_books_60core_split_2.train.tsv',
        f'preds/amazon_books_60core_split_2_{model}_preds.tsv',
        k=10,
    )
    for model in ('LightGCN', 'DMF', 'NGCF')
)
avg_pop_lgcn_2, avg_pop_dmf_2, avg_pop_ngcf_2 = map(
    compute_avgpop, (pop_lgcn_2, pop_dmf_2, pop_ngcf_2)
)

pop_lgcn_6, pop_dmf_6, pop_ngcf_6 = (
    compute_avgpop_at_k(
        'amazon_books_60core_split_6.train.tsv',
        f'preds/amazon_books_60core_split_6_{model}_preds.tsv',
        k=10,
    )
    for model in ('LightGCN', 'DMF', 'NGCF')
)
avg_pop_lgcn_6, avg_pop_dmf_6, avg_pop_ngcf_6 = map(
    compute_avgpop, (pop_lgcn_6, pop_dmf_6, pop_ngcf_6)
)

pop_lgcn_10, pop_dmf_10, pop_ngcf_10 = (
    compute_avgpop_at_k(
        'amazon_books_60core_split_10.train.tsv',
        f'preds/amazon_books_60core_split_10_{model}_preds.tsv',
        k=10,
    )
    for model in ('LightGCN', 'DMF', 'NGCF')
)
avg_pop_lgcn_10, avg_pop_dmf_10, avg_pop_ngcf_10 = map(
    compute_avgpop, (pop_lgcn_10, pop_dmf_10, pop_ngcf_10)
)

  ground_truth = pd.read_csv(ground_truth_path, sep='\t', names=['user', 'item', 'rating'])
  ground_truth = pd.read_csv(ground_truth_path, sep='\t', names=['user', 'item', 'rating'])
  ground_truth = pd.read_csv(ground_truth_path, sep='\t', names=['user', 'item', 'rating'])
  ground_truth = pd.read_csv(ground_truth_path, sep='\t', names=['user', 'item', 'rating'])
  ground_truth = pd.read_csv(ground_truth_path, sep='\t', names=['user', 'item', 'rating'])
  ground_truth = pd.read_csv(ground_truth_path, sep='\t', names=['user', 'item', 'rating'])


In [51]:
# p-values of paired t-tests, keyed by '<metric>_<model>_<split>_vs_<metric>_<model>_<split>'.
# First: same model, different splits.
results = {
    # LGCN
    'rec_lgcn_2_vs_rec_lgcn_6': PairedTTest.compare(rec_lgcn_2, rec_lgcn_6),
    'rec_lgcn_2_vs_rec_lgcn_10': PairedTTest.compare(rec_lgcn_2, rec_lgcn_10),
    'rec_lgcn_6_vs_rec_lgcn_10': PairedTTest.compare(rec_lgcn_6, rec_lgcn_10),
    'pop_lgcn_2_vs_pop_lgcn_6': PairedTTest.compare(pop_lgcn_2, pop_lgcn_6),
    'pop_lgcn_2_vs_pop_lgcn_10': PairedTTest.compare(pop_lgcn_2, pop_lgcn_10),
    'pop_lgcn_6_vs_pop_lgcn_10': PairedTTest.compare(pop_lgcn_6, pop_lgcn_10),

    # DMF
    'rec_dmf_2_vs_rec_dmf_6': PairedTTest.compare(rec_dmf_2, rec_dmf_6),
    'rec_dmf_2_vs_rec_dmf_10': PairedTTest.compare(rec_dmf_2, rec_dmf_10),
    'rec_dmf_6_vs_rec_dmf_10': PairedTTest.compare(rec_dmf_6, rec_dmf_10),
    'pop_dmf_2_vs_pop_dmf_6': PairedTTest.compare(pop_dmf_2, pop_dmf_6),
    'pop_dmf_2_vs_pop_dmf_10': PairedTTest.compare(pop_dmf_2, pop_dmf_10),
    'pop_dmf_6_vs_pop_dmf_10': PairedTTest.compare(pop_dmf_6, pop_dmf_10),

    # NGCF
    'rec_ngcf_2_vs_rec_ngcf_6': PairedTTest.compare(rec_ngcf_2, rec_ngcf_6),
    'rec_ngcf_2_vs_rec_ngcf_10': PairedTTest.compare(rec_ngcf_2, rec_ngcf_10),
    'rec_ngcf_6_vs_rec_ngcf_10': PairedTTest.compare(rec_ngcf_6, rec_ngcf_10),
    'pop_ngcf_2_vs_pop_ngcf_6': PairedTTest.compare(pop_ngcf_2, pop_ngcf_6),
    'pop_ngcf_2_vs_pop_ngcf_10': PairedTTest.compare(pop_ngcf_2, pop_ngcf_10),
    'pop_ngcf_6_vs_pop_ngcf_10': PairedTTest.compare(pop_ngcf_6, pop_ngcf_10),
}

# Compare same dataset, different models
datasets = [2, 6, 10]

for dataset in datasets:
    # The per-run dictionaries are looked up by variable name in the notebook
    # namespace; entries are added metric by metric (rec, then pop) and, within
    # a metric, in the order lgcn-dmf, lgcn-ngcf, dmf-ngcf.
    results.update({
        f'{metric}_{left}_{dataset}_vs_{metric}_{right}_{dataset}': PairedTTest.compare(
            globals()[f'{metric}_{left}_{dataset}'],
            globals()[f'{metric}_{right}_{dataset}'],
        )
        for metric in ('rec', 'pop')
        for left, right in (('lgcn', 'dmf'), ('lgcn', 'ngcf'), ('dmf', 'ngcf'))
    })

# print results when not statistically significant
# The test is written as `not p_value < 0.05` instead of `p_value >= 0.05` so
# that a NaN p-value (an undefined t-test, e.g. all paired differences equal)
# is reported here too. Every comparison with NaN is False, so the previous
# form silently skipped those entries, while the results file written from
# this same dict labels them as not statistically significant.
for key, p_value in results.items():
    if not p_value < 0.05:
        print(f'{key}: {p_value} --> not a statistically significant difference')


In [52]:
# save to file
# One line per comparison: '<key>:', the p-value and a verdict, tab-separated.
with open('ttest_results.tsv', 'w') as fout:
    for key, p_value in results.items():
        out_msg = (
            'statistically significant difference'
            if p_value < 0.05
            else 'not a statistically significant difference'
        )
        fout.write(f'{key}:\t{p_value}\t{out_msg}\n')
