In [44]:
# calculate error bars

# 1. read from log files

import regex
import glob

def split_filename(filename: str):
    """Pull ``(dataset, model, idx)`` out of a log-file path.

    The last path component is expected to look like
    ``<dataset>-<model>-<tag>_<idx>``: the text before the first underscore
    is split on '-' (at least three fields required) and the second
    underscore-separated field is returned unchanged as ``idx`` (so it keeps
    any file extension, e.g. ``"11.txt"``).

    Returns None when the path contains no '/', when the basename has no
    underscore, or when the dash-separated head has fewer than three fields.
    """
    # A bare file name (no directory part) is rejected.
    if '/' not in filename:
        return None
    basename = filename.rsplit('/', 1)[1]

    underscore_fields = basename.split('_')
    if len(underscore_fields) < 2:
        return None
    head, idx = underscore_fields[0], underscore_fields[1]

    dash_fields = head.split('-')
    if len(dash_fields) < 3:
        return None
    return dash_fields[0], dash_fields[1], idx

def read_log_file(dirpath: str):
    """Collect the logged test scores from every ``*basic*.txt`` file in a directory.

    Each file is scanned line by line for a ``scores:`` entry; when a file
    contains several matching lines, the last one wins.

    Args:
        dirpath: Directory holding the log files. A trailing '/' is appended
            when missing; an empty string is left as is, so the glob pattern
            is then relative to the current working directory.

    Returns:
        Nested dict ``results[dataset][model][idx]`` mapping to a 7-tuple of
        strings in the order MRR, hits@10, map@10, hits@50, map@50, hits@100,
        map@100. ``dataset``, ``model`` and ``idx`` are whatever
        ``split_filename`` returns for the file; files whose names it cannot
        parse are skipped.
    """
    # 2024-06-30 07:13:14,658    - (Testing)    scores: MRR:0.224386 hits@10:0.222489 map@10:0.087642 hits@50:0.408720 map@50:0.096358 hits@100:0.490232 map@100:0.097533, elapse: 3.663 min, gpu memory usage=23832.000 MiB
    pattern = regex.compile(r'scores: MRR:(\d+\.\d+) hits@10:(\d+\.\d+) map@10:(\d+\.\d+) hits@50:(\d+\.\d+) map@50:(\d+\.\d+) hits@100:(\d+\.\d+) map@100:(\d+\.\d+)')
    # Guard the empty string: indexing dirpath[-1] would raise IndexError.
    if dirpath and not dirpath.endswith('/'):
        dirpath += '/'

    results = {}
    for log_file in glob.glob(dirpath + "*basic*.txt"):
        parsed = split_filename(log_file)
        if parsed is None:
            # The glob can match names that do not follow the expected
            # layout; unpacking None here would raise TypeError.
            continue
        dataset, model, idx = parsed
        runs = results.setdefault(dataset, {}).setdefault(model, {})
        with open(log_file, 'r') as f:
            for line in f:
                if "scores" in line:
                    m = pattern.search(line)
                    if m:
                        runs[idx] = m.groups()
        # Debug trace for the wb/tan runs; skipped when the log had no score
        # line, in which case runs[idx] would raise KeyError.
        if dataset == "wb" and model == "tan" and idx in runs:
            print(dataset, model, idx, runs[idx])
    return results

results = read_log_file("/root/pyHeter-GAT/src/")

# 2. calculate mean and std
# Only the 'tan' model on the 'tw' dataset is reported; the outer loop stops
# as soon as that dataset has been handled.
names = ["MRR", "hits@10", "map@10", "hits@50", "map@50", "hits@100", "map@100"]
for dataset, models in results.items():
    if dataset == 'tw':
        for model, idxs in models.items():
            if model == 'tan':
                # scores[k] gathers metric k (same order as `names`) over all runs.
                scores = [[] for _ in names]
                for _, vals in idxs.items():
                    for idx, val in enumerate(vals):
                        scores[idx].append(float(val))
                for idx, name in enumerate(names):
                    print(scores[idx])
                    mean = sum(scores[idx]) / len(idxs)
                    # Standard deviation dividing by the number of runs.
                    squared_dev = sum([(score - mean) ** 2 for score in scores[idx]])
                    std = (squared_dev / len(idxs)) ** 0.5
                    # Both figures are shown as percentages with two decimals.
                    print(dataset, model, name, round(mean*100, 2), round(std*100,2))
        break

wb tan 11.txt ('0.022915', '0.044744', '0.017017', '0.124295', '0.020395', '0.186530', '0.021267')
wb tan 7.txt ('0.022998', '0.045781', '0.017128', '0.124105', '0.020475', '0.186077', '0.021343')
wb tan 8.txt ('0.023104', '0.044817', '0.017185', '0.124850', '0.020590', '0.186092', '0.021451')
wb tan 1.txt ('0.023138', '0.045621', '0.017288', '0.125040', '0.020632', '0.185639', '0.021482')
wb tan 2.txt ('0.023038', '0.044832', '0.017153', '0.122878', '0.020503', '0.184981', '0.021377')
wb tan 3.txt ('0.022994', '0.044963', '0.017069', '0.124076', '0.020465', '0.186720', '0.021345')
wb tan 4.txt ('0.022953', '0.045109', '0.017065', '0.123491', '0.020418', '0.185799', '0.021291')
wb tan 5.txt ('0.022881', '0.045109', '0.017025', '0.123798', '0.020351', '0.186501', '0.021228')
wb tan 6.txt ('0.023178', '0.044919', '0.017285', '0.123988', '0.020653', '0.185946', '0.021523')
wb tan 9.txt ('0.023200', '0.045328', '0.017321', '0.124266', '0.020673', '0.186618', '0.021545')
wb tan 10.txt ('0.0

In [74]:
# /root/pyHeter-GAT/sota-dependencies/Inf-VAE/tw-inf_vae-basic_1.txt

def read_json(dirpath: str):
    """Load the final metric dict from every ``*basic*.txt`` file in a directory.

    The last seven lines of each file are joined, single quotes are swapped
    for double quotes, and the result is parsed as JSON. The parsed object
    must provide the keys MRR, Recall@10, MAP@10, Recall@50, MAP@50,
    Recall@100 and MAP@100.

    Args:
        dirpath: Directory holding the result files. A trailing '/' is
            appended when missing; an empty string is left as is, so the glob
            pattern is then relative to the current working directory.

    Returns:
        Nested dict ``results[dataset]["inf-vae"][idx]`` mapping to the list
        ``[MRR, Recall@10, MAP@10, Recall@50, MAP@50, Recall@100, MAP@100]``.
        ``dataset`` is the first two characters of the file name and ``idx``
        is the text between the last underscore and the first following dot
        of the file name.

    Raises:
        json.JSONDecodeError: if the tail of a file is not a parsable dict.
        KeyError: if an expected metric key is missing.
    """
    import json
    # Guard the empty string: indexing dirpath[-1] would raise IndexError.
    if dirpath and not dirpath.endswith('/'):
        dirpath += '/'
    results = {}
    for json_file in glob.glob(dirpath + "*basic*.txt"):
        # Work on the basename so underscores or dots in directory names
        # cannot leak into the run index.
        basename = json_file.split('/')[-1]
        dataset = basename[:2]
        model = "inf-vae"
        idx = basename.split('_')[-1].split('.')[0]
        with open(json_file, 'r') as f:
            tail = f.readlines()[-7:]
        # Strip only line terminators. Slicing with line[:-1] would cut the
        # closing brace when the file does not end with a newline.
        text = "".join(line.rstrip("\r\n") for line in tail).replace("'", '"')
        j = json.loads(text)
        results.setdefault(dataset, {}).setdefault(model, {})[idx] = [
            j['MRR'],
            j['Recall@10'], j['MAP@10'],
            j['Recall@50'], j['MAP@50'],
            j['Recall@100'], j['MAP@100'],
        ]
    return results

results2 = read_json("/root/pyHeter-GAT/sota-dependencies/Inf-VAE/")

# 2. calculate mean and std
# Only the 'inf-vae' model on the 'wb' dataset is reported; the outer loop
# stops as soon as that dataset has been handled.
names = ["MRR", "hits@10", "map@10", "hits@50", "map@50", "hits@100", "map@100"]
for dataset, models in results2.items():
    if dataset == 'wb':
        for model, idxs in models.items():
            if model == 'inf-vae':
                # scores[k] gathers metric k (same order as `names`) over all runs.
                scores = [[] for _ in names]
                for _, vals in idxs.items():
                    for idx, val in enumerate(vals):
                        scores[idx].append(float(val))
                for idx, name in enumerate(names):
                    print(scores[idx])
                    mean = sum(scores[idx]) / len(idxs)
                    # Standard deviation dividing by the number of runs.
                    squared_dev = sum([(score - mean) ** 2 for score in scores[idx]])
                    std = (squared_dev / len(idxs)) ** 0.5
                    # Both figures are shown as percentages with two decimals.
                    print(dataset, model, name, round(mean*100, 2), round(std*100,2))
        break

[0.1565068138590276, 0.15946014549664625, 0.15165694188479273, 0.15976805853100157, 0.1529991556635849, 0.15568861810622808, 0.15726053335098295, 0.1564570435429694, 0.16942183830637914, 0.15900262347244418]
wb inf-vae MRR 15.78 0.46
[0.013165574535775848, 0.013486941752489656, 0.01326584690232167, 0.013274490963653319, 0.013397486688225031, 0.013779555428633211, 0.014269735303154289, 0.013462739059643672, 0.013913708802677126, 0.01346270730585379]
wb inf-vae hits@10 1.35 0.03
[0.025989456771043716, 0.02753715266096991, 0.026657132618693836, 0.026759657518621062, 0.026166134547073866, 0.025453053865930997, 0.027800998239936384, 0.02776809928851594, 0.027595917259343287, 0.026271499246339786]
wb inf-vae map@10 2.68 0.08
[0.04923980054063875, 0.04967717722880837, 0.05007902963679854, 0.048818594353054676, 0.050922007892850805, 0.05239792801490654, 0.051616866113567125, 0.050651992615210556, 0.04974048000460114, 0.051591713930597236]
wb inf-vae hits@50 5.05 0.11
[0.013582190535606338, 0.0

[0.049891, 0.049372, 0.050704, 0.049194, 0.050901, 0.049703, 0.049591, 0.049262, 0.049553, 0.048964]
tw tan MRR 4.97 0.06
[0.092274, 0.091694, 0.093571, 0.090149, 0.092908, 0.09048, 0.091639, 0.091363, 0.092329, 0.091115]
tw tan hits@10 9.18 0.1
[0.043483, 0.043144, 0.044392, 0.042987, 0.044633, 0.04362, 0.043209, 0.042796, 0.043249, 0.042732]
tw tan map@10 4.34 0.06
[0.190922, 0.186369, 0.191529, 0.184409, 0.189128, 0.183416, 0.187831, 0.19048, 0.189156, 0.187445]
tw tan hits@50 18.81 0.26
[0.047925, 0.047395, 0.04875, 0.047249, 0.048956, 0.047817, 0.0476, 0.047282, 0.047584, 0.047007]
tw tan map@50 4.78 0.06
[0.251987, 0.247406, 0.251904, 0.244619, 0.249421, 0.240397, 0.248924, 0.252318, 0.249945, 0.246827]
tw tan hits@100 24.84 0.35
[0.048791, 0.048255, 0.049605, 0.048104, 0.049805, 0.048621, 0.048466, 0.048153, 0.048446, 0.04785]
tw tan map@100 4.86 0.06


In [10]:
# Sanity check of the score-line pattern against one sample log line.
# `regex` must already be imported in the kernel for this cell to run.
s = "2024-06-30 07:13:14,658    - (Testing)    scores: MRR:0.224386 hits@10:0.222489 map@10:0.087642 hits@50:0.408720 map@50:0.096358 hits@100:0.490232 map@100:0.097533, elapse: 3.663 min, gpu memory usage=23832.000 MiB"
pattern = regex.compile(r'scores: MRR:(\d+\.\d+) hits@10:(\d+\.\d+) map@10:(\d+\.\d+) hits@50:(\d+\.\d+) map@50:(\d+\.\d+) hits@100:(\d+\.\d+) map@100:(\d+\.\d+)')

# findall yields one tuple of the seven captured metric strings per match.
m = pattern.findall(s)
m  # bare expression so the notebook displays the match list

[('0.224386',
  '0.222489',
  '0.087642',
  '0.408720',
  '0.096358',
  '0.490232',
  '0.097533')]

In [4]:
import pickle

# Dataset directories for dump(); judging by the path names these are the
# Weibo and Twitter datasets (TODO confirm). Both end with a '/'.
filepath = "/remote-home/share/dmb_nas/wangzejian/HeterGAT/Weibo-Aminer/"
filepath2 = "/remote-home/share/dmb_nas/wangzejian/HeterGAT/Twitter-Huangxin/sub10000/"

def dump(filepath):
    """Re-serialize the pickled dataset files in a directory with pickle protocol 2.

    For each of ``u2idx.data``, ``train.data``, ``valid.data`` and
    ``test.data`` the object is loaded and written next to the original as
    ``<name>2`` (e.g. ``train.data2``). The original files are left untouched.
    Protocol 2 is presumably chosen so older interpreters can read the
    copies (TODO confirm the consumer).

    Args:
        filepath: Directory containing the four ``*.data`` files, with or
            without a trailing '/'.

    Raises:
        FileNotFoundError: if one of the expected files is missing.
    """
    import os

    names = ["u2idx.data", "train.data", "valid.data", "test.data"]
    for name in names:
        # os.path.join instead of string concatenation, so a directory given
        # without its trailing separator still resolves to the right file.
        source = os.path.join(filepath, name)
        # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
        with open(source, "rb") as f:
            data = pickle.load(f)
        with open(source + "2", "wb") as f:
            pickle.dump(data, f, protocol=2)

# Only the files under filepath2 are converted; the call for filepath is
# left commented out.
# dump(filepath)
dump(filepath2)