In [1]:
import json
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import warnings
warnings.filterwarnings("ignore")

#### 对CalNovelty得分结果的汇总

##### uzzi

In [2]:
path = '../CalNovelty/refer/Result/fast_uzzi/reference'
files = os.listdir(path)

# Map every DOI to its reference-based Uzzi novelty score and its year.
# A DOI that occurs more than once keeps the values of the record read last.
paper_to_novelty = {}
for file_name in files:
    with open(os.path.join(path, file_name), 'r') as handle:
        records = json.load(handle)

    paper_to_novelty.update({
        record['doi']: {
            'novelty': record['reference_uzzi']['score']['novelty'],
            'year': record['year'],
        }
        for record in records
    })

len(paper_to_novelty)

312124

In [3]:
# One row per paper; rows with any missing value are dropped before saving.
df = pd.DataFrame({
    'Doi': list(paper_to_novelty),
    'Novelty': [entry['novelty'] for entry in paper_to_novelty.values()],
    'Year': [entry['year'] for entry in paper_to_novelty.values()],
}).dropna()
df.to_csv("base_data/overview/uzzi.csv", index=False)

In [None]:
# Remove outliers: keep only rows whose Novelty lies within 1.5 * IQR of the
# first and third quartiles (both bounds inclusive).
# NOTE(review): this cell writes under data/overview/ while the unfiltered table
# was written under base_data/overview/ - confirm which directory is intended.
quartiles = df['Novelty'].quantile([0.25, 0.75])
Q1, Q3 = quartiles.loc[0.25], quartiles.loc[0.75]
IQR = Q3 - Q1
lower_fence, upper_fence = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

filtered_df = df[df['Novelty'].between(lower_fence, upper_fence)]
filtered_df.to_csv("data/overview/uzzi_filtered.csv", index=False)
filtered_df_stats = filtered_df[['Novelty']].describe()
print(filtered_df_stats)

##### uzzi-sec

有两种方式可以把各章节的结果汇总为论文级别的新颖性得分：

    （1） 合并论文所有章节的组合得分，然后取第10百分位数（即0.1分位数；具体取第几分位数可以再讨论）
    （2） 对论文各章节的得分取平均值，或者直接求和

In [None]:
path = '../CalNovelty/refer/Result/fast_uzzi_sec/reference'

# Group the section-level results by paper. For every paper we collect
#   'comb_array': all combination scores of all its sections, concatenated
#   'sec_novel' : one novelty score per section
paper_to_novelty = {}
files = os.listdir(path)
for file in tqdm(files):
    with open(os.path.join(path, file), 'r') as f:
        datas = json.load(f)

    for data in datas:
        # The paper DOI is everything before the first '_' of the record id.
        # partition() returns the whole id when there is no '_', whereas
        # slicing with find() == -1 would silently drop the last character.
        doi = data['doi'].partition('_')[0]
        entry = paper_to_novelty.setdefault(doi, {'comb_array': [], 'sec_novel': []})
        entry['comb_array'].extend(data['reference_uzzi']['scores_array'])
        entry['sec_novel'].append(data['reference_uzzi']['score']['novelty'])

In [None]:
# Two paper-level aggregations of the section results (NaNs are ignored):
#   merge_comb - 0.1 quantile over all combination scores of the paper
#   merge_sec  - mean of the per-section novelty scores
paper_to_novelty_merge_comb = {}
paper_to_novelty_merge_sec = {}
for doi, scores in tqdm(paper_to_novelty.items()):
    paper_to_novelty_merge_comb[doi] = np.nanquantile(scores['comb_array'], 0.1)
    paper_to_novelty_merge_sec[doi] = np.nanmean(scores['sec_novel'])

In [None]:
# Persist BOTH paper-level aggregations. The section-mean table is the one the
# IQR-filter cell reads back ("data/overview/uzzi_sec_merge_sec.csv"); without
# writing it here a fresh Restart & Run All fails on a missing file.
df_sec = pd.DataFrame({
    'Doi': list(paper_to_novelty_merge_sec.keys()),
    'Novelty': list(paper_to_novelty_merge_sec.values()),
}).dropna()
df_sec.to_csv("data/overview/uzzi_sec_merge_sec.csv", index=False)

# Combination-quantile table; `df` keeps referring to this one, as before.
df = pd.DataFrame({
    'Doi': list(paper_to_novelty_merge_comb.keys()),
    'Novelty': list(paper_to_novelty_merge_comb.values()),
})
print(len(df))  # number of papers before rows with a missing score are dropped
df = df.dropna()
df.to_csv("data/overview/uzzi_sec_merge_comb.csv", index=False)

In [None]:
# Reload the section-mean scores and drop outliers with the 1.5 * IQR rule
# (bounds inclusive).
df = pd.read_csv("data/overview/uzzi_sec_merge_sec.csv")

novelty = df['Novelty']
Q1, Q3 = novelty.quantile([0.25, 0.75])
IQR = Q3 - Q1
within_fences = novelty.ge(Q1 - 1.5 * IQR) & novelty.le(Q3 + 1.5 * IQR)

filtered_df = df.loc[within_fences]
filtered_df.to_csv("data/overview/uzzi_sec_merge_sec_filtered.csv", index=False)

In [6]:
# Attach the publication year to every paper and drop papers without one.
with open('../../data/plos_pub_year.json', 'r') as f:
    pub_year = json.load(f)

# NOTE(review): the filtered table is written under data/overview/ by the
# IQR-filter cell but read from base_data/overview/ here - confirm the path.
df = pd.read_csv('base_data/overview/uzzi_sec_merge_sec_filtered.csv')

# Vectorised membership test instead of a row-by-row iterrows() scan.
has_year = df['Doi'].isin(list(pub_year))
unmatch_ids = df.loc[~has_year, 'Doi'].tolist()  # DOIs with no year entry, kept for inspection

# Explicit copy so that adding a column never writes into a view of `df`.
filter_df = df[has_year].copy()

dois = filter_df['Doi'].tolist()
# 'epub' is the field read from each year entry (presumably the electronic
# publication year - verify against plos_pub_year.json).
years = [pub_year[doi]['epub'] for doi in dois]
filter_df['Year'] = years
filter_df.to_csv('base_data/overview/uzzi_sec_merge_sec_filtered.csv', index=False)

##### wang

In [5]:
path = '../CalNovelty/refer/Result/fast_wang/reference'
files = os.listdir(path)

# Every result file is a JSON object keyed by DOI; merge them all into one
# dict. A DOI present in several files keeps the value of the file read last.
paper_to_novelty = {}
for file_name in tqdm(files):
    with open(os.path.join(path, file_name), 'r') as handle:
        scores_by_doi = json.load(handle)

    paper_to_novelty.update(scores_by_doi)

len(paper_to_novelty)

100%|██████████| 16/16 [00:00<00:00, 28.83it/s]


275598

In [6]:
# One row per paper; papers with a missing score are dropped before saving.
df = pd.DataFrame({
    'Doi': list(paper_to_novelty),
    'Novelty': list(paper_to_novelty.values()),
}).dropna()
df.to_csv("base_data/overview/wang.csv", index=False)

In [None]:
# Drop outliers of the Wang score with the 1.5 * IQR rule (bounds inclusive).
# NOTE(review): output goes to data/overview/ while wang.csv itself lives in
# base_data/overview/ - confirm which directory is intended.
quartiles = df['Novelty'].quantile([0.25, 0.75])
Q1, Q3 = quartiles.iloc[0], quartiles.iloc[1]
IQR = Q3 - Q1

filtered_df = df[df['Novelty'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]
filtered_df_stats = filtered_df[['Novelty']].describe()
filtered_df.to_csv("data/overview/wang_filtered.csv", index=False)

In [7]:
# Attach the publication year to the Wang scores and drop papers without one.
# The result overwrites base_data/overview/wang.csv in place.
with open('../../data/plos_pub_year.json', 'r') as f:
    pub_year = json.load(f)

df = pd.read_csv('base_data/overview/wang.csv')

# Vectorised membership test instead of a row-by-row iterrows() scan.
has_year = df['Doi'].isin(list(pub_year))
unmatch_ids = df.loc[~has_year, 'Doi'].tolist()  # DOIs with no year entry, kept for inspection

# Explicit copy so that adding a column never writes into a view of `df`.
filter_df = df[has_year].copy()

dois = filter_df['Doi'].tolist()
# 'epub' is the field read from each year entry (presumably the electronic
# publication year - verify against plos_pub_year.json).
years = [pub_year[doi]['epub'] for doi in dois]
filter_df['Year'] = years
filter_df.to_csv('base_data/overview/wang.csv', index=False)

##### wang-sec

In [None]:
path = '../CalNovelty/refer/Result/fast_wang_sec/reference'

# Aggregate the section-level Wang scores into one score per paper by summing.
# Plain assignment would keep only whichever section happens to be read last
# and make the 0 initialisation pointless, so the scores are accumulated.
# NOTE(review): sum chosen because the accumulator started at 0; switch to a
# mean if that is the intended aggregation.
paper_to_novelty = {}
files = os.listdir(path)
for file in tqdm(files):
    with open(os.path.join(path, file), 'r') as f:
        datas = json.load(f)

    for section_id, score in datas.items():
        # The paper DOI is everything before the first '_' of the key.
        # partition() returns the whole key when there is no '_', whereas
        # slicing with find() == -1 would silently drop the last character.
        doi = section_id.partition('_')[0]
        if pd.isna(score):
            # Missing section score: register the paper but add nothing, so a
            # paper with no valid section stays NaN and is dropped later.
            paper_to_novelty.setdefault(doi, np.nan)
        elif pd.isna(paper_to_novelty.get(doi, np.nan)):
            paper_to_novelty[doi] = score
        else:
            paper_to_novelty[doi] += score

In [None]:
# One row per paper; papers with a missing score are dropped before saving.
df = pd.DataFrame({
    'Doi': list(paper_to_novelty),
    'Novelty': list(paper_to_novelty.values()),
}).dropna()
df.to_csv("data/overview/wang_sec.csv", index=False)
df_stats = df[['Novelty']].describe()

In [None]:
# Reload the section-level Wang scores and drop outliers with the 1.5 * IQR
# rule (bounds inclusive).
df = pd.read_csv("data/overview/wang_sec.csv")

novelty = df['Novelty']
Q1, Q3 = novelty.quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

filtered_df = df.loc[(novelty >= lower_fence) & (novelty <= upper_fence)]
filtered_df.to_csv("data/overview/wang_sec_filtered.csv", index=False)
filtered_df_stats = filtered_df[['Novelty']].describe()

In [3]:
# Attach the publication year to the section-level Wang scores and drop papers
# without one. The result overwrites the input CSV in place.
with open('../../data/plos_pub_year.json', 'r') as f:
    pub_year = json.load(f)

# NOTE(review): wang_sec.csv is written under data/overview/ by the cell that
# builds it but read from base_data/overview/ here - confirm the path.
df = pd.read_csv('base_data/overview/wang_sec.csv')

# Vectorised membership test instead of a row-by-row iterrows() scan.
has_year = df['Doi'].isin(list(pub_year))
unmatch_ids = df.loc[~has_year, 'Doi'].tolist()  # DOIs with no year entry, kept for inspection

# Explicit copy so that adding a column never writes into a view of `df`.
filter_df = df[has_year].copy()

dois = filter_df['Doi'].tolist()
# 'epub' is the field read from each year entry (presumably the electronic
# publication year - verify against plos_pub_year.json).
years = [pub_year[doi]['epub'] for doi in dois]
filter_df['Year'] = years
filter_df.to_csv('base_data/overview/wang_sec.csv', index=False)

#### 合并F1000数据

In [None]:
file_path = "gold_standard/F1000/PLos Other/"
# Only the CSV exports are inputs. The derived pb_all_tags.xlsx is written into
# this same folder, so an unfiltered listing would feed it to read_csv on a
# rerun. Sorting makes the row order independent of the file-system order.
files = [file for file in sorted(os.listdir(file_path)) if file.lower().endswith('.csv')]

# Read every export once and concatenate in a single call: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop is quadratic.
frames = [
    pd.read_csv(os.path.join(file_path, file))[['title', 'doi', 'classifications']]
    for file in files
]
df_other = pd.concat(frames) if frames else pd.DataFrame()

df_other = df_other.dropna()
len(df_other)

In [None]:
standard_recom_label = ['confirmation','controversial','good for teaching','hypothesis','negative','new finding','novel drug target', 'refutation', 'technical advance']

labels = df_other['classifications'].tolist()
titles = df_other['title'].tolist()
dois = df_other['doi'].tolist()

# Count, per DOI, how many of its rows mention each standard tag.
# format_labels[j][k] is the count of standard_recom_label[j] for new_dois[k].
format_labels = [[] for _ in standard_recom_label]
new_titles, new_dois = [], []
doi_to_pos = {}  # DOI -> its position in new_dois; O(1) lookup instead of list scans

for doi, title, raw_label in zip(dois, titles, labels):
    label = raw_label.lower()
    # 1 when the tag name occurs as a substring of the classification text.
    hits = [int(standard_label in label) for standard_label in standard_recom_label]

    pos = doi_to_pos.get(doi)
    if pos is None:
        # First row for this DOI: open a new slot in every tag column.
        doi_to_pos[doi] = len(new_dois)
        new_dois.append(doi)
        new_titles.append(title)
        for counts, hit in zip(format_labels, hits):
            counts.append(hit)
    else:
        # Repeated DOI: add this row's tags to the existing counts.
        for counts, hit in zip(format_labels, hits):
            counts[pos] += hit

df = pd.DataFrame()
df['Doi'] = new_dois
for i, standard_label in enumerate(standard_recom_label):
    df[standard_label] = format_labels[i]

df.to_excel("gold_standard/F1000/PLos Other/pb_all_tags.xlsx", index=False)

In [None]:
# Load the three F1000 tag tables, reduce them to the shared columns, stack
# them and drop rows that are exact duplicates.
df_100 = pd.read_csv("gold_standard/F1000/PLos 100/plos_100_tags.csv")
df_bio = pd.read_excel("gold_standard/F1000/PLos Biology/pb_all_tags.xlsx")
df = pd.read_excel("gold_standard/F1000/PLos Other/pb_all_tags.xlsx")

standard_recom_label = ['confirmation','controversial','good for teaching','hypothesis',
                        'negative','new finding','novel drug target', 'refutation', 'technical advance']
select_columns = ['Doi', *standard_recom_label]

df_100, df_bio, df = (frame[select_columns] for frame in (df_100, df_bio, df))

df_all = (
    pd.concat([df_100, df_bio, df], axis=0, ignore_index=True)
    .drop_duplicates(ignore_index=True)
)

In [None]:
# Normalise DOIs to their bare form by stripping the resolver prefix, then
# drop the rows that became exact duplicates through the normalisation.
df_all = df_all.dropna()

dois = df_all['Doi'].tolist()
formatted_dois = [doi.replace('https://doi.org/', '') for doi in dois]

df_all['Doi'] = formatted_dois
df_all = df_all.drop_duplicates(ignore_index=True)

In [None]:
# Collapse rows that share a DOI into one record per DOI. The first row seeds
# the record; in later rows every non-zero tag value replaces the stored one.
labels = [df_all[standard_label].tolist() for standard_label in standard_recom_label]
formatted_dois = df_all['Doi'].tolist()

unique_datas = {}
for row, doi in enumerate(formatted_dois):
    row_tags = {name: column[row] for name, column in zip(standard_recom_label, labels)}
    if doi not in unique_datas:
        unique_datas[doi] = row_tags
        continue
    merged = unique_datas[doi]
    for name, value in row_tags.items():
        if value != 0:
            merged[name] = value

# Turn the per-DOI records back into one column per tag, in DOI insertion order.
dois = list(unique_datas)
format_labels = [
    [tags[name] for tags in unique_datas.values()]
    for name in standard_recom_label
]

df = pd.DataFrame()
df['Doi'] = dois
for name, column in zip(standard_recom_label, format_labels):
    df[name] = column
df.to_excel('gold_standard/F1000/Plos_recom_tags.xlsx', index=False)

In [None]:
# Attach the publication year to the merged F1000 tags and drop papers without
# one. The result overwrites Plos_recom_tags.xlsx in place.
# NOTE(review): this cell goes three levels up ('../../../data') while the
# other year-joining cells use '../../data' - confirm the working directory.
with open('../../../data/plos_pub_year.json', 'r') as f:
    pub_year = json.load(f)

df = pd.read_excel('gold_standard/F1000/Plos_recom_tags.xlsx')

# Vectorised membership test instead of a row-by-row iterrows() scan.
has_year = df['Doi'].isin(list(pub_year))
unmatch_ids = df.loc[~has_year, 'Doi'].tolist()  # DOIs with no year entry, kept for inspection

# Explicit copy so that adding a column never writes into a view of `df`.
filter_df = df[has_year].copy()

dois = filter_df['Doi'].tolist()
# 'epub' is the field read from each year entry (presumably the electronic
# publication year - verify against plos_pub_year.json).
years = [pub_year[doi]['epub'] for doi in dois]
filter_df['Year'] = years
filter_df.to_excel('gold_standard/F1000/Plos_recom_tags.xlsx', index=False)