In [1]:
import pandas as pd
import os
import numpy as np
from lxml import etree
from tqdm import tqdm
import warnings
import json
warnings.filterwarnings("ignore")

##### 异常值的产生

In [2]:
def get_outlier_data(root_path, filename):
    df_raw = pd.read_csv(os.path.join(root_path, f"{filename}.csv"))
    df_filtered = pd.read_csv(os.path.join(root_path, f"{filename}_filtered.csv"))

    raw_dois = df_raw['Doi'].tolist()
    filter_dois = df_filtered['Doi'].tolist()
    outlier_dois = list(set(raw_dois) - set(filter_dois))

    df_outlier = df_raw[df_raw['Doi'].isin(outlier_dois)]
    df_outlier.to_csv(f"robust/outliers/{filename}.csv", index=False)

In [None]:
file_path = 'base_data/overview'

get_outlier_data(file_path, 'uzzi_sec_merge_comb')
get_outlier_data(file_path, 'uzzi_sec_merge_sec')
get_outlier_data(file_path, 'uzzi')

get_outlier_data(file_path, 'wang_sec')
get_outlier_data(file_path, 'wang')

In [4]:
# 看不同方法的异常值论文是否较为一致
def check_common_outlier(file_1, file_2, save=False, save_path=None):
    df_1 = pd.read_csv(os.path.join('robust/outliers', file_1))
    df_2 = pd.read_csv(os.path.join('robust/outliers', file_2))

    dois_1 = df_1['Doi'].tolist()
    dois_2 = df_2['Doi'].tolist()

    common_dois = set(dois_1).intersection(set(dois_2))
    print(f"file_1: {len(dois_1)} file_2: {len(dois_2)} common: {len(common_dois)}")

    if save:
        df_common = df_1[df_1['Doi'].isin(common_dois)]
        df_common.to_csv(os.path.join('robust/outliers/', save_path), index=False)

In [None]:
check_common_outlier('uzzi_sec_merge_comb.csv', 'uzzi_sec_merge_sec.csv', True, 'uzzi_sec_with_comb.csv')
check_common_outlier('uzzi_sec_merge_comb.csv', 'wang_sec.csv', True, 'wang_with_uzzi_comb.csv')
check_common_outlier('uzzi_sec_merge_sec.csv', 'wang_sec.csv', True, 'wang_with_uzzi_sec.csv')

file_1: 17127 file_2: 26331 common: 12503
file_1: 17127 file_2: 29209 common: 729
file_1: 26331 file_2: 29209 common: 1171


In [12]:
# 看异常值中有多少是在金标准中的，如果几乎没有，那可以确定没影响
gold_data_path = 'gold_standard/F1000/Plos_recom_tags.xlsx'

def check_common_with_gold(outlier_file):
    df_outlier = pd.read_csv(os.path.join('robust/outliers', outlier_file))
    outlier_dois = df_outlier['Doi'].tolist()

    df_gold = pd.read_excel(gold_data_path)
    gold_dois = df_gold['Doi'].tolist()

    print(len(set(outlier_dois).intersection(gold_dois)))

pos_recom_label = ['hypothesis', 'new finding', 'novel drug target', 'technical advance']
def save_common_with_gold(outlier_file):
    df_outlier = pd.read_csv(os.path.join('robust/outliers', outlier_file))
    outlier_dois = df_outlier['Doi'].tolist()

    novelty_type = []
    df_gold = pd.read_excel(gold_data_path)
    for row in df_gold.iterrows():
        flag = False
        for column in pos_recom_label:
            if row[1][column] != 0:
                novelty_type.append(1)
                flag = True
                break
        if not flag:
            novelty_type.append(0)
    df_gold['Novelty_Type'] = novelty_type

    simple_df_gold = df_gold[['Doi', 'Novelty_Type']]
    merged_df = pd.merge(simple_df_gold, df_outlier, on='Doi', how='inner')

    merged_df.to_csv(os.path.join('robust/outliers', 'outlier_in_gold_' + outlier_file), index=False)

In [13]:
save_common_with_gold('wang_sec.csv')
save_common_with_gold('uzzi_sec_merge_comb.csv')
save_common_with_gold('uzzi_sec_merge_sec.csv')

#### 分析带异常值的结果

In [2]:
df_all = pd.read_excel('gold_standard/F1000/Plos_recom_tags.xlsx')
print(df_all.shape)
pos_plos_dois = []
pos_recom_label = ['hypothesis', 'new finding', 'novel drug target', 'technical advance']

for row in df_all.iterrows():
    for column in pos_recom_label:
        if row[1][column] != 0:
            pos_plos_dois.append(row[1]['Doi'])
            break

dois = df_all['Doi'].tolist()
years = df_all['Year'].tolist()
doi_to_year = {dois[i]:years[i] for i in range(len(dois))}
neg_plos_dois = [x for x in dois if x not in pos_plos_dois]

print(len(pos_plos_dois))
print(len(neg_plos_dois))

(5082, 11)
4662
420


In [3]:
def generate_balanced_file(path, saved_path):
    df = pd.read_csv(path)
    df_pos = df[df['Doi'].isin(pos_plos_dois)]
    df_neg = df[df['Doi'].isin(neg_plos_dois)]
    pos_grouped = df_pos.groupby('Year')
    neg_grouped = df_neg.groupby('Year')

    sampled_pos = pd.DataFrame()
    for year, neg_year_grouped in neg_grouped:
        sample_size = len(neg_year_grouped)

        pos_year_group = pos_grouped.get_group(year)
        sampled_pos_year = pos_year_group.sample(n=sample_size, random_state=2024)

        sampled_pos = pd.concat([sampled_pos, sampled_pos_year])

    sampled_pos = sampled_pos.reset_index(drop=True)
    df_pos = sampled_pos

    df_pos['label'] = [1 for i in range(len(df_pos))]
    df_neg['label'] = [0 for i in range(len(df_neg))]

    df = pd.concat([df_pos, df_neg], axis=0, ignore_index=True)
    df.to_excel(saved_path, index=False)

def generate_unbalanced_file(path, saved_path):
    df = pd.read_csv(path)
    df_pos = df[df['Doi'].isin(pos_plos_dois)]
    df_neg = df[df['Doi'].isin(neg_plos_dois)]

    df_pos['label'] = [1 for i in range(len(df_pos))]
    df_neg['label'] = [0 for i in range(len(df_neg))]

    df = pd.concat([df_pos, df_neg], axis=0, ignore_index=True)
    df.to_excel(saved_path, index=False)

In [4]:
root_path = 'base_data/overview'
saved_path = 'robust/log-just-outlier'

generate_balanced_file(os.path.join(root_path, 'uzzi_sec_merge_comb.csv'),
                       os.path.join(saved_path, 'uzzi_sec_merge_comb_balanced.xlsx'))

generate_balanced_file(os.path.join(root_path, 'uzzi_sec_merge_sec.csv'),
                       os.path.join(saved_path, 'uzzi_sec_merge_sec_balanced.xlsx'))

generate_balanced_file(os.path.join(root_path, 'wang_sec.csv'),
                       os.path.join(saved_path, 'wang_sec_balanced.xlsx'))

generate_balanced_file(os.path.join(root_path, 'wang.csv'),
                       os.path.join(saved_path, 'wang_balanced.xlsx'))

generate_balanced_file(os.path.join(root_path, 'uzzi.csv'),
                       os.path.join(saved_path, 'uzzi_balanced.xlsx'))                

In [5]:
root_path = 'base_data/overview'
saved_path = 'robust/log-just-outlier'

generate_unbalanced_file(os.path.join(root_path, 'uzzi_sec_merge_comb.csv'),
                       os.path.join(saved_path, 'uzzi_sec_merge_comb_unbalanced.xlsx'))

generate_unbalanced_file(os.path.join(root_path, 'uzzi_sec_merge_sec.csv'),
                       os.path.join(saved_path, 'uzzi_sec_merge_sec_unbalanced.xlsx'))

generate_unbalanced_file(os.path.join(root_path, 'wang_sec.csv'),
                       os.path.join(saved_path, 'wang_sec_unbalanced.xlsx'))

generate_unbalanced_file(os.path.join(root_path, 'wang.csv'),
                       os.path.join(saved_path, 'wang_unbalanced.xlsx'))

generate_unbalanced_file(os.path.join(root_path, 'uzzi.csv'),
                       os.path.join(saved_path, 'uzzi_unbalanced.xlsx'))                

#### 添加其他控制变量

In [10]:
file_path = 'G://Dataset//PLOS//allofplos'
files = os.listdir(file_path)
random_intergers = np.random.randint(0, len(files), size=5)
test_files = [files[i] for i in random_intergers]

In [12]:
# 作者数量、机构数量
tree = etree.parse(os.path.join(file_path, files[0]))
authors = tree.xpath("//contrib[@contrib-type=\"author\"]")
affs = tree.xpath("//aff[contains(@id, 'aff')]")

print(os.path.join(file_path, test_files[0]))
print(f"authors: {len(authors)} affs: {len(affs)}")

G://Dataset//PLOS//allofplos\journal.pone.0089109.xml
authors: 6 affs: 3


In [1]:
# 将控制变量信息添加在表格当中
def add_control_variable(root_path, save_path, paper_file, control_variable_file):
    with open(control_variable_file, 'r') as f:
        doi_to_datas = json.load(f)

    df = pd.read_excel(os.path.join(root_path,paper_file))
    dois = df['Doi'].tolist()
    authors = [doi_to_datas[doi]['author'] for doi in dois]
    affs = [doi_to_datas[doi]['aff'] for doi in dois]
    df['Author'] = authors
    df['Aff'] = affs
    df.to_excel(os.path.join(save_path, paper_file), index=False)

In [3]:
root_path = 'gold_standard/regression'
save_path = 'robust/log-extra-control-variable'
control_variable_file = 'robust/log-extra-control-variable/plos_author_aff.json'

add_control_variable(root_path, save_path, 'uzzi_sec_merge_comb_filtered_unbalanced.xlsx', control_variable_file)
add_control_variable(root_path, save_path, 'uzzi_sec_merge_comb_filtered_balanced.xlsx', control_variable_file)
add_control_variable(root_path, save_path, 'uzzi_sec_merge_sec_filtered_balanced.xlsx', control_variable_file)
add_control_variable(root_path, save_path, 'uzzi_sec_merge_sec_filtered_unbalanced.xlsx', control_variable_file)
add_control_variable(root_path, save_path, 'uzzi_filtered_balanced.xlsx', control_variable_file)
add_control_variable(root_path, save_path, 'uzzi_filtered_unbalanced.xlsx', control_variable_file)