In [1]:
import os

path = os.getcwd()
os.chdir(f"{path}")

The purpose of this notebook is to create pairs of papers with at least 1 common citation, with respective q-values (similarity measurement) and their paper information.

## Import & Check Data

In [2]:
# import all necessary packages

import pandas as pd
import scipy.stats
# from collections import defaultdict
from itertools import combinations as comb
from tqdm import tqdm

In [3]:
cb = pd.read_csv("./citationBara.csv")
primdata = pd.read_csv("./primdata.csv")
# paperdata = pd.read_csv("./paperdata.csv")
# lastdata = pd.read_csv("./lastdata.csv")

In [3]:
# are there duplicated pairs?

if len(cb) == len(cb.drop_duplicates(['citing_doi', 'cited_doi'])):
    print("No duplicated pairs in the data.")
else:
    print("There are duplicated pairs in the data. Remove them.")

cb.head()

No duplicated pairs in the data.


Unnamed: 0,citing_doi,cited_doi
0,10.1103/PhysRevSeriesI.11.215,10.1103/PhysRevSeriesI.1.1
1,10.1103/PhysRevSeriesI.17.267,10.1103/PhysRevSeriesI.1.1
2,10.1103/RevModPhys.51.863,10.1103/PhysRevSeriesI.1.1
3,10.1103/PhysRevSeriesI.4.433,10.1103/PhysRevSeriesI.1.1
4,10.1103/PhysRevSeriesI.17.267,10.1103/PhysRevSeriesI.1.95


In [4]:
print("Number of citing papers: ", cb.citing_doi.nunique())
print("Number of cited papers:  ", cb.cited_doi.nunique())

print("Number of paper nodes:   ", len(list(set(list(cb.citing_doi.unique()) + list(cb.cited_doi.unique())))))

Number of citing papers:  411795
Number of cited papers:   357925
Number of paper nodes:    420477


## Similarity Algorithm (get q-values)

In [5]:
# Step 01: Creating a DataFrame: paper, reference, count (of reference)

sim = cb.copy()

sim['count'] = sim.groupby('cited_doi')['cited_doi'].transform('count')

sim.sort_values("count", axis = 0, ascending = True, inplace = True, na_position ='first')
sim.columns = ["paper", "reference", "count"]

print(len(sim))
sim.head()

4215907


Unnamed: 0,paper,reference,count
4215906,10.1103/PhysRevB.80.235206,10.1103/PhysRevB.80.235205,1
3445237,10.1103/PhysRevD.70.072001,10.1103/PhysRevD.63.031103,1
3445452,10.1103/PhysRevB.67.174112,10.1103/PhysRevB.63.052102,1
3445455,10.1103/PhysRevB.64.184435,10.1103/PhysRevB.63.052410,1
3445461,10.1103/PhysRevB.77.235118,10.1103/PhysRevB.63.052412,1


In [6]:
# Step 02: Creating a dictionary of 'sim's by count

sim_x={key:sim[sim['count']==key] for key in sim['count'].unique()}

In [7]:
# Step 03: Link papers with connections together as groups

sim_y = {}
nunique_list = {}
for key, sim_x_sub in sim_x.items():
    cbt = sim_x_sub
    from_doi = cbt["paper"]
    to_doi = cbt["reference"]

    nunique_list[key] = to_doi.nunique()

    parent = {}
    for ref in to_doi.unique():
        parent[ref] = []
    for a, b in zip(from_doi, to_doi):
        parent[b].append(a)

    pair_count = {}
    for l in parent.values():
        for tp in comb(sorted(l), 2):
            if tp in pair_count:
                pair_count[tp] += 1
            else:
                pair_count[tp] = 1
    dfc = []
    for p, value in pair_count.items():
        dfc.append([p[0], p[1], value])
    dfc = pd.DataFrame(dfc, columns=['paper1', 'paper2', 'common'])
    sim_y[key] = dfc

KeyboardInterrupt: 

In [6]:
# Step 04: Creating a dictionary: paper1, paper2, reference, common, freq1, freq2

freq = {}

for key, tables in sim_y.items():
    ftable = sim_x[key].groupby(["paper"]).size().reset_index(name='frequency')
    freq[key] = ftable

fin = {}
for key, tables in sim_y.items():
    if len(tables) > 0:
        tcount = freq[key]
        final_table = tables.merge(tcount, left_on='paper1', right_on='paper')
        final_table = final_table.merge(tcount, left_on='paper2', right_on='paper')
        final_table = final_table.drop(['paper_x', 'paper_y'], axis=1)
    
        fin[key] = final_table
        fin[key]['keyval'] = key

fin[9].tail()

Unnamed: 0,paper1,paper2,common,frequency_x,frequency_y,keyval
418960,10.1103/PhysRevB.74.235309,10.1103/PhysRevB.75.193308,1,1,1,9
418961,10.1103/PhysRevB.74.075328,10.1103/PhysRevB.76.035301,1,1,2,9
418962,10.1103/PhysRevB.74.235309,10.1103/PhysRevB.76.035301,1,1,2,9
418963,10.1103/PhysRevB.75.193308,10.1103/PhysRevB.76.035301,1,1,2,9
418964,10.1103/PhysRevB.42.5586,10.1103/PhysRevD.16.1965,1,1,1,9


In [7]:
# Step 05: Assigning q-values to all pairs

H_func = scipy.stats.hypergeom.pmf

for key, table in tqdm(fin.items()):
    qval = []
    _dp = {}
    nbk = nunique_list[key]

    for i, r in table.iterrows():
        Hs = []
        di, dj = r['frequency_x'], r['frequency_y']
        nijk = r['common']

        for X in range(nijk):
            _ref = (X, nbk, di, dj)
            if _ref in _dp:
                H = _dp[_ref]
            else:
                H = H_func(X, nbk, di, dj)
                _dp[_ref] = H
            Hs.append(H)
        
        qval.append(1-sum(Hs))
        
    fin[key]['qval'] = qval

fin[9].sort_values('common').tail(10)

100%|██████████| 572/572 [7:05:43<00:00, 44.66s/it]    


Unnamed: 0,paper1,paper2,common,frequency_x,frequency_y,keyval,qval
97183,10.1103/PhysRev.74.1189,10.1103/RevModPhys.16.1,4,4,18,9,-7.770007e-12
4025,10.1103/RevModPhys.26.402,10.1103/RevModPhys.27.77,5,14,27,9,8.181811e-11
96639,10.1103/PhysRevB.75.125302,10.1103/PhysRevB.77.075335,5,5,5,9,1.524236e-11
83358,10.1103/PhysRevC.71.055501,10.1103/PhysRevC.79.055502,6,6,6,9,1.608114e-11
4254,10.1103/RevModPhys.24.321,10.1103/RevModPhys.25.390,6,24,7,9,8.361756e-12
25084,10.1103/RevModPhys.9.245,10.1103/RevModPhys.9.69,10,11,10,9,-2.704503e-12
3110,10.1103/RevModPhys.26.95,10.1103/RevModPhys.29.683,11,23,19,9,-1.261391e-11
22477,10.1103/RevModPhys.20.585,10.1103/RevModPhys.21.271,12,42,16,9,-8.965717e-12
4014,10.1103/RevModPhys.24.321,10.1103/RevModPhys.27.77,14,24,27,9,1.359401e-11
24696,10.1103/RevModPhys.16.1,10.1103/RevModPhys.20.585,18,18,42,9,-3.210543e-12


In [2]:
# Save data as pickle if needed

# import pickle

# with open('fin.pickle', 'wb') as fw:
#     pickle.dump(fin, fw)
    
# with open('fin.pickle', 'rb') as fr:
#     fin = pickle.load(fr)

In [3]:
# Step 06: Creating a DataFrame: paper1, paper2, common, freq1, freq2, qval

df = pd.concat(fin.values())
df.groupby(['paper1', 'paper2']).size()
df = df.sort_values(['paper1', 'paper2'])
print(len(df))
df.sort_values('common', ascending=False).head()

191343760


Unnamed: 0,paper1,paper2,common,frequency_x,frequency_y,keyval,qval
98715,10.1103/RevModPhys.16.1,10.1103/RevModPhys.20.585,25,27,44,8,1.633049e-11
3588,10.1103/RevModPhys.16.1,10.1103/RevModPhys.20.585,24,25,49,7,2.32635e-11
33841,10.1103/RevModPhys.16.1,10.1103/RevModPhys.20.585,23,23,52,5,2.648815e-11
3447,10.1103/RevModPhys.16.1,10.1103/RevModPhys.20.585,22,22,45,6,1.729761e-11
45708,10.1103/RevModPhys.26.95,10.1103/RevModPhys.29.683,19,39,29,8,9.227064e-12


In [1]:
# Step 07: Combining equivalent pairs together with the minimum qval

df = df.drop(["common", "frequency_x", "frequency_y"], axis=1)

df = df.sort_values(["paper1", "paper2", "qval"]).reset_index(drop=True).groupby(
        ["paper1", "paper2"], as_index=False).min()

print(len(df))
# df.sort_values('qval').head()

142053508


In [None]:
# Step 08 UNUSED

# Step 09: Merging a DataFrame: adding gender & year

df = df.merge(lastdata, left_on='paper1', right_on='doi').drop(
    ['doi', 'order', 'numAuthor', 'is_alpha', 'articleType'], axis=1).rename(
    {'gender': 'gender1', 'year': 'year1', 'journal': 'journal1', 'pacs00': 'pacs00_1',
    'pacs10': 'pacs10_1', 'pacs20': 'pacs20_1', 'pacs30': 'pacs30_1', 'pacs40': 'pacs40_1',
    'pacs50': 'pacs50_1', 'pacs60': 'pacs60_1', 'pacs70': 'pacs70_1', 'pacs80': 'pacs80_1',
    'pacs90': 'pacs90_1'}, axis=1)
df = df.merge(lastdata, left_on='paper2', right_on='doi').drop(
    ['doi', 'order', 'numAuthor', 'is_alpha', 'articleType'], axis=1).rename(
    {'gender': 'gender2', 'id': 'id2', 'year': 'year2', 'journal': 'journal2', 'pacs00': 'pacs00_2',
    'pacs10': 'pacs10_2', 'pacs20': 'pacs20_2', 'pacs30': 'pacs30_2', 'pacs40': 'pacs40_2',
    'pacs50': 'pacs50_2', 'pacs60': 'pacs60_2', 'pacs70': 'pacs70_2', 'pacs80': 'pacs80_2',
    'pacs90': 'pacs90_2'}, axis=1)

cols = ['paper1', 'id1', 'gender1', 'year1', 'journal1', 'pacs00_1', 'pacs10_1', 'pacs20_1', 'pacs30_1', 
        'pacs40_1', 'pacs50_1', 'pacs60_1', 'pacs70_1', 'pacs80_1', 'pacs90_1', 'paper2', 'id2', 'gender2',
        'year2', 'journal2', 'pacs00_2', 'pacs10_2', 'pacs20_2', 'pacs30_2', 'pacs40_2', 'pacs50_2',
        'pacs60_2', 'pacs70_2', 'pacs80_2', 'pacs90_2', 'qval', 'keyval']
df = df.reindex(columns=cols)

print(len(df))
df.sort_values('qval').head()

In [18]:
df.to_csv(f'../pairs_edited.csv',index=False)