<a href="https://colab.research.google.com/github/therobinkay/gender_bias_2021/blob/main/Journal_Analysis_Preparations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import all necessary packages

import io
import pandas as pd
import os
import itertools
import numpy as np
import scipy.stats
import time

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

import math

from collections import defaultdict
from itertools import combinations as comb

import networkx as nx
import random

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive/ so that files
# stored on Drive can be opened through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# NOTE(review): `!cd` is run in a temporary subshell, so it presumably leaves
# the notebook's own working directory unchanged; the CSV reads further down
# still spell out "drive/My Drive/..." in full. `%cd` is the persistent form,
# but switching to it would require shortening those relative paths — confirm
# before changing.
!cd "drive/My Drive"

In [75]:
# Load every input table from the mounted Drive folder.

data = pd.read_csv("drive/My Drive/codes/data.csv")
dp = pd.read_csv("drive/My Drive/codes/doipacs.csv")
cen = pd.read_csv("drive/My Drive/codes/cen.csv")

cdata = pd.read_csv("drive/My Drive/codes/cdata.csv")

# Citation list, extended with both orderings of each DOI pair rendered as a
# single "(a, b)" label; these labels are matched against paper pairs later.
cb = pd.read_csv("drive/My Drive/codes/citationBara.csv").assign(
    pair=lambda t: "(" + t["citing_doi"] + ", " + t["cited_doi"] + ")",
    pair2=lambda t: "(" + t["cited_doi"] + ", " + t["citing_doi"] + ")",
)

# Similarity Preparations

In [79]:
# Step 00a: Creating a DataFrame:
# p(doi, id, gender, year, PACS),
# r(doi, id, gender, year, PACS)

# Keep rows where citing_order and cited_order are both 1 and both
# *_is_alpha flags are False.
row_mask = (
    (cdata['citing_order'] == 1)
    & (cdata['cited_order'] == 1)
    & (cdata['citing_is_alpha'] == False)
    & (cdata['cited_is_alpha'] == False)
)

# Bookkeeping columns that are not carried into the analysis table.
dropped_columns = [
    'citing_order', 'cited_order',
    'citing_numAuthor', 'cited_numAuthor',
    'citing_is_last', 'cited_is_last',
    'citing_is_alpha', 'cited_is_alpha',
    'cited_year',
    'citing_articleType', 'cited_articleType',
    'citing_journal', 'cited_journal',
    'citing_exceptions', 'cited_exceptions',
]

hdata = cdata.loc[row_mask].drop(columns=dropped_columns)

print(len(hdata))
print(hdata['citing_doi'].nunique())

hdata.head()

703005
134908


Unnamed: 0,citing_doi,citing_id,citing_gender,citing_year,citing_0,citing_1,citing_2,citing_3,citing_4,citing_5,citing_6,citing_7,citing_8,citing_9,cited_doi,cited_id,cited_gender,cited_0,cited_1,cited_2,cited_3,cited_4,cited_5,cited_6,cited_7,cited_8,cited_9
86,10.1103/PhysRevB.1.3614,48018,female,1970,False,False,False,False,False,False,False,False,False,False,10.1103/PhysRev.91.174,9374,male,False,False,False,False,False,False,False,False,False,False
88,10.1103/PhysRev.92.1367,9374,male,1953,False,False,False,False,False,False,False,False,False,False,10.1103/PhysRev.91.174,9374,male,False,False,False,False,False,False,False,False,False,False
90,10.1103/RevModPhys.62.251,81636,male,1990,False,False,False,False,False,False,False,False,False,False,10.1103/PhysRev.91.174,9374,male,False,False,False,False,False,False,False,False,False,False
93,10.1103/PhysRevA.10.1494,29373,male,1974,False,False,False,False,False,False,False,False,False,False,10.1103/PhysRev.91.174,9374,male,False,False,False,False,False,False,False,False,False,False
97,10.1103/PhysRevD.74.085005,57188,male,2006,True,True,False,False,False,False,False,False,False,False,10.1103/PhysRev.91.174,9374,male,False,False,False,False,False,False,False,False,False,False


In [80]:
# Step 00b: Creating a DataFrame: paper, gender, year

# One row per citing DOI (the first occurrence is kept), with the citing_*
# columns renamed to the generic paper / gender / year.
rhom = (
    hdata[['citing_doi', 'citing_gender', 'citing_year']]
    .drop_duplicates(subset='citing_doi')
    .rename(columns={
        'citing_doi': 'paper',
        'citing_gender': 'gender',
        'citing_year': 'year',
    })
)

print(len(rhom))
rhom.head()

134908


Unnamed: 0,paper,gender,year
86,10.1103/PhysRevB.1.3614,female,1970
88,10.1103/PhysRev.92.1367,male,1953
90,10.1103/RevModPhys.62.251,male,1990
93,10.1103/PhysRevA.10.1494,male,1974
97,10.1103/PhysRevD.74.085005,male,2006


In [81]:
# Step 01: Creating a DataFrame: paper, reference, count (of reference)
#
# Selects the citations whose citing AND cited paper both have the boolean
# flag for code N set (columns citing_<N> / cited_<N>; presumably the PACS
# flags mentioned in Step 00a), then attaches to every row the number of
# selected citations its reference receives.
#
# The table is built once for N. A loop over range(N+1) would rebuild and
# overwrite `hom` on every pass and rebind N itself, leaving only the last
# pass visible, so the extra passes are omitted.

N = 9 #enter hom number

cited = 'cited_'+str(N)
citing = 'citing_'+str(N)

# .copy() so that adding the 'count' column does not write into a slice of hdata
hom = hdata[(hdata[cited] == True) & (hdata[citing] == True)].copy()

# count = number of selected rows that share the same cited_doi
hom['count'] = hom.groupby('cited_doi')['cited_doi'].transform('count')

hom = (
    hom.sort_values("count", axis=0, ascending=True, na_position='first')
    [['citing_doi', 'cited_doi', 'count']]
    .rename(columns={'citing_doi': 'paper', 'cited_doi': 'reference'})
)

print(len(hom))
hom.head()

20686


Unnamed: 0,paper,reference,count
9383928,10.1103/PhysRevD.80.123524,10.1103/PhysRevD.79.063512,1
9127653,10.1103/PhysRevE.79.036314,10.1103/PhysRevE.50.2048,1
6051565,10.1103/PhysRevB.66.092103,10.1103/PhysRevB.65.193107,1
6059870,10.1103/PhysRevB.66.092103,10.1103/PhysRevB.63.104111,1
9128496,10.1103/PhysRevD.78.103007,10.1103/PhysRevD.76.125019,1


In [82]:
# Step 02: Creating a dictionary of 'hom's by count

# hom_x[c] holds the rows of `hom` whose reference has count == c; keys are
# inserted in the order the counts first appear in `hom`.
hom_x = {}
for n_citations in hom['count'].unique():
  hom_x[n_citations] = hom[hom['count'] == n_citations]

In [83]:
# Step 03: Link papers with connections together as groups
#
# For every count bucket in hom_x:
#   nunique_list[key] - number of distinct references in the bucket
#   hom_y[key]        - one row per pair of papers that share at least one
#                       reference of the bucket, with the number shared

hom_y = {}
nunique_list = {}
for key, hom_x_sub in hom_x.items():
  papers = hom_x_sub["paper"]
  references = hom_x_sub["reference"]

  nunique_list[key] = len(references.unique())

  # reference -> papers citing it, in row order
  citers_by_ref = defaultdict(list)
  for citing_paper, cited_ref in zip(papers, references):
    citers_by_ref[cited_ref].append(citing_paper)

  # (paper_a, paper_b) with paper_a <= paper_b -> number of shared references
  shared_count = defaultdict(int)
  for citers in citers_by_ref.values():
    for paper_pair in comb(sorted(citers), 2):
      shared_count[paper_pair] += 1

  pair_rows = [[first, second, n_shared]
               for (first, second), n_shared in shared_count.items()]
  hom_y[key] = pd.DataFrame(pair_rows, columns=['paper1', 'paper2', 'common'])

In [84]:
# Step 04: Creating a dictionary: paper1, paper2, common, freq1, freq2
#
# freq[key] : for each paper, the number of rows it has in hom_x[key]
# fin[key]  : hom_y[key] with that number attached for paper1 (frequency_x)
#             and for paper2 (frequency_y)

freq = {}
fin = {}

for key, tables in hom_y.items():
  tcount = hom_x[key].groupby(["paper"]).size().reset_index(name='frequency')
  freq[key] = tcount

  # Buckets that produced no paper pairs are left out of `fin`.
  if len(tables) == 0:
    continue

  # Rename before merging so the frequency columns get their final names
  # directly instead of relying on pandas' automatic _x/_y suffixes.
  final_table = tables.merge(
      tcount.rename(columns={'paper': 'paper1', 'frequency': 'frequency_x'}),
      on='paper1')
  final_table = final_table.merge(
      tcount.rename(columns={'paper': 'paper2', 'frequency': 'frequency_y'}),
      on='paper2')

  fin[key] = final_table

# Preview of the bucket whose references have count == 9; this lookup raises
# KeyError when that bucket is missing or produced no pairs.
fin[9].head()

Unnamed: 0,paper1,paper2,common,frequency_x,frequency_y
0,10.1103/PhysRevE.48.4250,10.1103/PhysRevE.50.2607,1,1,1
1,10.1103/PhysRevE.48.4250,10.1103/PhysRevE.51.884,1,1,1
2,10.1103/PhysRevE.50.2607,10.1103/PhysRevE.51.884,1,1,1
3,10.1103/PhysRevE.48.4250,10.1103/PhysRevE.52.3390,1,1,1
4,10.1103/PhysRevE.50.2607,10.1103/PhysRevE.52.3390,1,1,1


In [85]:
# Step 05: Assigning q-values to all pairs
#
# For a pair with `common` shared references, frequencies frequency_x and
# frequency_y, and nbk distinct references in the bucket:
#
#   qval = 1 - sum_{X=0}^{common-1} pmf(X; M=nbk, n=frequency_x, N=frequency_y)
#        = P(X >= common)
#        = sf(common - 1; M=nbk, n=frequency_x, N=frequency_y)
#
# The survival function is evaluated for a whole table in one vectorized call.
# This replaces one scalar pmf call per term per row, and avoids forming the
# small tail probability as 1 - (a number very close to 1).

H_func = scipy.stats.hypergeom.pmf  # name kept available for any cell that refers to it

for key, table in fin.items():
  nbk = nunique_list[key]

  # `table` is the same object as fin[key], so the column is added in place.
  table['qval'] = scipy.stats.hypergeom.sf(
      table['common'].to_numpy() - 1,
      nbk,
      table['frequency_x'].to_numpy(),
      table['frequency_y'].to_numpy())

In [86]:
# Step 06: Creating a DataFrame: paper1, paper2, common, freq1, freq2, qval

# Stack the per-bucket tables into one frame ordered by pair. Each table keeps
# its own row labels, so index values repeat across buckets.
df = pd.concat(fin.values()).sort_values(['paper1', 'paper2'])
print(len(df))
df.sort_values('common', ascending=False).head()

142222


Unnamed: 0,paper1,paper2,common,frequency_x,frequency_y,qval
2164,10.1103/PhysRevD.68.024016,10.1103/PhysRevD.68.103512,4,4,4,1.79751e-07
2177,10.1103/PhysRevD.68.103512,10.1103/PhysRevD.70.043513,4,4,4,1.79751e-07
2175,10.1103/PhysRevD.68.024016,10.1103/PhysRevD.70.043513,4,4,4,1.79751e-07
343,10.1103/PhysRevD.67.023509,10.1103/PhysRevD.67.123515,4,4,4,1.031845e-07
1030,10.1103/PhysRevD.72.023510,10.1103/PhysRevD.80.083002,3,3,4,1.258851e-05


In [87]:
# Step 07: Combining equivalent pairs together with the minimum qval

# Keep only the pair identifiers and their q-value ...
pair_qvals = df.drop(columns=["common", "frequency_x", "frequency_y"])
pair_qvals = pair_qvals.sort_values(["paper1", "paper2", "qval"])
pair_qvals = pair_qvals.reset_index(drop=True)

# ... then reduce every (paper1, paper2) group to its smallest q-value.
df = pair_qvals.groupby(["paper1", "paper2"], as_index=False).min()

print(len(df))
df.sort_values('qval').head()

103313


Unnamed: 0,paper1,paper2,qval
100335,10.1103/PhysRevD.80.083002,10.1103/PhysRevD.80.126018,6.411863e-08
20468,10.1103/PhysRevD.60.104040,10.1103/PhysRevD.69.084018,6.411863e-08
43693,10.1103/PhysRevD.67.023509,10.1103/PhysRevD.67.123515,1.031845e-07
88955,10.1103/PhysRevD.77.023513,10.1103/PhysRevLett.94.151601,1.233891e-07
57204,10.1103/PhysRevD.70.043513,10.1103/PhysRevD.70.044021,1.233891e-07


In [88]:
# Step 08: Assigning k to all pairs

# Render each pair as the same "(a, b)" label used in cb.pair / cb.pair2.
pairs = "(" + df["paper1"] + ", " + df["paper2"] + ")"

# k is True when the label occurs in either orientation of the citation list.
in_forward = pairs.isin(cb["pair"])
in_reverse = pairs.isin(cb["pair2"])
df['k'] = in_forward | in_reverse

print(len(df))
print(df['k'].sum())
df.head()

103313
10278


Unnamed: 0,paper1,paper2,qval,k
0,10.1103/PhysRevA.35.4035,10.1103/PhysRevA.36.4163,0.001441,False
1,10.1103/PhysRevA.36.4700,10.1103/PhysRevA.38.1839,0.003817,True
2,10.1103/PhysRevA.36.4700,10.1103/PhysRevA.40.6260,0.003817,True
3,10.1103/PhysRevA.36.4700,10.1103/PhysRevA.40.6931,0.003817,True
4,10.1103/PhysRevA.36.4700,10.1103/PhysRevA.47.4065,0.003817,True


In [89]:
# Step 09: Merging a DataFrame: adding gender & year

# Attach rhom's gender/year twice: once keyed on paper1, once on paper2.
# Renaming rhom's columns up front gives the merged columns their final
# names (gender1/year1, gender2/year2) without a temporary 'paper' column.
for side in ('1', '2'):
  side_meta = rhom.rename(columns={
      'paper': 'paper' + side,
      'gender': 'gender' + side,
      'year': 'year' + side,
  })
  df = df.merge(side_meta, on='paper' + side, how='inner')

cols = ['paper1', 'gender1', 'year1', 'paper2', 'gender2', 'year2', 'qval', 'k']
df = df.reindex(columns=cols)

print(len(df))
df.head()

103313


Unnamed: 0,paper1,gender1,year1,paper2,gender2,year2,qval,k
0,10.1103/PhysRevA.35.4035,male,1987,10.1103/PhysRevA.36.4163,male,1987,0.001441,False
1,10.1103/PhysRevA.36.4700,male,1987,10.1103/PhysRevA.38.1839,male,1988,0.003817,True
2,10.1103/PhysRevA.36.4700,male,1987,10.1103/PhysRevA.40.6260,male,1989,0.003817,True
3,10.1103/PhysRevA.38.1839,male,1988,10.1103/PhysRevA.40.6260,male,1989,1.5e-05,True
4,10.1103/PhysRevA.36.4700,male,1987,10.1103/PhysRevA.40.6931,male,1989,0.003817,True


In [16]:
# Step XX: Converting to .csv file

# Write the final pair table to Drive; the row index is not written.
output_path = r'drive/My Drive/codes/hom9.csv'
df.to_csv(output_path, index=False)