<a href="https://colab.research.google.com/github/therobinkay/firstmover/blob/main/First_mover_advantage_SIMILARITY_ALGORITHM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive/ so that the CSV inputs read later in
# this notebook (under "drive/My Drive/codes/") are reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# NOTE(review): `!cd` runs in a temporary subshell, so it does not change the
# notebook's working directory; the read_csv calls in this notebook still use
# paths starting with "drive/My Drive/". Use `%cd` if a persistent directory
# change is wanted (those relative paths would then need adjusting).
!cd "drive/My Drive"

In [None]:
# import all necessary packages

import pandas as pd
import scipy.stats
from collections import defaultdict
from itertools import combinations as comb

In [None]:
# Load the two input tables from Google Drive.

cdata = pd.read_csv('drive/My Drive/codes/cdata.csv')

# The citation table additionally gets two text labels per row, one for each
# ordering of the DOI pair, so that a pair can later be looked up regardless
# of which paper is listed first.
cb = pd.read_csv("drive/My Drive/codes/citationBara.csv").assign(
    pair=lambda t: "(" + t["citing_doi"] + ", " + t["cited_doi"] + ")",
    pair2=lambda t: "(" + t["cited_doi"] + ", " + t["citing_doi"] + ")",
)

# Similarity Preparations

In [None]:
# Step 00a: Build `hdata` from `cdata`.
# Rows are kept only when both the citing and the cited record have
# order == 1 and is_alpha == False; the columns listed below are then removed.

row_filter = (
    'citing_order == 1 & cited_order == 1'
    ' & citing_is_alpha == False & cited_is_alpha == False'
)

unused_columns = [
    'citing_order', 'cited_order',
    'citing_numAuthor', 'cited_numAuthor',
    'citing_is_last', 'cited_is_last',
    'citing_is_alpha', 'cited_is_alpha',
    'cited_year',
    'citing_articleType', 'cited_articleType',
    'citing_journal', 'cited_journal',
    'citing_exceptions', 'cited_exceptions',
]

hdata = cdata.query(row_filter).drop(columns=unused_columns)

# print(len(hdata))
# print(hdata['citing_doi'].nunique())

# hdata.head()

In [None]:
# Step 00b: One row per citing paper with its gender and year
# (the first row seen for each citing DOI is the one kept).

refsim = (
    hdata[['citing_doi', 'citing_gender', 'citing_year']]
    .drop_duplicates(subset='citing_doi')
    .rename(columns={'citing_doi': 'paper',
                     'citing_gender': 'gender',
                     'citing_year': 'year'})
)

# print(len(refsim))
# refsim.head()

In [None]:
# Step 01: Creating a DataFrame: paper, reference, count (of reference)
#
# Keeps the rows of `hdata` whose flag columns `cited_<N>` and `citing_<N>`
# are both True, then records for every kept row how many kept rows point at
# the same cited DOI. The result is ordered by that count, ascending.

N = 1 #enter sim number

# Only the table for N itself is used downstream, so it is built directly.
# Looping over 0..N would overwrite `sim` on every pass (keeping just the
# last one) and would additionally require the lower-numbered flag columns
# to exist.
cited = 'cited_'+str(N)
citing = 'citing_'+str(N)

flagged = hdata[(hdata[cited] == True) & (hdata[citing] == True)]

# `assign` works on a fresh copy, which avoids writing a new column into a
# filtered view of `hdata`.
sim = (
    flagged
    .assign(count=flagged.groupby('cited_doi')['cited_doi'].transform('count'))
    .sort_values('count', ascending=True, na_position='first')
    [['citing_doi', 'cited_doi', 'count']]
    .rename(columns={'citing_doi': 'paper', 'cited_doi': 'reference'})
)

# print(len(sim))
# sim.head()

In [None]:
# Step 02: Split `sim` into one sub-table per distinct `count` value.
# Keys appear in order of first occurrence in `sim`.

sim_x = {count_value: rows
         for count_value, rows in sim.groupby('count', sort=False)}

In [None]:
# Step 03: For every count group, list each pair of papers that cite a common
# reference and how many references of that group the pair shares.
#
# sim_y[key]        -> DataFrame(paper1, paper2, common)
# nunique_list[key] -> number of distinct references in the group

sim_y = {}
nunique_list = {}
for count_key, rows in sim_x.items():
  nunique_list[count_key] = len(rows["reference"].unique())

  # Invert the table: reference -> papers citing it.
  citers_by_ref = defaultdict(list)
  for citer, ref in zip(rows["paper"], rows["reference"]):
    citers_by_ref[ref].append(citer)

  # Every unordered pair of citers of one reference shares that reference;
  # sorting first makes (a, b) and (b, a) land on the same key.
  shared = defaultdict(int)
  for citers in citers_by_ref.values():
    for duo in comb(sorted(citers), 2):
      shared[duo] += 1

  sim_y[count_key] = pd.DataFrame(
      [(first, second, n_shared)
       for (first, second), n_shared in shared.items()],
      columns=['paper1', 'paper2', 'common'])

In [None]:
# Step 04: Attach to every pair the number of references (within the same
# count group) cited by each of its two papers.
#
# freq[key] -> DataFrame(paper, frequency)
# fin[key]  -> DataFrame(paper1, paper2, common, frequency_x, frequency_y)
#              where frequency_x belongs to paper1 and frequency_y to paper2;
#              groups without any pair are left out of `fin`.

freq = {}
fin = {}
for key, pair_table in sim_y.items():
  per_paper = sim_x[key].groupby(["paper"]).size().reset_index(name='frequency')
  freq[key] = per_paper

  if pair_table.empty:
    continue

  with_first = pair_table.merge(per_paper, left_on='paper1', right_on='paper')
  with_both = with_first.merge(per_paper, left_on='paper2', right_on='paper')
  # The second merge suffixes the clashing lookup columns with _x / _y; the
  # two `paper` copies duplicate paper1 / paper2 and are not needed.
  fin[key] = with_both.drop(columns=['paper_x', 'paper_y'])

# fin[9].head()

In [None]:
# Step 05: Assigning q-values to all pairs
#
# For a pair sharing `common` references, the q-value is the hypergeometric
# upper tail P(X >= common), with (in scipy's parameter order)
#   M = nbk          distinct references in this count group,
#   n = frequency_x  references of the group cited by paper1,
#   N = frequency_y  references of the group cited by paper2.
#
# P(X >= common) equals the survival function evaluated at common - 1.
# Using `sf` directly, instead of 1 - sum(pmf(0..common-1)), avoids the
# cancellation error that turns very small tail probabilities into 0 or
# slightly negative numbers, and evaluates a whole table in one call.

for key, table in fin.items():
  nbk = nunique_list[key]
  table['qval'] = scipy.stats.hypergeom.sf(
      table['common'].to_numpy() - 1,
      nbk,
      table['frequency_x'].to_numpy(),
      table['frequency_y'].to_numpy())

In [None]:
# Step 06: Creating a DataFrame: paper1, paper2, common, freq1, freq2, qval
#
# Stacks the per-count tables of `fin` into a single frame ordered by pair.
# A pair can occur once per count group, so (paper1, paper2) is not unique
# here. (A group-size computation whose result was discarded has been
# removed.)

df = pd.concat(fin.values()).sort_values(['paper1', 'paper2'])
# print(len(df))
# df.sort_values('common', ascending=False).head()

In [None]:
# Step 07: Combining equivalent pairs together with the minimum qval
#
# Result: one row per (paper1, paper2) with columns paper1, paper2, qval,
# ordered by the pair. Selecting `qval` explicitly (rather than dropping the
# other columns first) keeps this cell safe to re-run, and no pre-sorting is
# needed because `min` does not depend on row order.

df = df.groupby(["paper1", "paper2"], as_index=False)["qval"].min()

# print(len(df))
# df.sort_values('qval').head()

In [None]:
# Step 08: Flag each pair with k = True when the two papers appear together
# in the citation table `cb`, in either order.

pair_labels = "(" + df["paper1"] + ", " + df["paper2"] + ")"
labels_either_order = pd.concat([cb.pair, cb.pair2])
df['k'] = pair_labels.isin(labels_either_order)

# print(len(df))
# print(sum(df['k']))
# df.head()

In [None]:
# Step 09: Look up gender and year for both papers of every pair.
# Inner joins: a pair is dropped when either paper is missing from `refsim`.

for side in ('1', '2'):
  side_info = refsim.rename(columns={'paper': 'paper' + side,
                                     'gender': 'gender' + side,
                                     'year': 'year' + side})
  df = df.merge(side_info, on='paper' + side, how='inner')

# Final column layout.
cols = ['paper1', 'gender1', 'year1', 'paper2', 'gender2', 'year2', 'qval', 'k']
df = df.reindex(columns=cols)

# print(len(df))
# df.head()

In [None]:
# Step XX: Converting to .csv file

# df.to_csv(r'drive/My Drive/codes/sim1.csv',index=False)