In [None]:
# import necessary packages

import pandas as pd

In [None]:
# Load the four input tables from the parent directory:
#   cb       <- citationBara.csv
#   primdata <- primdata.csv
#   lastdata <- lastdata.csv
#   pair     <- pairs_edited.csv
cb, primdata, lastdata, pair = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../citationBara.csv",
        "../primdata.csv",
        "../lastdata.csv",
        "../pairs_edited.csv",
    )
)

# If pairs_edited.csv is too large to read in one go, read it in chunks and
# concatenate the pieces instead, e.g.:
#   pair = pd.concat(pd.read_csv("../pairs_edited.csv", chunksize=1000000))

In [None]:
# Select which input table the rest of the notebook analyses.
#   "primary" -> primary analysis      (primdata)
#   "last"    -> last author analysis  (lastdata)
# Change ANALYSIS_MODE here instead of commenting lines in and out.
ANALYSIS_MODE = "last"

analysis_sources = {"primary": primdata, "last": lastdata}
if ANALYSIS_MODE not in analysis_sources:
    raise ValueError(
        f"ANALYSIS_MODE must be one of {sorted(analysis_sources)}, got {ANALYSIS_MODE!r}"
    )

# Work on a copy so the loaded table itself is never modified.
data = analysis_sources[ANALYSIS_MODE].copy()

In [None]:
# Build citinfo, a citation table with columns
# citing_doi, citing_year, cited_doi (sorted by cited_doi).
cols = ['citing_doi', 'citing_year', 'cited_doi']

# One (doi, year) record per DOI in the selected analysis data.
paper_years = data[['doi', 'year']].drop_duplicates(subset='doi')

# Inner merge: keep only the rows of cb whose citing_doi is one of those DOIs,
# carrying that paper's year along.
citation_edges = paper_years.merge(
    cb, how='inner', left_on='doi', right_on='citing_doi')
citation_edges = citation_edges.drop(columns='doi')
citation_edges = citation_edges.rename(columns={'year': 'citing_year'})

citinfo = citation_edges.reindex(columns=cols).sort_values('cited_doi')

## m-m pairs

In [None]:
# Pairs in which both last authors are men (gender1 and gender2 are 'male').
both_male = pair['gender1'].eq('male') & pair['gender2'].eq('male')
mmpairs = pair.loc[both_male].reset_index(drop=True)

In [None]:
# Columns of the m-m pairs carried into the similarity table.
mm_pair_columns = [
    'paper1', 'paper2',
    'gender1', 'gender2',
    'year1', 'year2',
    'keyval', 'qval',
]
sim_mm = mmpairs.loc[:, mm_pair_columns]

In [None]:
# Attach citation counts to every m-m pair:
#   count1 = number of distinct citing_doi values in citinfo that cite paper1
#   count2 = the same for paper2
#
# The counts come from a per-DOI lookup table rather than a left merge followed
# by a row count. With the row count, a paper without any citing paper still
# contributed one placeholder row (citing_doi = NaN) and was therefore reported
# with a count of 1, indistinguishable from a paper cited exactly once.
# Here such papers get 0.
citation_counts = (
    citinfo.drop_duplicates(subset=['cited_doi', 'citing_doi'])
           .groupby('cited_doi')['citing_doi']
           .count()
)

# map() leaves NaN for DOIs absent from citinfo -> those papers have 0 citations.
# Row order is unchanged; the index is reset to 0..n-1 so positional and
# label-based access agree. Re-running this cell simply recomputes the columns.
sim_mm = sim_mm.assign(
    count1=sim_mm['paper1'].map(citation_counts).fillna(0).astype('int64'),
    count2=sim_mm['paper2'].map(citation_counts).fillna(0).astype('int64'),
).reset_index(drop=True)

In [None]:
# Year and citation-count differences within each pair (paper1 minus paper2):
#   countd = count1 - count2
#   yeard  = year1  - year2
# Computed column-wise: no progress-bar helper is required (tqdm was never
# imported in this notebook) and the cell does not depend on the index being
# 0..n-1, so it can be re-run safely.
sim_mm = sim_mm.assign(
    countd=sim_mm['count1'] - sim_mm['count2'],
    yeard=sim_mm['year1'] - sim_mm['year2'],
)

# Keep the minimum q-value per pair: sort ascending by qval, then keep the
# first row seen for each (paper1, paper2).
sim_mm = sim_mm.sort_values('qval').drop_duplicates(
    subset=['paper1', 'paper2'], keep='first')

In [None]:
# Write the m-m pair table to the parent directory, without the index column.
mm_output_path = "../mmpairs_similarity_edited.csv"
sim_mm.to_csv(mm_output_path, index=False)

## m-w pairs

In [None]:
# Pairs in which one last author is a man and the other is a woman.
# The two orientations are matched explicitly: a plain `gender1 != gender2`
# test would also let through rows where a gender is missing or carries any
# other label (NaN != NaN evaluates to True).
man_then_woman = pair['gender1'].eq('male') & pair['gender2'].eq('female')
woman_then_man = pair['gender1'].eq('female') & pair['gender2'].eq('male')

mwpairs = pair[man_then_woman | woman_then_man].reset_index(drop=True)

In [None]:
# Columns of the m-w pairs carried into the similarity table.
mw_pair_columns = [
    'paper1', 'paper2',
    'gender1', 'gender2',
    'year1', 'year2',
    'keyval', 'qval',
]
sim_mw = mwpairs.loc[:, mw_pair_columns]

In [None]:
# Attach citation counts to every m-w pair:
#   count1 = number of distinct citing_doi values in citinfo that cite paper1
#   count2 = the same for paper2
#
# The counts come from a per-DOI lookup table rather than a left merge followed
# by a row count. With the row count, a paper without any citing paper still
# contributed one placeholder row (citing_doi = NaN) and was therefore reported
# with a count of 1, indistinguishable from a paper cited exactly once.
# Here such papers get 0.
citation_counts = (
    citinfo.drop_duplicates(subset=['cited_doi', 'citing_doi'])
           .groupby('cited_doi')['citing_doi']
           .count()
)

# map() leaves NaN for DOIs absent from citinfo -> those papers have 0 citations.
# Row order is unchanged; the index is reset to 0..n-1 so positional and
# label-based access agree. Re-running this cell simply recomputes the columns.
sim_mw = sim_mw.assign(
    count1=sim_mw['paper1'].map(citation_counts).fillna(0).astype('int64'),
    count2=sim_mw['paper2'].map(citation_counts).fillna(0).astype('int64'),
).reset_index(drop=True)

In [None]:
# Year and citation-count differences within each m-w pair, oriented so the
# paper whose gender is 'male' comes first in the subtraction:
#   gender1 == 'male' -> paper1 minus paper2
#   otherwise         -> paper2 minus paper1
# Computed column-wise: no progress-bar helper is required (tqdm was never
# imported in this notebook) and the cell does not depend on the index being
# 0..n-1, so it can be re-run safely.
man_is_first = sim_mw['gender1'].eq('male')

sim_mw = sim_mw.assign(
    countd=(sim_mw['count1'] - sim_mw['count2']).where(
        man_is_first, sim_mw['count2'] - sim_mw['count1']),
    yeard=(sim_mw['year1'] - sim_mw['year2']).where(
        man_is_first, sim_mw['year2'] - sim_mw['year1']),
)

# Keep the minimum q-value per pair: sort ascending by qval, then keep the
# first row seen for each (paper1, paper2).
sim_mw = sim_mw.sort_values('qval').drop_duplicates(
    subset=['paper1', 'paper2'], keep='first')

In [None]:
# Write the m-w pair table to the parent directory, without the index column.
mw_output_path = "../mwpairs_similarity_edited.csv"
sim_mw.to_csv(mw_output_path, index=False)

## w-w pairs

In [None]:
# Pairs in which both last authors are women (gender1 and gender2 are 'female').
both_female = pair['gender1'].eq('female') & pair['gender2'].eq('female')
wwpairs = pair.loc[both_female].reset_index(drop=True)

In [None]:
# Columns of the w-w pairs carried into the similarity table.
ww_pair_columns = [
    'paper1', 'paper2',
    'gender1', 'gender2',
    'year1', 'year2',
    'keyval', 'qval',
]
sim_ww = wwpairs.loc[:, ww_pair_columns]

In [None]:
# Attach citation counts to every w-w pair:
#   count1 = number of distinct citing_doi values in citinfo that cite paper1
#   count2 = the same for paper2
#
# The counts come from a per-DOI lookup table rather than a left merge followed
# by a row count. With the row count, a paper without any citing paper still
# contributed one placeholder row (citing_doi = NaN) and was therefore reported
# with a count of 1, indistinguishable from a paper cited exactly once.
# Here such papers get 0.
citation_counts = (
    citinfo.drop_duplicates(subset=['cited_doi', 'citing_doi'])
           .groupby('cited_doi')['citing_doi']
           .count()
)

# map() leaves NaN for DOIs absent from citinfo -> those papers have 0 citations.
# Row order is unchanged; the index is reset to 0..n-1 so positional and
# label-based access agree. Re-running this cell simply recomputes the columns.
sim_ww = sim_ww.assign(
    count1=sim_ww['paper1'].map(citation_counts).fillna(0).astype('int64'),
    count2=sim_ww['paper2'].map(citation_counts).fillna(0).astype('int64'),
).reset_index(drop=True)

In [None]:
# Year and citation-count differences within each pair (paper1 minus paper2):
#   countd = count1 - count2
#   yeard  = year1  - year2
# Computed column-wise: no progress-bar helper is required (tqdm was never
# imported in this notebook) and the cell does not depend on the index being
# 0..n-1, so it can be re-run safely.
sim_ww = sim_ww.assign(
    countd=sim_ww['count1'] - sim_ww['count2'],
    yeard=sim_ww['year1'] - sim_ww['year2'],
)

# Keep the minimum q-value per pair: sort ascending by qval, then keep the
# first row seen for each (paper1, paper2).
sim_ww = sim_ww.sort_values('qval').drop_duplicates(
    subset=['paper1', 'paper2'], keep='first')

In [None]:
# Write the w-w pair table to the parent directory, without the index column.
ww_output_path = "../wwpairs_similarity_edited.csv"
sim_ww.to_csv(ww_output_path, index=False)