In [1]:
# !pip install habanero --quiet

In [2]:
import pandas as pd
import numpy as np
from habanero import Crossref
from habanero import counts
from habanero import cn

In [3]:
def find_citation_counts(df, laureate):
    """Look up a citation count for every paper belonging to one laureate.

    Warning: every paper costs one habanero lookup and habanero takes
    forever, so use this function at your own risk.

    Parameters
    ----------
    df : pandas.DataFrame
        Publication table with 'Laureate name' and 'DOI' columns.
    laureate
        Value compared for equality against the 'Laureate name' column.

    Returns
    -------
    list
        One ``counts.citation_count`` result per matching row, in row order.
        Empty when no row matches.
    """
    belongs_to_laureate = df['Laureate name'] == laureate
    papers = df[belongs_to_laureate]
    # No error handling here: a lookup that raises aborts the whole call.
    return [counts.citation_count(doi=paper.DOI) for paper in papers.itertuples()]

def find_all_citation_counts(df):
    """Collect citation counts for every laureate's papers into one flat list.

    Parameters
    ----------
    df : pandas.DataFrame
        Publication table with 'Laureate name' and 'DOI' columns.

    Returns
    -------
    list
        The per-laureate results of ``find_citation_counts`` joined end to
        end. Laureates are visited in the order ``unique()`` returns them, so
        the list is grouped by laureate rather than following the row order
        of ``df``.
    """
    return [
        count
        for name in df['Laureate name'].unique()
        for count in find_citation_counts(df, name)
    ]

In [4]:
def get_citation_count(doi):
    """Fetch the citation count for one DOI without ever raising.

    Intended to be mapped over a whole 'DOI' column, so a bad entry must not
    abort the run.

    Parameters
    ----------
    doi
        DOI to look up. Anything that is not a non-empty string (None, NaN
        from a missing cell, blank text) is treated as "no DOI".

    Returns
    -------
    The value returned by ``counts.citation_count`` on success, or ``None``
    when there is no usable DOI or the lookup raises.
    """
    # Missing cells in the DOI column arrive as NaN/None rather than text.
    # Skip them up front so no lookup is attempted and no error line is
    # printed for a value that could never resolve.
    if not isinstance(doi, str) or not doi.strip():
        return None
    try:
        return counts.citation_count(doi=doi)
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on with the
        # remaining DOIs.
        print(f"Error retrieving {doi}: {e}")
        return None

In [5]:
# Each field has two inputs: its publication record and a per-laureate table
# of p-values and gender.

# Chemistry (the record file is read with pandas' default encoding).
df_chem = pd.read_csv('data_nature_paper/Chemistry publication record.csv')
df_chem_vals = pd.read_csv('chem_p_and_gender_data.csv')

# Physics (the record file is read as latin1).
df_phys = pd.read_csv(
    'data_nature_paper/Physics publication record.csv',
    encoding='latin1',
)
df_phys_vals = pd.read_csv('phys_p_and_gender_data.csv')

# Medicine (the record file is read as latin1).
df_med = pd.read_csv(
    'data_nature_paper/Medicine publication record.csv',
    encoding='latin1',
)
df_med_vals = pd.read_csv('med_p_and_gender_data.csv')

In [6]:
# Preview the chemistry per-laureate table (laureate name, p-value, gender).
df_chem_vals.head()

Unnamed: 0,Laureate name,p-value,Gender
0,"stoddart, j",0.250622,male
1,"feringa, b",0.524845,male
2,"sauvage, j",0.047244,male
3,"modrich, p",0.196429,male
4,"lindahl, t",0.133333,male


In [7]:
# For each field: label every publication row with its field (this adds the
# column to the record frame in place), then left-join the per-laureate
# p-value / gender table on the laureate name so every publication row is kept.

# Chemistry
df_chem['field'] = 'Chemistry'
df_chem_merged = pd.merge(df_chem, df_chem_vals, how='left', on='Laureate name')

# Physics
df_phys['field'] = 'Physics'
df_phys_merged = pd.merge(df_phys, df_phys_vals, how='left', on='Laureate name')

# Medicine
df_med['field'] = 'Medicine'
df_med_merged = pd.merge(df_med, df_med_vals, how='left', on='Laureate name')

In [8]:
# Preview the chemistry publication records; 'field' is present because the
# previous cell added it to df_chem in place.
df_chem.head()

Unnamed: 0,Laureate ID,Laureate name,Prize year,Title,Pub year,Paper ID,DOI,Journal,Affiliation,Is prize-winning paper,field
0,20001,"stoddart, j",2016,a molecular shuttle,1991,1976039000.0,10.1021/ja00013a096,journal of the american chemical society,northwestern university,YES,Chemistry
1,20001,"stoddart, j",2016,chemical synthesis of nanostructures,1993,1963538000.0,10.1557/PROC-330-57,mrs proceedings,northwestern university,NO,Chemistry
2,20001,"stoddart, j",2016,formation and x ray crystal structure of pt h2...,1981,1963552000.0,10.1039/C39810000851,journal of the chemical society chemical commu...,northwestern university,NO,Chemistry
3,20001,"stoddart, j",2016,single walled carbon nanotubes under the influ...,2005,2095637000.0,10.1002/smll.200400070,small,northwestern university,NO,Chemistry
4,20001,"stoddart, j",2016,synthesis of medium heterocyclic rings from 6 ...,1974,2095679000.0,10.1016/S0008-6215(00)82105-9,carbohydrate research,northwestern university,NO,Chemistry


In [9]:
# Stack the three merged per-field frames (chemistry, medicine, physics, in
# that order) into one table; ignore_index=True renumbers the rows from 0.
df_all = pd.concat([df_chem_merged, df_med_merged, df_phys_merged], ignore_index=True)
df_all.head()

Unnamed: 0,Laureate ID,Laureate name,Prize year,Title,Pub year,Paper ID,DOI,Journal,Affiliation,Is prize-winning paper,field,p-value,Gender
0,20001,"stoddart, j",2016,a molecular shuttle,1991.0,1976039000.0,10.1021/ja00013a096,journal of the american chemical society,northwestern university,YES,Chemistry,0.250622,male
1,20001,"stoddart, j",2016,chemical synthesis of nanostructures,1993.0,1963538000.0,10.1557/PROC-330-57,mrs proceedings,northwestern university,NO,Chemistry,0.250622,male
2,20001,"stoddart, j",2016,formation and x ray crystal structure of pt h2...,1981.0,1963552000.0,10.1039/C39810000851,journal of the chemical society chemical commu...,northwestern university,NO,Chemistry,0.250622,male
3,20001,"stoddart, j",2016,single walled carbon nanotubes under the influ...,2005.0,2095637000.0,10.1002/smll.200400070,small,northwestern university,NO,Chemistry,0.250622,male
4,20001,"stoddart, j",2016,synthesis of medium heterocyclic rings from 6 ...,1974.0,2095679000.0,10.1016/S0008-6215(00)82105-9,carbohydrate research,northwestern university,NO,Chemistry,0.250622,male


In [10]:
# Cut df_all into eight consecutive row blocks so the slow citation lookup can
# be shared out. The first seven blocks hold chunk_size rows each; the eighth
# is open-ended and therefore also takes the remainder rows.
n = len(df_all)
chunk_size = n // 8  # Integer division

(
    split_1,
    split_2,
    split_3,
    split_4,
    split_5,
    split_6,
    split_7,
    split_8,
) = (
    df_all.iloc[k * chunk_size:(None if k == 7 else (k + 1) * chunk_size)]
    for k in range(8)
)

## Fetch citation counts for one split: set `x` to your split number, then run the following cell

In [None]:
# # SET X HERE FOR YOUR SPLIT
# x = 1 # CHANGE THIS

# split_x['citation_count'] = split_x['DOI'].apply(lambda d: get_citation_count(d)) # CHANGE THE X IN THIS
# split_x.to_csv(f'split_{x}_citation_counts.csv', index=False) # CHANGE THE X IN THIS

KeyboardInterrupt: 

In [11]:
# tqdm.pandas() registers the progress_apply method on pandas objects; the
# full-dataset fetch in the next cell relies on it for its progress bar.
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Full fetch over every DOI in df_all. Left commented out because it is
# extremely slow (reported to take about 3 days to run).
# df_all['citation_count'] = df_all['DOI'].progress_apply(lambda d: get_citation_count(d))

  0%|          | 68/93394 [00:20<7:56:59,  3.26it/s]


KeyboardInterrupt: 

## Unverified: the cells below this point have not been confirmed to work

In [12]:
# Reload the combined table from disk; this version already carries a
# citation_count column. This rebinds df_all, replacing the in-memory frame
# built earlier in the notebook.
# NOTE(review): no visible cell writes 'df_all.csv' - presumably it was
# assembled from the per-split CSVs outside this notebook; confirm its origin.
df_all = pd.read_csv('df_all.csv')
df_all.head()

Unnamed: 0,Laureate ID,Laureate name,Prize year,Title,Pub year,Paper ID,DOI,Journal,Affiliation,Is prize-winning paper,field,p-value,Gender,citation_count
0,20001,"stoddart, j",2016,a molecular shuttle,1991.0,1976039000.0,10.1021/ja00013a096,journal of the american chemical society,northwestern university,YES,Chemistry,0.250622,male,667.0
1,20001,"stoddart, j",2016,chemical synthesis of nanostructures,1993.0,1963538000.0,10.1557/PROC-330-57,mrs proceedings,northwestern university,NO,Chemistry,0.250622,male,1.0
2,20001,"stoddart, j",2016,formation and x ray crystal structure of pt h2...,1981.0,1963552000.0,10.1039/C39810000851,journal of the chemical society chemical commu...,northwestern university,NO,Chemistry,0.250622,male,10.0
3,20001,"stoddart, j",2016,single walled carbon nanotubes under the influ...,2005.0,2095637000.0,10.1002/smll.200400070,small,northwestern university,NO,Chemistry,0.250622,male,87.0
4,20001,"stoddart, j",2016,synthesis of medium heterocyclic rings from 6 ...,1974.0,2095679000.0,10.1016/S0008-6215(00)82105-9,carbohydrate research,northwestern university,NO,Chemistry,0.250622,male,6.0


In [13]:
# Count missing values per column, including how many rows lack a citation_count.
df_all.isna().sum()

Laureate ID                   0
Laureate name                 0
Prize year                    0
Title                         1
Pub year                      1
Paper ID                     33
DOI                         198
Journal                    1230
Affiliation               17514
Is prize-winning paper        0
field                         0
p-value                       0
Gender                        0
citation_count              201
dtype: int64

In [14]:
# Keep only rows that have a citation_count; missing values in other columns
# are left alone.
# NOTE: this rebinds df_all, so the unfiltered frame can only be recovered by
# re-reading the CSV.
df_all = df_all.dropna(subset=['citation_count'])

In [36]:
# Percentile rank of each paper's citation count among the papers that share
# its 'Laureate name' (adds the column to df_all in place).
df_all['citation_percentile'] = (
    df_all
    .groupby('Laureate name')['citation_count']
    .rank(pct=True)
)

# One row per laureate: the highest percentile reached by any of their papers
# flagged 'YES' as prize-winning.
nobel_percentiles = (
    df_all
    .loc[lambda frame: frame['Is prize-winning paper'] == 'YES']
    .groupby('Laureate name')['citation_percentile']
    .max()
    .reset_index()
    .rename(columns={'citation_percentile': 'nobel_citation_percentile'})
)

# Attach that per-laureate value to every paper row. The left join keeps all
# papers, so laureates with no flagged paper end up with NaN.
df_ah = df_all.merge(nobel_percentiles, on='Laureate name', how='left')

In [37]:
# Check that both percentile columns were attached to the paper rows.
df_ah.head()

Unnamed: 0,Laureate ID,Laureate name,Prize year,Title,Pub year,Paper ID,DOI,Journal,Affiliation,Is prize-winning paper,field,p-value,Gender,citation_count,citation_percentile,nobel_citation_percentile
0,20001,"stoddart, j",2016,a molecular shuttle,1991.0,1976039000.0,10.1021/ja00013a096,journal of the american chemical society,northwestern university,YES,Chemistry,0.250622,male,667.0,0.981743,0.981743
1,20001,"stoddart, j",2016,chemical synthesis of nanostructures,1993.0,1963538000.0,10.1557/PROC-330-57,mrs proceedings,northwestern university,NO,Chemistry,0.250622,male,1.0,0.156846,0.981743
2,20001,"stoddart, j",2016,formation and x ray crystal structure of pt h2...,1981.0,1963552000.0,10.1039/C39810000851,journal of the chemical society chemical commu...,northwestern university,NO,Chemistry,0.250622,male,10.0,0.257676,0.981743
3,20001,"stoddart, j",2016,single walled carbon nanotubes under the influ...,2005.0,2095637000.0,10.1002/smll.200400070,small,northwestern university,NO,Chemistry,0.250622,male,87.0,0.736515,0.981743
4,20001,"stoddart, j",2016,synthesis of medium heterocyclic rings from 6 ...,1974.0,2095679000.0,10.1016/S0008-6215(00)82105-9,carbohydrate research,northwestern university,NO,Chemistry,0.250622,male,6.0,0.219087,0.981743


In [38]:
# Columns kept for the exported table, in output order.
important_columns = [
    'Laureate name',
    'Title',
    'Is prize-winning paper',
    'p-value',
    'Gender',
    'field',
    'citation_count',
    'citation_percentile',
    'nobel_citation_percentile',
    'Pub year',
]

# Narrow df_ah to those columns (rebinds df_ah; the other columns are dropped).
df_ah = df_ah.loc[:, important_columns]
df_ah.head()

Unnamed: 0,Laureate name,Title,Is prize-winning paper,p-value,Gender,field,citation_count,citation_percentile,nobel_citation_percentile,Pub year
0,"stoddart, j",a molecular shuttle,YES,0.250622,male,Chemistry,667.0,0.981743,0.981743,1991.0
1,"stoddart, j",chemical synthesis of nanostructures,NO,0.250622,male,Chemistry,1.0,0.156846,0.981743,1993.0
2,"stoddart, j",formation and x ray crystal structure of pt h2...,NO,0.250622,male,Chemistry,10.0,0.257676,0.981743,1981.0
3,"stoddart, j",single walled carbon nanotubes under the influ...,NO,0.250622,male,Chemistry,87.0,0.736515,0.981743,2005.0
4,"stoddart, j",synthesis of medium heterocyclic rings from 6 ...,NO,0.250622,male,Chemistry,6.0,0.219087,0.981743,1974.0


In [39]:
# Write the trimmed table to disk; index=False leaves the row index out of the file.
df_ah.to_csv('df_complete.csv', index=False)