In [122]:
import pandas as pd
import numpy as np

In [123]:
# Absolute path of the input CSV.
# NOTE(review): this is hard-coded to one user's home directory, so the notebook
# will not run elsewhere without editing it — consider a configurable data directory.
path_in = "/Users/timholdsworth/code/scaling-science/Data/impact_1900-1950.csv"
def get_data(path_in):
    """Read the CSV file at ``path_in`` and return its contents as a DataFrame."""
    return pd.read_csv(path_in)

In [124]:
def add_score_sum(df):
    """Add a ``score_sum`` column holding each row's total over its numeric columns.

    Per the original author's note, the summed values are the paper's PageRank
    scores in the given years. Missing values are skipped (treated as zero).

    Non-numeric columns such as the title are excluded explicitly, and an
    existing ``score_sum`` column is left out of the total so that re-running
    the cell does not fold the previous result into the new one.

    The frame is modified in place and is also returned.
    """
    # Leave a previous result out of the total so the function is idempotent.
    values = df.drop(columns=['score_sum'], errors='ignore')
    # numeric_only=True: pandas >= 2.0 no longer silently drops string columns
    # from a row-wise sum and raises TypeError instead.
    df['score_sum'] = values.sum(axis=1, skipna=True, numeric_only=True)
    return df

In [125]:
def add_total_years_pub(
    df,
    non_year_columns=(
        'a.title',
        'title',
        'score_sum',
        'total_years_pub',
        'time_weighted_score',
    ),
):
    """Add a ``total_years_pub`` column counting the year columns that hold a value.

    Every column not listed in ``non_year_columns`` is treated as a per-year
    score column; a row's count is the number of those columns that are not
    null. Presumably this equals the number of years since the paper was
    published (scores being null before publication) — verify against the data.

    Counting the year columns directly, rather than subtracting a fixed number
    of non-year columns from the frame width, keeps the result correct whether
    or not ``score_sum`` (or any other derived column) is already present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; modified in place.
    non_year_columns : iterable of str, optional
        Column labels to leave out of the count.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``total_years_pub`` added or overwritten.
    """
    excluded = set(non_year_columns)
    year_columns = [col for col in df.columns if col not in excluded]
    df['total_years_pub'] = df[year_columns].notna().sum(axis=1)
    return df

In [126]:
def add_time_weighted_score(df):
    """Add ``time_weighted_score``: ``score_sum`` divided by ``total_years_pub``.

    This is the paper's average score per counted year. The frame is modified
    in place and is also returned.
    """
    total, years = df['score_sum'], df['total_years_pub']
    df['time_weighted_score'] = total / years
    return df

In [127]:
def sort(df):
    """Return the rows ordered by descending ``time_weighted_score``.

    The result is a new frame whose index is renumbered from 0, so the
    highest-scoring paper ends up in row 0.
    """
    ranked = df.sort_values(by=['time_weighted_score'], ascending=False)
    return ranked.reset_index(drop=True)

In [137]:
def clean_for_viz(df, n=100, decimals=3):
    """Trim and tidy a frame for visualisation.

    Keeps the first ``n`` rows, rounds numeric columns to ``decimals`` places
    and renames the ``a.title`` column to ``title``. The input frame is not
    modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to trim; the first rows are kept, so sort it beforehand.
    n : int, optional
        Number of leading rows to keep (default 100).
    decimals : int, optional
        Decimal places to round to (default 3).

    Returns
    -------
    pandas.DataFrame
    """
    return (
        df.head(n)
        .round(decimals)
        .rename(columns={"a.title": "title"})
    )

In [138]:
def main(df=None):
    """Run the whole pipeline and return the frame prepared for visualisation.

    Steps, in order: load the CSV at ``path_in``, add ``score_sum``, add
    ``total_years_pub``, add ``time_weighted_score``, sort by that score, then
    trim and tidy with ``clean_for_viz``.

    Parameters
    ----------
    df : object, optional
        Ignored. The data is always re-read from ``path_in``; the parameter is
        kept only so existing ``main(df)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        The result of ``clean_for_viz`` on the sorted, scored frame.
    """
    # The original passed `path`, a name not defined in any cell shown in this
    # notebook; the input location is stored in `path_in`.
    raw = get_data(path_in)
    with_sum = add_score_sum(raw)
    with_years = add_total_years_pub(with_sum)
    scored = add_time_weighted_score(with_years)
    ranked = sort(scored)
    return clean_for_viz(ranked)
# Display the current `df` as this cell's output.
# NOTE(review): no cell visible above assigns `df`, so this line appears to rely
# on a value left in the kernel by an earlier run — confirm it survives
# Restart Kernel -> Run All.
df

Unnamed: 0,title,1900,1901,1902,1903,1904,1905,1906,1907,1908,...,1944,1945,1946,1947,1948,1949,1950,score_sum,total_years_pub,time_weighted_score
0,Zur Frage des Einflusses der Milz auf den Eise...,,,,,,,,,,...,0.40,0.40,0.40,0.40,0.40,0.40,0.40,7.68,23,0.33
1,Zur Pathogenese des Sogenannten „Röntgenkaters“,,,,,,,,,,...,0.28,0.28,0.28,0.28,0.28,0.28,0.28,7.77,28,0.28
2,Substituirte Stickstoffbromide und ihre Bezieh...,0.15,0.28,0.28,0.28,0.28,0.28,0.28,0.28,0.28,...,0.28,0.28,0.28,0.28,0.28,0.28,0.28,14.03,51,0.28
3,Über die Acetylierung von löslicher Stärke,,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,...,0.39,0.39,0.39,0.39,0.39,0.39,0.39,13.73,50,0.27
4,The action of young's glycotropic factor of th...,,,,,,,,,,...,0.28,0.28,0.28,0.28,0.28,0.28,0.28,3.48,13,0.27
5,Weitere Mittheilungen über glashäutige Neubild...,0.15,0.15,0.15,0.15,0.28,0.28,0.28,0.28,0.28,...,0.28,0.28,0.28,0.28,0.28,0.28,0.28,13.64,51,0.27
6,DERMATITIS VENENATA CAUSED BY THE MANZANILLO TREE,,,,,,,,,,...,0.28,0.28,0.28,0.28,0.28,0.28,0.28,2.09,8,0.26
7,Mind in the Making,,,,,,,,,0.15,...,0.28,0.28,0.28,0.28,0.28,0.28,0.28,11.17,43,0.26
8,A Comparison of Triploid and Diploid Crossing ...,,,,,,,,,,...,0.28,0.28,0.28,0.28,0.28,0.28,0.28,4.89,19,0.26
9,Catastrophes of peptic ulcer,,,,,,,,,,...,0.28,0.28,0.28,0.28,0.28,0.28,0.28,4.34,17,0.26


In [143]:
# Absolute path of the output CSV written by write_to_csv.
# NOTE(review): this is hard-coded to one user's home directory, so the notebook
# will not run elsewhere without editing it — consider a configurable data directory.
path_out = '/Users/timholdsworth/code/scaling-science/Data/100_most_impactful_papers.csv'
def write_to_csv(df, path=None):
    """Write ``df`` to a CSV file, omitting the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    path : str or path-like, optional
        Destination file. When omitted, the module-level ``path_out`` is used,
        which matches the original behaviour.
    """
    destination = path_out if path is None else path
    df.to_csv(destination, index=False)

In [144]:
# main() overwrites its argument with freshly loaded data, so pass None rather
# than depending on a `df` left over from an earlier kernel session.
df = main(None)
write_to_csv(df)