In [1]:
import pandas as pd
import numpy as np

In [47]:
# Location of the raw per-title impact CSV read by get_data()
path_in = "/Users/timholdsworth/code/scaling-science/Data/impactByTitle_166M_1950-2020-10.csv"
def get_data(path_in):
    """Read the CSV at ``path_in`` and return its contents as a DataFrame.

    ``path_in`` is handed straight to ``pd.read_csv``.
    """
    return pd.read_csv(path_in)

In [48]:
# Add a column that sums all the values in a row, which are the paper's pagerank scores in the given years
def add_score_sum(df):
    """Add a ``score_sum`` column holding the row-wise total of the numeric columns.

    Missing values are skipped. The frame is modified in place and the same
    object is returned.

    Every numeric column present at call time contributes to the total, so
    this should run before other numeric derived columns are added.
    """
    # numeric_only=True keeps text columns (e.g. the title) out of the
    # reduction. Without it, pandas >= 2.0 tries to add strings to floats and
    # raises instead of silently dropping the non-numeric column.
    df['score_sum'] = df.sum(axis=1, skipna=True, numeric_only=True)
    return df

In [49]:
# Add a column counting how many score columns are populated for each paper
def add_total_years_pub(df):
    """Add ``total_years_pub``: the number of non-null cells per row, minus two.

    The subtraction of two accounts for columns that are not yearly scores
    (presumably the title and ``score_sum`` columns -- verify against the
    caller). The frame is modified in place and returned.
    """
    populated_cells = df.notna().sum(axis=1)
    df['total_years_pub'] = populated_cells - 2
    return df

In [50]:
# Add a column with the average score for a paper
def add_time_weighted_score(df):
    """Add ``time_weighted_score`` = ``score_sum`` / ``total_years_pub``.

    Both input columns must already exist. The frame is modified in place
    and returned.
    """
    totals = df['score_sum']
    periods = df['total_years_pub']
    df['time_weighted_score'] = totals.div(periods)
    return df

In [51]:
# Sort by the most popular papers according to time weighted score
def sort(df):
    """Return a copy of ``df`` ordered by ``time_weighted_score``, highest first.

    The result carries a fresh 0..n-1 index; the input frame is left as is.
    """
    ranked = df.sort_values('time_weighted_score', ascending=False)
    return ranked.reset_index(drop=True)

In [64]:
# How many top-ranked papers to keep
num_results = 100
def clean_for_viz(df, num_results):
    """Keep the first ``num_results`` rows, round to 3 decimals, rename the title column.

    The ``a.title`` column is renamed to ``title``. A new frame is returned.
    """
    top_rows = df.head(num_results)
    return top_rows.round(3).rename(columns={"a.title": "title"})

In [65]:
# Returns a series of len(df) where each value is the (negative, counted from the end) column position the data starts in for each row
def get_total_years_pub(df):
    """Return, per row, the negative offset at which that row's scores begin.

    Despite its name, this does not return ``total_years_pub`` itself: each
    value is ``-(total_years_pub + 3)``. The 3 skips the extra columns at the
    end of the frame, so the result can be used as the start of a slice
    counted from the last column.
    """
    return -(df['total_years_pub'] + 3)

In [91]:
# Method to calculate decay_scores for a given paper, returning the scores as a series
decay_rate = 25
def calc_decay_scores(df, start_col, index, decay_rate):
    """Return the exponentially decayed scores for one paper.

    Parameters
    ----------
    df : DataFrame
        Transposed table: one column per paper, one row per score period,
        followed by three trailing rows that are not scores.
    start_col : Series or array-like
        Per-paper negative row offsets marking where each paper's scores
        start. Read by position, in the same order as ``df``'s columns.
    index : int
        Positional column number of the paper within ``df``.
    decay_rate : number
        Divisor of the exponent: the t-th score (t = 0, 1, 2, ...) is
        multiplied by ``exp(-t / decay_rate)``. ``t`` counts rows, not
        calendar years.

    Returns
    -------
    Series
        Decayed scores with a fresh 0..n-1 index.
    """
    # `index` is a position, so read the offset by position as well. Plain
    # `start_col[index]` is a label lookup on a Series and raises (or picks the
    # wrong paper) whenever the Series index is not exactly 0..n-1.
    start = int(np.asarray(start_col)[index])

    # Get the nondecayed scores: everything from `start` up to the 3 trailing rows
    impact_scores = df.iloc[start:-3, index].reset_index(drop=True)

    # Generate the decay coefficients in one vectorised call
    steps = np.arange(len(impact_scores))
    decay_series = pd.Series(np.exp(-steps / decay_rate))

    # Multiply the decay coefficients by the nondecayed scores
    return decay_series.multiply(impact_scores)

In [92]:
# Method to update the dataframe with the decayed impact scores
def update_df_with_decay_scores(df, start_cols, rate=None):
    """Return a title-indexed copy of ``df`` whose score cells are decayed.

    Parameters
    ----------
    df : DataFrame
        One row per paper. Must have a ``title`` column; the last three
        columns are treated as non-score columns and left untouched.
    start_cols : Series or array-like
        Per-paper negative column offsets marking where each paper's scores
        start, in the same row order as ``df``.
    rate : number, optional
        Decay rate forwarded to ``calc_decay_scores``. When omitted, the
        module-level ``decay_rate`` is used.

    Returns
    -------
    DataFrame
        Indexed by title, values rounded to 3 decimals. The caller's ``df``
        is not modified. Because the table is transposed and transposed back,
        columns come back with a common dtype.
    """
    if rate is None:
        rate = decay_rate

    # Re-key the offsets 0..n-1 so they line up with paper positions no matter
    # what index the caller's frame carries.
    offsets = pd.Series(np.asarray(start_cols))

    # Set the index to paper title and transpose: each column is now one paper
    papers = df.set_index('title').transpose()

    # Walk the papers by position rather than by label, so duplicate titles
    # cannot collide, and write the decayed values straight back into the rows
    # they were read from. This avoids rebuilding row labels and re-aligning
    # the whole frame once per paper.
    for position in range(papers.shape[1]):
        decayed = calc_decay_scores(papers, offsets, position, rate)
        if len(decayed) == 0:
            continue
        first_row = int(offsets.iloc[position])
        # Same slice calc_decay_scores read from: scores up to the 3 trailing rows
        papers.iloc[first_row:-3, position] = decayed.to_numpy()

    return papers.round(3).T

In [93]:
# Output file for the decayed table; the file name embeds num_results
path_out = (
    '/Users/timholdsworth/code/scaling-science/Data/'
    + str(num_results)
    + '_most_impactful_papers_decayed_test.csv'
)
def write_to_csv(df):
    """Write ``df`` to the module-level ``path_out``, labelling the index column ``title``."""
    df.to_csv(path_or_buf=path_out, index_label='title')

In [94]:
#%%timeit
# Loads the raw data, derives the ranking columns and returns the top papers ready for display
def data_prep():
    """Build the ranked table of the ``num_results`` highest-scoring papers.

    Reads the CSV at the module-level ``path_in``, then applies in order:
    score sum, populated-period count, time-weighted score, descending sort,
    and finally the trimming/rounding/renaming done by ``clean_for_viz``.
    """
    frame = get_data(path_in)
    pipeline = (add_score_sum, add_total_years_pub, add_time_weighted_score, sort)
    for step in pipeline:
        frame = step(frame)
    return clean_for_viz(frame, num_results)

# Build the ranked top-papers table and show it as the cell output
df = data_prep()
df

Unnamed: 0,title,1950,1960,1970,1980,1990,2000,2010,2020,score_sum,total_years_pub,time_weighted_score
0,Molecular cloning : a laboratory manual,,,,,89.240,696.480,1365.195,1794.188,3945.103,4,986.276
1,Protein Measurement with the Folin Phenol Reagent,,4.738,77.274,383.015,822.652,1127.160,1363.603,1563.213,5341.653,7,763.093
2,C: Solid State Phys 5,,,,231.804,601.957,686.875,788.043,880.375,3189.054,5,637.811
3,Cleavage of Structural Proteins during the Ass...,,,,20.780,241.634,572.095,819.337,989.481,2643.328,5,528.666
4,A rapid and sensitive method for the quantitat...,,,,,87.742,255.228,462.025,656.286,1461.281,4,365.320
5,A short history of SHELX,,,,,,,144.974,524.440,669.414,2,334.707
6,CRC Handbook of chemistry and physics,,,,,,104.383,262.963,385.951,753.296,3,251.099
7,Numerical Recipes in C: The Art of Scientific ...,,,,,,75.479,256.591,387.431,719.501,3,239.834
8,Handbook of Mathematical Functions,,,5.403,51.170,124.906,240.610,437.622,566.364,1426.074,6,237.679
9,Quantum Computation and Quantum Information,,,,,,,171.890,296.954,468.844,2,234.422


In [95]:
#%%timeit
def exponential_decay(df):
    """Decay the scores in ``df``, write the result to CSV and return it.

    The start offsets come from ``get_total_years_pub``; the decayed table is
    written via ``write_to_csv`` before being returned.
    """
    decayed = update_df_with_decay_scores(df, get_total_years_pub(df))
    write_to_csv(decayed)
    return decayed

# Decay the top-papers table (this also writes the CSV) and show the decayed result
df1 = exponential_decay(df)
df1

Unnamed: 0_level_0,1950,1960,1970,1980,1990,2000,2010,2020,score_sum,total_years_pub,time_weighted_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Molecular cloning : a laboratory manual,,,,,89.240,669.171,1260.234,1591.302,3945.103,4.0,986.276
Protein Measurement with the Folin Phenol Reagent,,4.738,74.244,353.567,729.627,960.502,1116.424,1229.667,5341.653,7.0,763.093
C: Solid State Phys 5,,,,231.804,578.354,634.066,698.931,750.206,3189.054,5.0,637.811
Cleavage of Structural Proteins during the Assembly of the Head of Bacteriophage T4,,,,20.780,232.159,528.110,726.687,843.180,2643.328,5.0,528.666
A rapid and sensitive method for the quantitation of microgram quantities of protein utilizing the principle of protein-dye binding,,,,,87.742,245.220,426.503,582.073,1461.281,4.0,365.320
A short history of SHELX,,,,,,,144.974,503.876,669.414,2.0,334.707
CRC Handbook of chemistry and physics,,,,,,104.383,252.652,356.278,753.296,3.0,251.099
Numerical Recipes in C: The Art of Scientific Computing,,,,,,75.479,246.530,357.644,719.501,3.0,239.834
Handbook of Mathematical Functions,,,5.403,49.164,115.303,213.402,372.917,463.700,1426.074,6.0,237.679
Quantum Computation and Quantum Information,,,,,,,171.890,285.310,468.844,2.0,234.422


In [96]:
#%%timeit
# Rebuild the undecayed top-papers table from the raw CSV
df = data_prep()
def main(df):
    """Entry point: return the decayed version of ``df``.

    Delegates to ``exponential_decay``, which also writes the result to CSV.
    """
    return exponential_decay(df)

# Run the full decay step and keep the result in df_decay.
# NOTE(review): the cell then displays `df` (the undecayed input), not `df_decay` -- confirm this is intentional.
df_decay = main(df)
df

Unnamed: 0,title,1950,1960,1970,1980,1990,2000,2010,2020,score_sum,total_years_pub,time_weighted_score
0,Molecular cloning : a laboratory manual,,,,,89.240,696.480,1365.195,1794.188,3945.103,4,986.276
1,Protein Measurement with the Folin Phenol Reagent,,4.738,77.274,383.015,822.652,1127.160,1363.603,1563.213,5341.653,7,763.093
2,C: Solid State Phys 5,,,,231.804,601.957,686.875,788.043,880.375,3189.054,5,637.811
3,Cleavage of Structural Proteins during the Ass...,,,,20.780,241.634,572.095,819.337,989.481,2643.328,5,528.666
4,A rapid and sensitive method for the quantitat...,,,,,87.742,255.228,462.025,656.286,1461.281,4,365.320
5,A short history of SHELX,,,,,,,144.974,524.440,669.414,2,334.707
6,CRC Handbook of chemistry and physics,,,,,,104.383,262.963,385.951,753.296,3,251.099
7,Numerical Recipes in C: The Art of Scientific ...,,,,,,75.479,256.591,387.431,719.501,3,239.834
8,Handbook of Mathematical Functions,,,5.403,51.170,124.906,240.610,437.622,566.364,1426.074,6,237.679
9,Quantum Computation and Quantum Information,,,,,,,171.890,296.954,468.844,2,234.422
