In [34]:
import pandas as pd
import numpy as np

In [49]:
df = pd.read_csv("/Users/timholdsworth/code/scaling-science/Data/impact_1900-1950.csv")

In [50]:
df = df.head(5)

In [51]:
# Takes in a series of values and weighs them
def time_weighted_score(df):
    # Add up all the values in a row, which are all the paper's pagerank scores in given years
    df['score_sum'] = df.sum(axis=1, skipna=True)
    # Define time_since_pub as the number of years in the graph - the number of values that are not NaN
    df['time_since_pub'] = len(df.columns) - 4 - df.isnull().sum(axis=1, skipna=True) 
    df['weighted_score'] = df['score_sum'] / df['time_since_pub']
    df = df.rename(columns={"a.title": "title"})
    return df

In [52]:
df = time_weighted_score(df).sort_values(by=['weighted_score'], ascending=False).reset_index(drop=True)
df = df.head(1000)
df

Unnamed: 0,title,1900,1901,1902,1903,1904,1905,1906,1907,1908,...,1944,1945,1946,1947,1948,1949,1950,score_sum,time_since_pub,weighted_score
0,"""A"" Stage Hangar Line Inspection",,,,,,,,,,...,0.15,0.15,0.15,0.15,0.15,0.15,0.15,2.7,16,0.16875
1,""" Cod Liver Oil."" Source-Manufacture-Substitut...",,,,,,,,,,...,0.15,0.15,0.15,0.15,0.15,0.15,0.15,3.45,21,0.164286
2,""" Rickettsia ""-Bodies as a Result of Cell-Dige...",,,,,,,,,,...,0.15,0.15,0.15,0.15,0.15,0.15,0.15,4.2,26,0.161538
3,""" Codigo civil portuguez annotado etc. "". - T....",0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,...,0.15,0.15,0.15,0.15,0.15,0.15,0.15,7.65,49,0.156122
4,""" THE PREVENTION OF SMALL-POX.""",0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,...,0.15,0.15,0.15,0.15,0.15,0.15,0.15,7.65,49,0.156122


In [53]:
# Get the number of years since publication as a series for all rows in database
time_since_pub = df.loc[:, 'time_since_pub']
# Becauset there are 3 extra column at the end
start_col_pos = time_since_pub + 3
# Because we want the last certain number of columns
start_col = -start_col_pos

In [61]:
# TODO figure out why multiplying these two gives a series double the length of each of the series being multiplied
def exp_decay(df, start_col):
    for index, row in df.iterrows():

        # Get the column which the data starts for this row 
        start = start_col[index]
        # Create a series of the impact scores for this row
        impact_scores = df.iloc[index, start:-3].reset_index(drop=True)
       
        #print('Printing Impact score')
        #print(impact_scores)
        #print('Impact score is length ' + str(len(impact_scores)))
        #print()
        
        # Create an Series with decay coefficients that is the same length as the impact_scores series
        time = np.arange(len(impact_scores))
        decay_list = [np.exp(-t / 25) for t in time]
        decay_series = pd.Series(decay_list)
        
        #print('Printing decay series')
        #print(decay_series)
        #print('Decay series is length ' + str(len(decay_series)))
        #print()
        
        decayed_score = decay_series.multiply(impact_scores)
        
        #print('Printing decayed score')
        #print(decayed_score)
        #print('Decayed score is length ' + str(len(decayed_score)))
        #print()
        
        # Write the results back to the appropriate columns in dataframe
        # Figure out how to update the dataframe
        df.iloc[index, start:-3].update(decayed_score)

    return df
    
df1 = exp_decay(df, start_col)

In [60]:
df1.to_csv('/Users/timholdsworth/code/scaling-science/Data/1000_most_impactful_papers_decayed.csv')

In [373]:
# Extra code I used to test out multiplying a series
# An array of values from 0 to (100 - the number of years a paper has been published)
s = pd.Series([10, 10, 10, 10])
time = np.arange(len(s))
# Array of coefficients that get smaller as time goes on
decay_coeff = [np.exp(-t / 15) for t in time] 
#print(decay_coeff)

# Multiple a given series by its corresponding decay coefficients
s.multiply(decay_coeff)

0    10.000000
1     9.355070
2     8.751733
3     8.187308
dtype: float64