In [None]:
import pandas as pd
import numpy as np

In [None]:
# Base name (without extension) of the input CSV; also reused when naming output files.
file_name = 'impactByTitle_166M_1950-2020-10'
# Full path of the input CSV, assembled from the data directory, the base name and the extension.
path_in = ''.join(['/Users/timholdsworth/code/scaling-science/Data/', file_name, '.csv'])

def get_data(path_in):
    """Read the CSV at ``path_in`` with ``pd.read_csv`` and return the resulting DataFrame."""
    return pd.read_csv(path_in)

In [None]:
# Add a column that sums all the numeric values in a row (the paper's pagerank scores in the given years)
def add_score_sum(df):
    """Add a ``score_sum`` column holding the row-wise sum of the numeric columns.

    Only numeric columns take part in the sum, so a text column (such as the
    paper title) no longer makes the row-wise reduction fail on pandas
    versions that refuse to silently drop non-numeric columns.
    Missing values are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with a ``score_sum`` column.
    """
    # numeric_only=True keeps string columns out of the reduction.
    df['score_sum'] = df.sum(axis=1, skipna=True, numeric_only=True)
    return df

In [None]:
# Add a column counting, per row, how many of the score columns are populated
def add_total_years_pub(df):
    """Add ``total_years_pub`` = (number of columns - 2) - (null cells in the row).

    The frame is modified in place and also returned.
    NOTE(review): the constant 2 presumably stands for the title and
    ``score_sum`` columns present at this point of the pipeline - confirm.
    """
    n_score_cols = len(df.columns) - 2
    n_missing = df.isna().sum(axis=1)
    df['total_years_pub'] = n_score_cols - n_missing
    return df

In [None]:
# Add a column with the average score per counted year for a paper
def add_time_weighted_score(df):
    """Add ``time_weighted_score`` = ``score_sum`` / ``total_years_pub``.

    The frame is modified in place and also returned.
    """
    df['time_weighted_score'] = df['score_sum'].div(df['total_years_pub'])
    return df

In [None]:
# Order papers from highest to lowest time weighted score
def sort(df):
    """Return a new frame sorted by ``time_weighted_score`` descending, with a fresh 0..n-1 index."""
    ranked = df.sort_values(by=['time_weighted_score'], ascending=False)
    return ranked.reset_index(drop=True)

In [None]:
# Number of top-ranked rows kept for the visualisation
num_results = 100
def clean_for_viz(df, num_results):
    """Keep the first ``num_results`` rows, round to 3 decimals and rename ``a.title`` to ``title``.

    Returns a new frame; ``df`` itself is not modified.
    """
    top_rows = df.head(num_results).round(3)
    return top_rows.rename(columns={"a.title": "title"})

In [None]:
# Returns a series of len(df) holding, for each row, the negative offset at which its data starts
def get_total_years_pub(df):
    """Return ``-(total_years_pub + 3)`` for every row.

    The 3 skips the three extra columns that sit after the score columns,
    and the negation turns the value into an offset counted from the end.
    """
    return -(df['total_years_pub'] + 3)

In [None]:
# Method to calculate decay_scores for a given paper, returning the scores as a series
def calc_decay_scores(df, start_col, index, decay_rate):
    """Apply exponential decay to one paper's scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Transposed frame: one column per paper, with the last three rows
        being non-score rows that are excluded from the window.
    start_col : indexable
        ``start_col[index]`` is the (negative) row offset at which the
        paper's scores start.
    index : int
        Positional column number of the paper in ``df``.
    decay_rate : number
        Divisor in the exponent; larger values decay more slowly.

    Returns
    -------
    pandas.Series
        ``score[t] * exp(-t / decay_rate)`` for ``t = 0, 1, ...`` over the
        selected window, indexed 0..n-1.
    """
    # Get the nondecayed scores: rows from the start offset up to the 3 trailing rows
    start = start_col[index]
    impact_scores = df.iloc[start:-3, index].reset_index(drop=True)

    # One coefficient per step. The exponent must depend on the step t;
    # a constant exp(-1 / decay_rate) would scale every score identically.
    # NOTE(review): t counts score rows, not calendar years - confirm that
    # decay_rate is meant to be expressed in the same unit.
    steps = np.arange(len(impact_scores))
    decay_series = pd.Series(np.exp(-steps / decay_rate))

    # Multiply the decay coefficients by the nondecayed scores
    return decay_series.multiply(impact_scores)

In [None]:
# Method to update the dataframe with the decayed impact scores
def update_df_with_decay_scores(df, start_cols, decay_rate):
    """Replace every paper's scores with their decayed values.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per paper with a ``title`` column; the last three columns
        are treated as non-score columns and left untouched.
    start_cols : indexable
        ``start_cols[i]`` is the negative offset at which paper ``i``'s
        scores start (as produced by ``get_total_years_pub``).
    decay_rate : number
        Passed through to ``calc_decay_scores``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by title, rounded to 3 decimals, in the original
        (paper-per-row) orientation. ``df`` itself is not modified.
    """
    # Set the index to paper title and transpose so that each column is one paper
    by_paper = df.set_index('title').transpose()

    # Walk the papers by position: no label lookups are needed, so duplicate
    # titles cannot confuse the column selection.
    for position in range(by_paper.shape[1]):
        decay_score = calc_decay_scores(by_paper, start_cols, position, decay_rate)
        if decay_score.empty:
            # Nothing to write for a paper without any score rows
            continue

        # Write the decayed values back into exactly the rows they were read
        # from (same start offset, same 3 trailing rows excluded). This avoids
        # rebuilding year labels under an assumed fixed step between columns.
        start = start_cols[position]
        by_paper.iloc[start:-3, position] = decay_score.to_numpy()

    return by_paper.round(3).T

In [None]:
def write_to_csv(df, decay_rate):
    """Write ``df`` to a CSV whose name encodes the input file, result count and decay rate.

    Reads the module-level ``file_name`` and ``num_results``; the index column
    is labelled ``title`` in the output.
    """
    name_parts = [
        '/Users/timholdsworth/code/scaling-science/Data/DecayIter',
        file_name,
        '_',
        str(num_results),
        '_results_decayed_at_',
        str(decay_rate),
        '.csv',
    ]
    df.to_csv(''.join(name_parts), index_label='title')

In [None]:
#%%timeit
# Loads the data, scores and ranks the papers, and returns the top rows cleaned for visualisation
def data_prep():
    """Run the preparation pipeline on the CSV at ``path_in``.

    Steps, in order: load, add ``score_sum``, add ``total_years_pub``,
    add ``time_weighted_score``, sort, then keep and clean the first
    ``num_results`` rows.
    """
    return (
        get_data(path_in)
        .pipe(add_score_sum)
        .pipe(add_total_years_pub)
        .pipe(add_time_weighted_score)
        .pipe(sort)
        .pipe(clean_for_viz, num_results)
    )

# Run the preparation pipeline once and keep the result in the module-level `df`.
df = data_prep()
#df

In [None]:
#decay_rate = 25
def exponential_decay(df, decay_rate):
    """Decay the scores in ``df`` at ``decay_rate``, write the result to CSV and return it."""
    offsets = get_total_years_pub(df)
    decayed = update_df_with_decay_scores(df, offsets, decay_rate)
    write_to_csv(decayed, decay_rate)
    return decayed

#df1 = exponential_decay(df)
#df1

In [None]:
#%%timeit
# Re-runs the whole data_prep() pipeline (including the CSV read) and rebinds the module-level `df`.
df = data_prep()
def main(df, decay_rate=None):
    """Apply exponential decay to ``df`` and write the result to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared frame, as returned by ``data_prep``.
    decay_rate : number, optional
        Decay rate to use. When omitted, the module-level ``decay_rate`` is
        used, so existing ``main(df)`` calls keep working.

    Raises
    ------
    NameError
        If no rate is passed and no module-level ``decay_rate`` exists.

    Returns
    -------
    None
    """
    if decay_rate is None:
        # Backward-compatible fallback: pick up the rate from module scope.
        try:
            decay_rate = globals()['decay_rate']
        except KeyError:
            raise NameError("name 'decay_rate' is not defined") from None
    exponential_decay(df, decay_rate)
    print("Runnning main")


In [None]:
# Run main() once for each decay rate 5, 10, ..., 95.
# NOTE: main() is called without a rate and picks up the module-level `decay_rate`,
# so the loop variable must keep exactly this name.
for decay_rate in range(5, 100, 5):
    main(df)