In [385]:
import pandas as pd
import numpy as np

In [386]:
# Location of the input CSV holding the per-year impact scores of each paper
path_in = "/Users/timholdsworth/code/scaling-science/Data/100_most_impactful_papers.csv"
def get_data(path_in):
    """Load the CSV found at ``path_in`` and hand back the resulting DataFrame.

    Parameters
    ----------
    path_in : str or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    return pd.read_csv(path_in)

In [387]:
# Builds a series of len(df) holding, per row, the (negative) column offset at which that row's data starts
def get_start_columns(df):
    """Compute where each row's yearly data begins, counted from the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``total_years_pub`` column.

    Returns
    -------
    pandas.Series
        ``-(total_years_pub + 3)`` for every row, usable as a negative
        positional offset.
    """
    # Three extra columns sit after the yearly data, so step past them too;
    # negating turns the count into an offset from the end.
    return -(df['total_years_pub'] + 3)

In [388]:
# Method to calculate decay_scores for a given paper, returning the scores as a series
def calc_decay_scores(df, start_col, index, decay_constant=35):
    """Apply exponential decay to the yearly impact scores of one paper.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame laid out with one column per paper; the rows from
        ``start_col[index]`` up to (but excluding) the last three rows are
        taken as that paper's scores.
    start_col : indexable
        ``start_col[index]`` is the row position where the paper's scores
        begin (a negative value counts from the end).
    index : int
        Column position of the paper inside ``df``.
    decay_constant : float, default 35
        Time constant of the decay: the score at offset ``t`` is multiplied
        by ``exp(-t / decay_constant)``.

    Returns
    -------
    pandas.Series
        The decayed scores, indexed 0..n-1 in row order.
    """
    # Get the nondecayed scores
    start = start_col[index]
    impact_scores = df.iloc[start:-3, index].reset_index(drop=True)

    # Decay coefficients for offsets 0..n-1, computed in one vectorized call
    offsets = np.arange(len(impact_scores))
    decay_series = pd.Series(np.exp(-offsets / decay_constant))

    # Multiply the decay coefficients by the nondecayed scores
    return decay_series.multiply(impact_scores)
    

In [400]:
# Method to update the dataframe with the impact scores
def update_df_with_decay_scores(df, start_cols):
    """Replace every paper's yearly impact scores with their decayed values.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per paper with a ``title`` column; the remaining column
        labels become row labels after transposing. The label found at a
        paper's start offset must be convertible with ``int`` (a year).
    start_cols : pandas.Series or sequence
        ``start_cols[i]`` is the (typically negative) offset at which the
        scores of the i-th paper start, as produced by ``get_start_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by title, with the decayed scores written over
        the original ones and every value rounded to 3 decimals. The frame
        passed in is not modified.
    """
    # Set the index to paper title and transpose so that every column is one
    # paper and every row is one of the original columns.
    df = df.set_index('title').transpose()

    # The row labels do not change while looping, so read them once
    row_labels = df.index.values.tolist()

    # enumerate gives the column position directly; looking it up again via
    # columns.get_loc would not return an integer for duplicate titles.
    for position, title in enumerate(df.columns):

        # Decayed scores for this paper, indexed 0..n-1
        decay_score = calc_decay_scores(df, start_cols, position)
        if len(decay_score) == 0:
            # Nothing to write back, and the start row would not be a year label
            continue

        # Label the scores with the same start offset that was used to slice
        # them, so values and years cannot drift apart.
        first_year = int(row_labels[start_cols[position]])
        year_labels = [str(first_year + offset) for offset in range(len(decay_score))]

        # One-column frame named after the paper and indexed by year label,
        # which is the alignment DataFrame.update works on.
        decay_frame = pd.DataFrame({title: decay_score.to_numpy()}, index=year_labels)

        # Update the dataframe with the new values
        df.update(decay_frame)

    return df.round(3).T
    

In [401]:
def main():
    """Run the pipeline: load the CSV at ``path_in`` and return its decayed scores."""
    papers = get_data(path_in)
    return update_df_with_decay_scores(papers, get_start_columns(papers))

# Execute the pipeline and show the resulting frame as the cell output
df_decay = main()
df_decay

Unnamed: 0_level_0,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,...,1944,1945,1946,1947,1948,1949,1950,score_sum,total_years_pub,time_weighted_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zur Frage des Einflusses der Milz auf den Eisenstoffwechsel,,,,,,,,,,,...,0.250,0.243,0.236,0.230,0.223,0.217,0.211,7.680,23.0,0.334
Zur Pathogenese des Sogenannten „Röntgenkaters“,,,,,,,,,,,...,0.153,0.148,0.144,0.140,0.136,0.132,0.129,7.770,28.0,0.277
Substituirte Stickstoffbromide und ihre Beziehung zur Bromsubstitution in Aniliden und Anilinen,0.15,0.270,0.263,0.255,0.248,0.241,0.234,0.228,0.221,0.215,...,0.079,0.077,0.075,0.073,0.071,0.069,0.067,14.025,51.0,0.275
Über die Acetylierung von löslicher Stärke,,0.150,0.146,0.142,0.138,0.134,0.130,0.126,0.123,0.119,...,0.113,0.110,0.107,0.104,0.101,0.098,0.095,13.735,50.0,0.275
The action of young's glycotropic factor of the anterior pituitary gland,,,,,,,,,,,...,0.234,0.228,0.221,0.215,0.209,0.203,0.197,3.480,13.0,0.268
Weitere Mittheilungen über glashäutige Neubildungen an der Descemet'schen Membran und auf der Iris und über Veränderungen des Hornhautendothels,0.15,0.146,0.142,0.138,0.248,0.241,0.234,0.228,0.221,0.215,...,0.079,0.077,0.075,0.073,0.071,0.069,0.067,13.643,51.0,0.268
DERMATITIS VENENATA CAUSED BY THE MANZANILLO TREE,,,,,,,,,,,...,0.270,0.263,0.255,0.248,0.241,0.234,0.228,2.093,8.0,0.262
Mind in the Making,,,,,,,,,0.150,0.146,...,0.099,0.097,0.094,0.091,0.089,0.086,0.084,11.168,43.0,0.260
A Comparison of Triploid and Diploid Crossing over for Chromosome II of DROSOPHILA MELANOGASTER.,,,,,,,,,,,...,0.197,0.192,0.186,0.181,0.176,0.171,0.166,4.890,19.0,0.257
Catastrophes of peptic ulcer,,,,,,,,,,,...,0.209,0.203,0.197,0.192,0.186,0.181,0.176,4.335,17.0,0.255


In [402]:
# Default destination of the decayed scores
path_out = '/Users/timholdsworth/code/scaling-science/Data/100_most_impactful_papers_decayed.csv'
def write_to_csv(df, path=None):
    """Write ``df`` to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    path : str or path-like, optional
        Destination file. Defaults to the module-level ``path_out``.

    Notes
    -----
    The index is written whenever it carries a name. The frame returned by
    ``update_df_with_decay_scores`` keeps the paper titles in its ``title``
    index, so dropping the index unconditionally would lose them. Frames with
    an unnamed index are still written without it.
    """
    target = path_out if path is None else path
    keep_index = any(name is not None for name in df.index.names)
    df.to_csv(target, index=keep_index)

In [403]:
# Persist df_decay (the frame assigned from main() in an earlier cell) via write_to_csv
write_to_csv(df_decay)

0    26
1    31
2    54
3    53
4    16
Name: total_years_pub, dtype: int64


NameError: name 'start_cols' is not defined

In [379]:
# Scratch experiment with DataFrame.update.
# NOTE: df0 is not created in this cell; it comes from the "Play with the df.update"
# cell further down, so this only runs after that cell has been executed.
for index, row in df0.iterrows():
    print(index)
    # df00 carries rows 0..2 of column 'A', so each pass overwrites the whole 'A'
    # column of df0 in place; only the values from the last pass survive.
    df00 = pd.DataFrame({'A': [index+8, index+9, index+10]})
    df0.update(df00)
    # Earlier attempts at a per-row update, kept for reference:
    #df00 = df1.squeeze()
    #df0.iloc[index, :].update(df00)
#new_df = pd.DataFrame({'B': [4, 5, 6],'C': [7, 8, 9]})
#df.update(new_df)
# Only the last expression of a cell is displayed, so the bare df0 shows nothing
df0
df0.T

0
1
2


Unnamed: 0,0,1,2
A,10,11,12
B,400,500,600


In [380]:
# Iterates through rows of a dataframe and updates scores by multiplying them by exp decay factor
def exp_decay(df, start_col, decay_constant=25):
    """Decay the yearly scores of every row of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per paper. For each row, the columns from ``start_col[label]``
        up to (but excluding) the last three are taken as its scores, and the
        column label at the start offset must be convertible with ``int``.
    start_col : pandas.Series or mapping
        Looked up with each row's index label; gives the (typically negative)
        column offset at which that row's scores begin.
    decay_constant : float, default 25
        Time constant of the decay: the score at offset ``t`` is multiplied
        by ``exp(-t / decay_constant)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    # Column labels do not change while looping, so read them once
    column_labels = list(df.columns.values)

    for position, index in enumerate(df.index):
        print(index)

        # Get the nondecayed scores; iloc needs the row *position*, while
        # start_col is looked up by the row *label*.
        start = start_col[index]
        impact_scores = df.iloc[position, start:-3].to_numpy(dtype=float)
        if len(impact_scores) == 0:
            # No score columns for this row; the start column is not a year
            continue

        # Multiply the scores by the decay coefficients for offsets 0..n-1
        offsets = np.arange(len(impact_scores))
        decayed_scores = impact_scores * np.exp(-offsets / decay_constant)

        # Column labels (years) the decayed values belong to
        first_year = int(column_labels[start])
        year_labels = [str(first_year + int(offset)) for offset in offsets]

        # DataFrame.update aligns on row labels as well as column labels, so
        # the one-row frame must be labelled with this row's own index. With
        # the default label 0 every paper's values end up in the first row.
        decayed_frame = pd.DataFrame([decayed_scores], index=[index], columns=year_labels)
        df.update(decayed_frame)

    return df

In [384]:
# Take in a csv with a bunch of data that has year scores and apply exponential decay
def main1():
    """Run the row-wise ``exp_decay`` variant on the first five papers.

    Returns
    -------
    pandas.DataFrame
        The five-row frame after ``exp_decay`` has updated it.
    """
    # exp_decay modifies its argument in place, so hand it an independent
    # copy rather than the slice returned by head().
    df = get_data(path_in).head(5).copy()
    start_cols = get_start_columns(df)
    df_decayed = exp_decay(df, start_cols)
    return df_decayed

# Call main1, the function defined in this cell; main() is the separate
# transposed-frame pipeline defined elsewhere in the notebook.
df_decayed = main1()
#df_decayed

0
1
2
3
4


In [None]:
# Scratch cell for experimenting with DataFrame.update:
# df0 has columns 'A' and 'B'; df1 supplies replacement values for 'A' only,
# and update() overwrites the matching cells of df0 in place.
df0 = pd.DataFrame(dict(A=[1, 2, 3], B=[400, 500, 600]))
df1 = pd.DataFrame([4, 5, 6], columns=['A'])
df0.update(other=df1)