In [None]:
import pandas as pd
import numpy as np

In [None]:
# Name of the pivot-table CSV that the pipeline reads by default.
file_name = 'patents_pivottable_1900-2020-5.csv'
# Directory prefix for input and output files. It is joined to file names by
# plain string concatenation, so the trailing slash must be kept.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
root = '/Users/timholdsworth/code/scaling-science/data/result/'

def get_data(file_name):
    """Read a CSV from the data directory into a DataFrame.

    Parameters
    ----------
    file_name: string
        File name appended to the module-level ``root`` prefix.

    Returns
    -------
    pandas.DataFrame
        The parsed file, decoded as latin1.
    """
    return pd.read_csv(root + file_name, encoding='latin1')

In [None]:
# Add extra columns that aggregate information from the dataframe
def add_aggregate_cols(df):
    """Append per-row aggregate columns computed from the numeric score columns.

    The frame is modified in place and the same object is returned. Three
    columns are appended, in this order:

    score_sum
        Sum of the row's numeric values, ignoring NaN.
    total_years_pub
        Number of numeric columns that hold a value (are not NaN) in the row.
    time_weighted_score
        ``score_sum / total_years_pub``; NaN when the row has no values.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame whose numeric columns are the per-year scores. Non-numeric
        columns (e.g. a title column) are ignored by the aggregates.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns added.
    """
    # Restrict to numeric columns up front: summing a row that also contains a
    # string column raises TypeError on pandas >= 2.0, and counting values on
    # the numeric columns directly avoids assuming how many label columns exist.
    year_scores = df.select_dtypes(include='number')

    df['score_sum'] = year_scores.sum(axis=1, skipna=True)
    df['total_years_pub'] = year_scores.notna().sum(axis=1)
    df['time_weighted_score'] = df['score_sum'] / df['total_years_pub']
    return df

#df = get_data(file_name)
#df1 = add_aggregate_cols(df)
#df1

In [None]:
# Sort by most impactful by average score over time, return num_results, round values
def clean_for_viz(df, num_results=None):
    """Return the highest-scoring rows, ready for display.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame containing a ``time_weighted_score`` column.

    num_results: int, optional
        Number of rows to keep. ``None`` means 100.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``time_weighted_score`` in descending order, with
        a fresh 0-based index, truncated to ``num_results`` rows and with
        numeric values rounded to 3 decimals. The input frame is not modified.
    """
    if num_results is None:
        num_results = 100

    return (
        df.sort_values(by=['time_weighted_score'], ascending=False)
        .reset_index(drop=True)
        .head(num_results)
        .round(3)
    )

In [None]:
# Load a file, add the aggregate columns and keep the top-ranked rows
def data_prep(file_name, num_results):
    """Load ``file_name`` and reduce it to its ``num_results`` best rows.

    Chains get_data -> add_aggregate_cols -> clean_for_viz. No decay is
    applied and nothing is written to disk here.

    Parameters
    ----------
    file_name: string
        Name of the CSV file, passed to get_data.

    num_results: int
        Number of rows to keep, passed to clean_for_viz.

    Returns
    -------
    pandas.DataFrame
        The frame returned by clean_for_viz.
    """
    return clean_for_viz(add_aggregate_cols(get_data(file_name)), num_results)

In [None]:
# Take in undecayed scores and return a dataframe with the decayed scores
def decay(df, decay_rate):
    """Apply an exponential decay weight to each score column.

    The last three columns of ``df`` are dropped; they are assumed to be the
    aggregate columns appended by add_aggregate_cols. Of the remaining columns,
    the t-th numeric column (t = 0, 1, 2, ...) is multiplied by
    ``exp(-t / decay_rate)``. Non-numeric columns (e.g. a title column) are
    passed through unchanged and do not advance t. NaN scores stay NaN.

    Parameters
    ----------
    df: pandas.DataFrame
        Undecayed scores followed by three trailing aggregate columns.

    decay_rate: int or float
        Decay time constant; smaller values make the weights fall off quicker.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and the same column order as ``df``
        minus its last three columns. ``df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``decay_rate`` is 0, which would divide by zero.
    """
    if decay_rate == 0:
        raise ValueError('decay_rate must be non-zero')

    # Everything except the three trailing aggregate columns.
    scores = df.iloc[:, :-3]
    numeric_cols = scores.select_dtypes(include='number').columns

    # One weight per numeric column: 1 for the first, then decaying.
    weights = np.exp(-np.arange(len(numeric_cols)) / decay_rate)

    decayed = scores.copy()
    if len(numeric_cols) > 0:
        # Element-wise: the 1-D weight vector is broadcast across every row.
        # (An ndarray times np.matrix would be a matrix product instead.)
        decayed[numeric_cols] = scores[numeric_cols] * weights

    return decayed
# Run the pipeline on the default file: top 400 rows, decay rate 30.
# NOTE(review): main is defined in a later cell, so on a fresh kernel this line
# raises NameError unless that cell has already been executed.
main(file_name, 400, 30)

In [None]:
def write_to_csv(df, filename=None, num_results=None, decay_rate=None):
    """Write ``df`` (rounded to 3 decimals) to a CSV under ``root``.

    The output name is the input name without its last four characters (the
    '.csv' extension), followed by
    '_<num_results>_normalized_decayed_at_<decay_rate>.csv'.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame to write. Its index is written under the label 'title'.

    filename: string, optional
        Name of the source CSV the output name is derived from. ``None`` means
        the module-level ``file_name``.

    num_results: int, optional
        Only used in the output name. ``None`` means 100.

    decay_rate: int, optional
        Only used in the output name. ``None`` means 25.
    """
    if filename is None:
        filename = file_name
    if num_results is None:
        num_results = 100
    if decay_rate is None:
        decay_rate = 25

    # Derive the name from the argument; using the module-level file_name here
    # would silently ignore whatever the caller passed in.
    path_out = root + filename[:-4] + '_' + str(num_results) + '_normalized_decayed_at_' + str(decay_rate) + '.csv'

    df.round(3).to_csv(path_out, index_label='title')

In [None]:
def main(file_name=None, num_results=None, decay_rate=None):
    """
    Take a CSV file worth of data, get the top results, apply exponential decay,
    and write the normalized, decayed scores back to CSV

    The file should:
        Be a CSV
        Have a 1st column named 'title'
        Have subsequent columns as single year values - i.e. 1900

    Files are read from, and written to, the module-level ``root`` directory.

    Parameters
    ----------
    file_name: string
        The name of the CSV file. ``None`` means the module-level ``file_name``.

    num_results: int
        The number of top results you would like to return. ``None`` means 100.

    decay_rate: int
        The rate at which you would like scores to decay. ``None`` means 25.
        **Note - smaller decay_rate values make scores decay *quicker*


    Returns
    -------
    The decayed (not normalized) scores as returned by decay().
        The normalized version of these scores is what gets written to CSV.

    """
    if file_name is None:
        # The parameter shadows the module-level default of the same name, so
        # the notebook-level value has to be looked up through globals().
        file_name = globals()['file_name']
    if num_results is None:
        num_results = 100
    if decay_rate is None:
        decay_rate = 25

    df = get_data(file_name)
    df1 = add_aggregate_cols(df)
    df2 = clean_for_viz(df1, num_results)
    # Decay the top-results frame, not the full frame, so num_results takes effect.
    df_decay = decay(df2, decay_rate)
    # NOTE(review): scale_linear_bycolumn is not defined in the visible cells;
    # it must be defined elsewhere before main is called.
    df_norm = scale_linear_bycolumn(df_decay, high=100.0, low=0.0)
    write_to_csv(df_norm, file_name, num_results, decay_rate)
    print('Finished writing results')
    return df_decay

In [None]:
#%%timeit


In [None]:
def write_iterated_to_csv(df, num_results=None, decay_rate=None):
    """Write ``df`` (rounded to 3 decimals) to a '_results_decayed_at_' CSV.

    The output name is derived from the module-level ``file_name`` (minus its
    last four characters) followed by
    '_<num_results>_results_decayed_at_<decay_rate>.csv'.

    NOTE(review): the output directory is a hard-coded absolute path and is
    not the module-level ``root``; adjust before running on another machine.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame to write. Its index is written under the label 'title'.

    num_results: int, optional
        Only used in the output name. ``None`` means 400.

    decay_rate: int, optional
        Only used in the output name. ``None`` means 25.
    """
    if num_results is None:
        num_results = 400
    if decay_rate is None:
        decay_rate = 25

    path_out = '/Users/timholdsworth/code/scaling-science/Data/' + file_name[:-4] + '_' + str(num_results) + '_results_decayed_at_' + str(decay_rate) + '.csv'
    df.round(3).to_csv(path_out, index_label='title')
    

In [None]:
# Iterate through decay values and write them all to one csv file
def iterate_through_decay_vals():
    """Unfinished stub for sweeping over several decay rates.

    NOTE(review): `year` and `decay_value` are not defined in this function or
    in any visible cell, so calling it raises NameError unless they exist as
    globals. The sweep loop is still commented out, and its call passes four
    positional arguments while main accepts three.
    """
    # Label built by concatenating year and decay value; not used afterwards.
    col_name = str(year) + str(decay_value)
    #for decay_rate in range(0, 100, 5):
        #main(file_name, 100, 5, decay_rate)