In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
# Location of the article-level CSV. Set the ARTICLE_LEVEL_CSV environment
# variable to point at your own copy; when it is unset, the path originally
# hard-coded in this notebook is used, so existing behavior is unchanged.
ARTICLE_LEVEL_CSV = os.environ.get(
    'ARTICLE_LEVEL_CSV',
    'C:\\Users\\Brend\\Downloads\\article_level_top42.csv',
)
# Rows are indexed by the 'q.id' column of the file.
df = pd.read_csv(ARTICLE_LEVEL_CSV, index_col='q.id', encoding='UTF-8')

In [2]:
# Keep only the rows whose 'num_profs' is below 1000 and report the row
# count before and after the cut.
rows_before = len(df)
print("Before trimming: %d authors" % rows_before)
below_cutoff = df['num_profs'].lt(1000)
df_trimmed = df.loc[below_cutoff]
print("Remaining after trimming: %d authors" % len(df_trimmed))

Before trimming: 2037922 authors
Remaining after trimming: 1940670 authors


In [3]:
# Natural-log transform: for every column of df_trimmed add a '<column>_log'
# companion, then keep only the columns whose name contains 'log'.
# np.log yields -inf for zeros and NaN for negative inputs; nothing is
# filtered out in this cell.
df_log = df_trimmed.copy()
for source_name in df_trimmed.columns:
    single_column = df_log[[source_name]]
    df_log[source_name + '_log'] = single_column.apply(np.log, axis=0)
df_log = df_log.loc[:, [name for name in df_log.columns if 'log' in name]]

In [4]:
# log(1 + x) transform: for every column of df_trimmed add a
# '<column>_log_adj' companion. np.log1p is used instead of np.log(x + 1)
# because it keeps precision when x is close to zero; inputs of -1 still give
# -inf and inputs below -1 still give NaN.
df_log_adjusted = df_trimmed.copy()
for col in df_trimmed.columns:
    df_log_adjusted[col + '_log_adj'] = np.log1p(df_log_adjusted[col])
# Keep only columns whose name contains 'log': the '_log_adj' columns, plus
# any source column that already had 'log' in its name.
df_log_adjusted = df_log_adjusted[[col for col in df_log_adjusted.columns if 'log' in col]]

In [5]:
from sklearn.preprocessing import StandardScaler
# Z-score transform: standardise every column of df_trimmed independently
# (a fresh StandardScaler per column) into a '<column>_zscore' companion.
# df_scaled keeps both raw and standardised columns; df_zscores holds only
# the columns whose name contains 'zscore'.
df_scaled = df_trimmed.copy()
for source_name in df_trimmed.columns:
    scaler = StandardScaler()
    df_scaled[source_name + '_zscore'] = scaler.fit_transform(df_scaled[[source_name]])
df_zscores = df_scaled.loc[:, [name for name in df_scaled.columns if 'zscore' in name]]

In [6]:
# Raw column names grouped by role. Each list is passed to regression() as
# either the predictor columns (x_cols) or the response columns (y_cols).
disrupt = [
    'disruption',
]
impact = [
    'articlerank',
    'pagerank',
]
num_profs = [
    'num_profs',
]

In [7]:
# Which transform each raw column uses when it enters a regression.
processing = {
    'disruption': 'log_adj',
    'num_profs': 'zscore',
    'articlerank': 'log',
    'pagerank': 'log',
}
# Transform name -> frame holding the columns named '<column>_<transform>'.
df_table = dict(zscore=df_zscores, log=df_log, log_adj=df_log_adjusted)

In [8]:
def _finite_column(frame, name):
    """Return column `name` of `frame` as a one-column DataFrame without +/-inf or NaN rows."""
    column = frame[[name]].replace([np.inf, -np.inf], np.nan)
    return column.dropna(subset=[name])


def regression(x_cols, y_cols):
    """Fit one single-predictor linear regression per (x_col, y_col) pair.

    Relies on two module-level objects: `processing` (raw column name ->
    transform name) and `df_table` (transform name -> DataFrame holding
    columns named '<column>_<transform>').

    For each pair, the transformed x and y columns are stripped of infinite
    and missing values, inner-joined on the index, and fitted with
    LinearRegression.

    Parameters
    ----------
    x_cols : iterable of str
        Raw names of the predictor columns.
    y_cols : iterable of str
        Raw names of the response columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        R^2 scores and slope coefficients, both rounded to 4 decimals.
        Rows are indexed by raw y column name; columns are named
        '<x_col>_<transform>'.

    Raises
    ------
    ValueError
        If a pair has no rows left after cleaning and joining.
    """
    r2_by_x = dict()
    coef_by_x = dict()
    for x_col in x_cols:
        x_name = x_col + '_' + processing[x_col]
        # The predictor does not depend on y_col, so clean it once per x_col
        # rather than once per (x_col, y_col) pair.
        x_clean = _finite_column(df_table[processing[x_col]], x_name)
        scores = []
        coefs = []
        targets = []
        for y_col in y_cols:
            y_name = y_col + '_' + processing[y_col]
            y_clean = _finite_column(df_table[processing[y_col]], y_name)
            # Inner join keeps only index values with a usable x and y.
            paired = x_clean.join(y_clean, how='inner')
            if paired.empty:
                raise ValueError(
                    "No rows with finite values in both %r and %r" % (x_name, y_name)
                )
            X = paired[[x_name]]
            y = paired[[y_name]]
            reg = LinearRegression().fit(X, y)
            scores.append(round(reg.score(X, y), 4))
            # y is 2-D, so coef_ has shape (1, 1); take the single slope.
            coefs.append(round(reg.coef_[0][0], 4))
            targets.append(y_col)
        r2_by_x[x_name] = pd.Series(scores, index=targets)
        coef_by_x[x_name] = pd.Series(coefs, index=targets)
    return pd.DataFrame(r2_by_x), pd.DataFrame(coef_by_x)

In [9]:
# Regress each impact column (y) on the disruption column (x), each under the
# transform chosen for it in `processing`.
r2, coef = regression(disrupt, impact)

In [10]:
# R^2 of each impact column regressed on disruption.
r2

Unnamed: 0,disruption_log_adj
articlerank,0.0018
pagerank,0.0092


In [11]:
# Slope of each impact column regressed on disruption.
coef

Unnamed: 0,disruption_log_adj
articlerank,0.4935
pagerank,1.3439


In [12]:
# Regress each impact column (y) on num_profs (x).
# NOTE: this overwrites the r2 / coef results of the previous regression.
r2, coef = regression(num_profs, impact)

In [13]:
# R^2 of each impact column regressed on num_profs.
r2

Unnamed: 0,num_profs_zscore
articlerank,0.0209
pagerank,0.0129


In [14]:
# Slope of each impact column regressed on num_profs.
coef

Unnamed: 0,num_profs_zscore
articlerank,0.1133
pagerank,0.1122


In [15]:
# Regress the disruption column (y) on num_profs (x).
# NOTE: this overwrites the r2 / coef results of the previous regression.
r2, coef = regression(num_profs, disrupt)

In [16]:
# R^2 of disruption regressed on num_profs.
r2

Unnamed: 0,num_profs_zscore
disruption,0.0045


In [17]:
# Slope of disruption regressed on num_profs.
coef

Unnamed: 0,num_profs_zscore
disruption,-0.0039
