In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
def _load_indexed_by_name(csv_path):
    """Read one UTF-8 CSV and use its 'name' column as the row index."""
    return pd.read_csv(csv_path, index_col='name', encoding='UTF-8')


# Every input table is read the same way, so each one goes through the helper.
df_clusters = _load_indexed_by_name('C:\\Users\\Brend\\Downloads\\clusters_top42.csv')
df_pct_collab = _load_indexed_by_name('C:\\Users\\Brend\\Downloads\\pct_collaborative_pubs_top42.csv')
# NOTE(review): df_pct_collab_new is not referenced by any cell visible here.
df_pct_collab_new = _load_indexed_by_name('C:\\Users\\Brend\\Downloads\\pct_collaborative_pubs_top42_v2.csv')
df_impact = _load_indexed_by_name('C:\\Users\\Brend\\Downloads\\impact_top42.csv')
df_author_focus = _load_indexed_by_name('C:\\Users\\Brend\\Downloads\\community_focus_top42.csv')
df_disrupt = _load_indexed_by_name('C:\\Users\\Brend\\Downloads\\disrupt_profs_top42.csv')
df_unique_profs = _load_indexed_by_name('C:\\Users\\Brend\\Downloads\\unique_profs_top42.csv')
df_orgs = _load_indexed_by_name('C:\\Users\\Brend\\Downloads\\orgs_top42.csv')

In [2]:
# Build one wide table by outer-joining every per-author frame on its index.
# Only three columns are taken from df_clusters; the other frames are joined whole.
cluster_cols = ['TotalConns', 'InterClusterRatio', 'IntraClusterRatio']
df_all = df_pct_collab.join(df_clusters[cluster_cols], how='outer')
for extra_frame in (df_impact, df_author_focus, df_disrupt, df_unique_profs, df_orgs):
    df_all = df_all.join(extra_frame, how='outer')

In [3]:
# Row-inclusion thresholds, applied together as a single boolean mask.
MIN_NUM_PUBS = 50
MIN_TOTAL_CONNS = 25
MIN_NUM_DISRUPT = 10
MAX_ORGS = 2000

# A missing value fails its comparison, so rows with NaN in any of these
# four columns are dropped as well.
keep_row = (
    (df_all['num_pubs'] >= MIN_NUM_PUBS)
    & (df_all['TotalConns'] >= MIN_TOTAL_CONNS)
    & (df_all['num_disrupt'] >= MIN_NUM_DISRUPT)
    & (df_all['orgs'] <= MAX_ORGS)
)
df_trimmed = df_all[keep_row]
print("Remaining after trimming: %d authors" % df_trimmed.shape[0])

Remaining after trimming: 53490 authors


In [12]:
# Column-name groups used to pick predictors and responses for the regressions.
linear_metrics = [
    'pct_collab_linauth',
    'pct_collab_linprof',
    'pct_collab_sqrtauth',
    'pct_collab_sqrtprof',
    'pct_collab_unweighted',
    'InterClusterRatio',
    'IntraClusterRatio',
]
log_metrics = [
    'IntraCommunityFocus',
    'InterCommunityFocus',
]
# Linear metrics first, then the community-focus metrics.
all_metrics = [*linear_metrics, *log_metrics]

impact = [
    'avg_PR',
    'max_PR',
    'avg_AR',
    'max_AR',
]
disrupt = [
    'avg_disrupt',
    'max_disrupt',
    'median_disrupt',
    'min_disrupt',
]
unique_coauthor_profs = [
    'unique_coauthor_profs',
    'unique_coauthor_profs_lin_damp',
    'unique_coauthor_profs_sqrt_damp',
    'unique_coauthor_profs_square_damp',
    'unique_coauthor_profs_exp_damp',
]


In [13]:
# Natural log of every column of df_trimmed; result columns are named '<col>_log'.
#
# The frame is built in one step so it holds exactly the transformed columns.
# Filtering afterwards on the substring 'log' would also keep any raw column
# whose own name happens to contain 'log'.
#
# np.log yields -inf for 0 and NaN for negative inputs. Floating-point warnings
# are silenced here, as they were when np.log went through DataFrame.apply.
with np.errstate(all='ignore'):
    df_log = np.log(df_trimmed).add_suffix('_log')

In [14]:
# log(x + 1) of every column of df_trimmed; result columns are named '<col>_log_adj'.
#
# The frame is built in one step so it holds exactly the transformed columns.
# Filtering afterwards on the substring 'log' would also keep any raw column
# whose own name happens to contain 'log'.
#
# Inputs of exactly -1 become -inf and inputs below -1 become NaN.
df_log_adjusted = np.log(df_trimmed + 1).add_suffix('_log_adj')

In [15]:
# Standardise every column of df_trimmed independently with its own StandardScaler.
# The result for column <col> is stored under '<col>_zscore'.
zscore_by_name = {}
for col in df_trimmed.columns:
    scaled = StandardScaler().fit_transform(df_trimmed[[col]])
    # fit_transform returns a single-column 2-D result; flatten it to 1-D.
    zscore_by_name[col + '_zscore'] = np.asarray(scaled).ravel()

# df_scaled keeps the raw columns followed by the z-scored ones.
df_scaled = df_trimmed.assign(**zscore_by_name)
# Select by the exact generated names. A substring test on 'zscore' would also
# pick up any raw column whose own name contains 'zscore'.
df_zscores = df_scaled[list(zscore_by_name)]

In [16]:
# Which transform suffix each metric group is regressed under, in assignment order.
suffix_for_group = [
    ('zscore', linear_metrics),
    ('log_adj', log_metrics),
    ('log', impact),
    ('log_adj', disrupt),
    ('log', unique_coauthor_profs),
]
# metric name -> transform suffix
processing = {
    metric: suffix
    for suffix, group in suffix_for_group
    for metric in group
}
# transform suffix -> frame holding the '<metric>_<suffix>' columns
df_table = {
    'zscore': df_zscores,
    'log': df_log,
    'log_adj': df_log_adjusted,
}

In [17]:
def regression(x_cols, y_cols):
    """Fit a one-predictor linear regression for every (x, y) metric pair.

    Each metric name is looked up in the module-level ``processing`` mapping to
    get its transform suffix. The column ``<metric>_<suffix>`` is then read from
    the matching frame in the module-level ``df_table``. Rows where either value
    is NaN or +/-inf are excluded from that pair's fit.

    Parameters
    ----------
    x_cols : iterable of str
        Predictor metric names (keys of ``processing``).
    y_cols : iterable of str
        Response metric names (keys of ``processing``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(r2, coef)``: the R^2 score and the fitted slope of every pair, both
        rounded to 4 decimals. Rows are the entries of ``y_cols``; columns are
        the transformed predictor names ``<x>_<suffix>``.
    """
    # y_cols is walked once per predictor, so materialise one-shot iterables.
    y_cols = list(y_cols)
    r2_by_x = dict()
    coef_by_x = dict()
    for x_col in x_cols:
        x_name = x_col + '_' + processing[x_col]
        # The predictor column is the same for every response: fetch it once.
        x_values = df_table[processing[x_col]][x_name].rename('x')
        score_list = []
        coef_list = []
        for y_col in y_cols:
            y_name = y_col + '_' + processing[y_col]
            y_values = df_table[processing[y_col]][y_name].rename('y')
            if x_values.index.equals(y_values.index):
                # Same rows in the same order: pair them positionally. An index
                # join would multiply rows whenever an index label repeats.
                paired = pd.DataFrame({'x': x_values.to_numpy(),
                                       'y': y_values.to_numpy()})
            else:
                # Different row sets: keep only the labels present in both.
                paired = x_values.to_frame().join(y_values.to_frame(), how='inner')
            # Non-finite values cannot be fitted; drop rows where either side has one.
            paired = paired.replace([np.inf, -np.inf], np.nan).dropna()
            X = paired[['x']]
            y = paired[['y']]
            reg = LinearRegression().fit(X, y)
            score_list.append(round(reg.score(X, y), 4))
            coef_list.append(round(reg.coef_[0][0], 4))
        r2_by_x[x_name] = pd.Series(score_list, index=y_cols)
        coef_by_x[x_name] = pd.Series(coef_list, index=y_cols)
    return pd.DataFrame(r2_by_x), pd.DataFrame(coef_by_x)


In [10]:
# Predictors: z-scored linear_metrics. Responses: log-transformed impact metrics.
r2, coef = regression(linear_metrics, impact)

In [11]:
# R^2 of each fit: one row per response metric, one column per predictor.
r2

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_PR,0.0976,0.0696,0.1281,0.0988,0.1257,0.0003,0.0003
max_PR,0.043,0.0084,0.0407,0.0198,0.0525,0.0018,0.0018
avg_AR,0.0748,0.0767,0.1182,0.1114,0.1436,0.0008,0.0008
max_AR,0.0328,0.0111,0.0383,0.026,0.0664,0.0007,0.0007


In [12]:
# Fitted slope of each fit: one row per response metric, one column per predictor.
coef

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_PR,0.1474,0.1244,0.1688,0.1482,0.1672,-0.0076,0.0076
max_PR,0.2303,0.1018,0.224,0.1562,0.2545,0.0467,-0.0467
avg_AR,0.0986,0.0998,0.1239,0.1203,0.1365,-0.0104,0.0104
max_AR,0.184,0.1074,0.199,0.1638,0.2619,0.0264,-0.0264


In [22]:
# Predictors: z-scored linear_metrics. Responses: log(x + 1) disrupt metrics.
r2, coef = regression(linear_metrics, disrupt)

In [23]:
# R^2 of each fit: one row per response metric, one column per predictor.
r2

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_disrupt,0.0741,0.0063,0.0372,0.0011,0.0015,0.0006,0.0006
max_disrupt,0.0594,0.0086,0.0374,0.0042,0.0002,0.0013,0.0013
median_disrupt,0.0318,0.0019,0.0133,0.0,0.0031,0.0006,0.0006
min_disrupt,0.0002,0.0001,0.0001,0.0,0.0001,0.0,0.0


In [24]:
# Fitted slope of each fit: one row per response metric, one column per predictor.
coef

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_disrupt,0.0053,0.0015,0.0038,0.0007,-0.0008,0.0005,-0.0005
max_disrupt,0.0378,0.0144,0.03,0.0101,0.0023,0.0057,-0.0057
median_disrupt,0.0009,0.0002,0.0006,0.0,-0.0003,0.0001,-0.0001
min_disrupt,-0.0008,0.0006,-0.0007,0.0,-0.0006,0.0002,-0.0002


In [25]:
# Predictors: log(x + 1) disrupt metrics. Responses: log-transformed impact metrics.
r2, coef = regression(disrupt, impact)

In [26]:
# R^2 of each fit: one row per response metric, one column per predictor.
r2

Unnamed: 0,avg_disrupt_log_adj,max_disrupt_log_adj,median_disrupt_log_adj,min_disrupt_log_adj
avg_PR,0.0193,0.0369,0.0026,0.0036
max_PR,0.0108,0.0378,0.0012,0.0081
avg_AR,0.0025,0.0163,0.0003,0.0023
max_AR,0.0012,0.0207,0.0003,0.0065


In [27]:
# Fitted slope of each fit: one row per response metric, one column per predictor.
coef

Unnamed: 0,avg_disrupt_log_adj,max_disrupt_log_adj,median_disrupt_log_adj,min_disrupt_log_adj
avg_PR,3.8936,0.6782,5.2274,-0.5262
max_PR,5.6624,1.3305,6.8865,-1.5323
avg_AR,1.1136,0.3606,-1.5299,-0.3391
max_AR,1.7937,0.9203,-3.2556,-1.2887


In [18]:
# Predictors: log-transformed unique_coauthor_profs metrics. Responses: log-transformed impact metrics.
r2, coef = regression(unique_coauthor_profs, impact)

In [20]:
# R^2 of each fit: one row per response metric, one column per predictor.
r2

Unnamed: 0,unique_coauthor_profs_log,unique_coauthor_profs_lin_damp_log,unique_coauthor_profs_sqrt_damp_log,unique_coauthor_profs_square_damp_log,unique_coauthor_profs_exp_damp_log
avg_PR,0.0042,0.003,0.0038,0.0021,0.0001
max_PR,0.0747,0.0646,0.0739,0.0386,0.0
avg_AR,0.0169,0.0142,0.0168,0.0074,0.0
max_AR,0.1115,0.0969,0.1116,0.0529,0.0001


In [21]:
# Fitted slope of each fit: one row per response metric, one column per predictor.
coef

Unnamed: 0,unique_coauthor_profs_log,unique_coauthor_profs_lin_damp_log,unique_coauthor_profs_sqrt_damp_log,unique_coauthor_profs_square_damp_log,unique_coauthor_profs_exp_damp_log
avg_PR,0.0347,0.0437,0.0418,0.0381,0.0005
max_PR,0.2833,0.3946,0.3599,0.3158,0.0
avg_AR,0.0556,0.0762,0.0708,0.057,0.0
max_AR,0.3237,0.4516,0.4134,0.3455,-0.001


In [22]:
# Predictors: log-transformed unique_coauthor_profs metrics. Responses: log(x + 1) disrupt metrics.
r2, coef = regression(unique_coauthor_profs, disrupt)

In [23]:
# R^2 of each fit: one row per response metric, one column per predictor.
r2

Unnamed: 0,unique_coauthor_profs_log,unique_coauthor_profs_lin_damp_log,unique_coauthor_profs_sqrt_damp_log,unique_coauthor_profs_square_damp_log,unique_coauthor_profs_exp_damp_log
avg_disrupt,0.0366,0.0361,0.0404,0.0125,0.0007
max_disrupt,0.0016,0.0,0.0005,0.0052,0.0007
median_disrupt,0.0219,0.0182,0.0226,0.0041,0.0036
min_disrupt,0.0029,0.0067,0.0047,0.0077,0.0004


In [24]:
# Fitted slope of each fit: one row per response metric, one column per predictor.
coef

Unnamed: 0,unique_coauthor_profs_log,unique_coauthor_profs_lin_damp_log,unique_coauthor_profs_sqrt_damp_log,unique_coauthor_profs_square_damp_log,unique_coauthor_profs_exp_damp_log
avg_disrupt,-0.0036,-0.0054,-0.0049,-0.0033,0.0001
max_disrupt,-0.0061,0.0014,-0.0044,0.017,0.0005
median_disrupt,-0.0008,-0.001,-0.001,-0.0005,0.0
min_disrupt,-0.0032,-0.0074,-0.0053,-0.0082,-0.0001
