In [52]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
# Every input CSV is read from this one directory; edit DATA_DIR to run the
# notebook on another machine instead of touching each read below.
DATA_DIR = Path('C:\\Users\\Brend\\Downloads')


def _read_indexed_csv(filename):
    """Read `filename` from DATA_DIR as UTF-8, using its 'name' column as the index."""
    return pd.read_csv(DATA_DIR / filename, index_col = 'name', encoding = 'UTF-8')


df_clusters = _read_indexed_csv('clusters_top42.csv')
df_pct_collab = _read_indexed_csv('pct_collaborative_pubs_top42.csv')
# Second version of the collaboration percentages, read from the *_v2 file.
df_pct_collab_new = _read_indexed_csv('pct_collaborative_pubs_top42_v2.csv')
df_impact = _read_indexed_csv('impact_top42.csv')
df_author_focus = _read_indexed_csv('community_focus_top42.csv')
df_disrupt = _read_indexed_csv('disrupt_top42.csv')

In [53]:
# Combine all per-name tables into one wide frame. Each step is an outer join
# on the index, so an index value present in any of the tables keeps a row.
# Only three columns are taken from the cluster table.
df_all = (
    df_pct_collab
    .join(df_clusters[['TotalConns', 'InterClusterRatio', 'IntraClusterRatio']], how = 'outer')
    .join(df_impact, how = 'outer')
    .join(df_author_focus, how = 'outer')
    .join(df_disrupt, how = 'outer')
)

In [54]:
# Keep only rows with num_pubs >= 50 and TotalConns >= 25. Rows where either
# value is NaN fail the comparison and are dropped as well.
df_trimmed = df_all[(df_all['num_pubs'] >= 50) & (df_all['TotalConns'] >= 25)]
print("Remaining after trimming: %d authors" % len(df_trimmed))

Remaining after trimming: 368618 authors


In [71]:
# Metric names, grouped by how they are handled in the regressions.
# These are raw column names; the transform suffix is appended later.
linear_metrics = [
    'pct_collab_linauth',
    'pct_collab_linprof',
    'pct_collab_sqrtauth',
    'pct_collab_sqrtprof',
    'pct_collab_unweighted',
    'InterClusterRatio',
    'IntraClusterRatio',
]
log_metrics = [
    'IntraCommunityFocus',
    'InterCommunityFocus',
]
# Linear metrics first, then log metrics.
all_metrics = [*linear_metrics, *log_metrics]
impact = [
    'avg_PR',
    'max_PR',
    'avg_AR',
    'max_AR',
]
disrupt = [
    'avg_disrupt',
    'max_disrupt',
    'median_disrupt',
    'min_disrupt',
]


In [72]:
# Natural log of every column of df_trimmed; output column '<col>_log' holds
# log(<col>). The frame is built in one vectorised call and contains only the
# transformed columns by construction, so no name-based filtering is needed
# (a raw column whose name happens to contain "log" can no longer slip in).
# Zeros map to -inf and negative values to NaN; regression() discards
# non-finite rows before fitting.
df_log = np.log(df_trimmed).add_suffix('_log')

In [73]:
# log(1 + x) of every column of df_trimmed; output column '<col>_log_adj'
# holds the transformed values. np.log1p computes log(1 + x) directly and is
# more accurate than np.log(x + 1) when x is close to zero.
# The frame contains only the transformed columns by construction.
# x == -1 maps to -inf and x < -1 to NaN; regression() discards non-finite
# rows before fitting.
df_log_adjusted = np.log1p(df_trimmed).add_suffix('_log_adj')

  after removing the cwd from sys.path.


In [74]:
# Standardise each column of df_trimmed independently. df_scaled keeps the raw
# columns plus one '<col>_zscore' column each; df_zscores is the subset of
# columns whose name contains 'zscore'.
df_scaled = df_trimmed.copy()
for raw_name in df_trimmed.columns:
    # A fresh scaler per column, fitted on that single column only.
    df_scaled[raw_name + '_zscore'] = StandardScaler().fit_transform(df_scaled[[raw_name]])
zscore_names = [name for name in df_scaled.columns if 'zscore' in name]
df_zscores = df_scaled[zscore_names]

In [75]:
# Map each metric name to the transform used for it in the regressions.
# The value doubles as the column suffix and as the key into df_table.
processing = {}
processing.update(dict.fromkeys(linear_metrics, 'zscore'))
processing.update(dict.fromkeys(log_metrics, 'log_adj'))
processing.update(dict.fromkeys(impact, 'log'))
processing.update(dict.fromkeys(disrupt, 'log_adj'))

# Transform name -> frame holding the columns transformed that way.
df_table = dict(zscore=df_zscores, log=df_log, log_adj=df_log_adjusted)

In [76]:
def regression(x_cols, y_cols):
    """Fit a one-predictor linear regression for every (x, y) metric pair.

    For each metric name the transform is looked up in the module-level
    ``processing`` dict, and the column ``<metric>_<transform>`` is read from
    the matching frame in the module-level ``df_table``. Rows whose value is
    NaN or +/-inf are removed from each column, and the two columns are then
    inner-joined on the index so only rows valid in both are fitted.

    Parameters
    ----------
    x_cols : iterable of str
        Predictor metric names (without transform suffix); keys of ``processing``.
    y_cols : iterable of str
        Response metric names (without transform suffix); keys of ``processing``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(scores, coefs)``. Both have one row per response metric name and
        one column per transformed predictor column name. ``scores`` holds
        ``LinearRegression.score`` on the fitted rows and ``coefs`` holds the
        fitted slope, each rounded to 4 decimals.
    """
    # Cleaned columns are cached per call: the predictor column is the same
    # for every response, and each response is reused for every predictor.
    cleaned = {}

    def _finite_column(metric):
        """Return (transformed column name, one-column frame without NaN/inf rows)."""
        if metric not in cleaned:
            transform = processing[metric]
            name = metric + '_' + transform
            frame = df_table[transform][[name]].replace([np.inf, -np.inf], np.nan)
            cleaned[metric] = (name, frame.dropna(subset=[name]))
        return cleaned[metric]

    d = dict()
    c = dict()
    for x_col in x_cols:
        x_name, x_frame = _finite_column(x_col)
        score_list = []
        index_list = []
        coef_list = []
        for y_col in y_cols:
            y_name, y_frame = _finite_column(y_col)
            # Inner join keeps only index values that survived cleaning in both columns.
            df_reg = x_frame.join(y_frame, how = 'inner')
            X = df_reg[[x_name]]
            y = df_reg[[y_name]]
            reg = LinearRegression().fit(X, y)
            score_list.append(round(reg.score(X, y), 4))
            # y is 2-D (n, 1), so coef_ is 2-D; [0][0] is the single slope.
            coef_list.append(round(reg.coef_[0][0], 4))
            index_list.append(y_col)
        d[x_name] = pd.Series(score_list, index=index_list)
        c[x_name] = pd.Series(coef_list, index=index_list)
    return pd.DataFrame(d), pd.DataFrame(c)


In [61]:
# Regress each impact measure (log-transformed) on each linear metric (z-scored).
r2, coef = regression(linear_metrics, impact)
# Display the score table: rows are impact measures, columns are predictors.
r2

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_PR,0.0976,0.0696,0.1281,0.0988,0.1257,0.0003,0.0003
max_PR,0.043,0.0084,0.0407,0.0198,0.0525,0.0018,0.0018
avg_AR,0.0748,0.0767,0.1182,0.1114,0.1436,0.0008,0.0008
max_AR,0.0328,0.0111,0.0383,0.026,0.0664,0.0007,0.0007


In [62]:
# Display the slope table returned together with r2 by the preceding regression call.
coef

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_PR,0.1474,0.1244,0.1688,0.1482,0.1672,-0.0076,0.0076
max_PR,0.2303,0.1018,0.224,0.1562,0.2545,0.0467,-0.0467
avg_AR,0.0986,0.0998,0.1239,0.1203,0.1365,-0.0104,0.0104
max_AR,0.184,0.1074,0.199,0.1638,0.2619,0.0264,-0.0264


In [63]:
# Regress each impact measure (log-transformed) on each community-focus metric (log(1+x)).
r2, coef = regression(log_metrics, impact)
# Display the score table: rows are impact measures, columns are predictors.
r2

Unnamed: 0,IntraCommunityFocus_log_adj,InterCommunityFocus_log_adj
avg_PR,0.0105,0.0007
max_PR,0.0075,0.0014
avg_AR,0.0135,0.0004
max_AR,0.0095,0.0011


In [64]:
# Display the slope table returned together with r2 by the preceding regression call.
coef

Unnamed: 0,IntraCommunityFocus_log_adj,InterCommunityFocus_log_adj
avg_PR,0.1194,0.0391
max_PR,0.2393,0.1336
avg_AR,0.1037,0.0234
max_AR,0.2465,0.1083


In [65]:
# Regress each impact measure (log-transformed) on each disruption statistic (log(1+x)).
r2, coef = regression(disrupt, impact)
# Display the score table: rows are impact measures, columns are predictors.
r2

Unnamed: 0,avg_disrupt_log_adj,max_disrupt_log_adj,median_disrupt_log_adj,min_disrupt_log_adj
avg_PR,0.0111,0.0452,0.0039,0.001
max_PR,0.0006,0.0248,0.0001,0.0198
avg_AR,0.015,0.0596,0.0058,0.0017
max_AR,0.001,0.0329,0.0,0.0262


In [66]:
# Display the slope table returned together with r2 by the preceding regression call.
coef

Unnamed: 0,avg_disrupt_log_adj,max_disrupt_log_adj,median_disrupt_log_adj,min_disrupt_log_adj
avg_PR,0.1696,0.2925,0.0987,-0.0253
max_PR,0.0947,0.5013,-0.0316,-0.2645
avg_AR,0.1516,0.2575,0.0917,-0.0258
max_AR,0.11,0.529,-0.0171,-0.2788


In [67]:
# Regress each disruption statistic (log(1+x)) on each linear metric (z-scored).
r2, coef = regression(linear_metrics, disrupt)
# Display the score table: rows are disruption statistics, columns are predictors.
r2

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_disrupt,0.0093,0.0185,0.0164,0.0135,0.0,0.0119,0.0119
max_disrupt,0.0109,0.0304,0.0269,0.0342,0.0167,0.0029,0.0029
median_disrupt,0.0054,0.0117,0.0096,0.0075,0.0004,0.0098,0.0098
min_disrupt,0.0138,0.0098,0.0109,0.0015,0.0211,0.023,0.023


In [68]:
# Display the slope table returned together with r2 by the preceding regression call.
coef

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_disrupt,0.0286,0.0409,0.0382,0.035,0.0009,0.0331,-0.0331
max_disrupt,0.0362,0.0615,0.0573,0.0653,0.0456,0.019,-0.019
median_disrupt,0.0223,0.0333,0.03,0.0268,-0.0061,0.0308,-0.0308
min_disrupt,0.0684,0.0585,0.0612,0.023,-0.086,0.0901,-0.0901


In [37]:
# Regress each disruption statistic (log(1+x)) on each community-focus metric (log(1+x)).
# NOTE(review): this cell's execution count (37) is out of sequence with its
# neighbours (68, 69), so its saved output comes from an earlier run.
r2, coef = regression(log_metrics, disrupt)
# Display the score table: rows are disruption statistics, columns are predictors.
r2

Unnamed: 0,IntraCommunityFocus_log_adj,InterCommunityFocus_log_adj
avg_disrupt,0.0,0.0003
max_disrupt,0.0083,0.001
median_disrupt,0.0004,0.0002
min_disrupt,0.0112,0.0009


In [69]:
# Display the slope table from the most recent regression call.
# NOTE(review): the saved output of this cell has the linear-metric z-score
# columns and repeats the values of the linear_metrics/disrupt slope table,
# not IntraCommunityFocus/InterCommunityFocus columns. It appears stale:
# the preceding cell was executed as In [37], this one as In [69]. Re-run the
# preceding cell and then this one to get the community-focus slopes.
coef

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_disrupt,0.0286,0.0409,0.0382,0.035,0.0009,0.0331,-0.0331
max_disrupt,0.0362,0.0615,0.0573,0.0653,0.0456,0.019,-0.019
median_disrupt,0.0223,0.0333,0.03,0.0268,-0.0061,0.0308,-0.0308
min_disrupt,0.0684,0.0585,0.0612,0.023,-0.086,0.0901,-0.0901


In [70]:
# Second pass: rebuild the merged table with the v2 collaboration percentages
# (df_pct_collab_new) in place of df_pct_collab, then trim as before.
df_all = df_pct_collab_new.join(df_clusters[['TotalConns', 'InterClusterRatio', 'IntraClusterRatio']], how = 'outer')
df_all = df_all.join(df_impact, how = 'outer')
df_all = df_all.join(df_author_focus, how = 'outer')
# Outer join, consistent with the other joins. Rows present only in df_disrupt
# have NaN TotalConns and are removed by the trimming below either way.
df_all = df_all.join(df_disrupt, how = 'outer')
df_trimmed = df_all[df_all['num_pubs'] >= 50]
df_trimmed = df_trimmed[df_trimmed['TotalConns'] >= 25]
print("Remaining after trimming: %d authors" % df_trimmed.shape[0])

# regression() reads the module-level df_table, which until now still holds
# frames derived from the previous df_trimmed. Rebuild every transformed frame
# from the re-trimmed data so the regressions below use the v2 values when the
# notebook is run top to bottom.
df_log = np.log(df_trimmed).add_suffix('_log')
df_log_adjusted = np.log1p(df_trimmed).add_suffix('_log_adj')
# StandardScaler standardises each column independently.
df_zscores = pd.DataFrame(
    StandardScaler().fit_transform(df_trimmed),
    index=df_trimmed.index,
    columns=df_trimmed.columns,
).add_suffix('_zscore')
df_table = {'zscore': df_zscores, 'log': df_log, 'log_adj': df_log_adjusted}

Remaining after trimming: 368618 authors


In [77]:
# Same call as the first linear_metrics/impact regression, repeated after the
# merged table was rebuilt from df_pct_collab_new.
# NOTE(review): regression() reads df_table, so the transformed frames must be
# rebuilt from the re-trimmed df_trimmed before this cell for the new data to
# be reflected. In the saved output only the pct_collab_* columns differ from
# the first run; the cluster-ratio columns are unchanged.
r2, coef = regression(linear_metrics, impact)
# Display the score table: rows are impact measures, columns are predictors.
r2

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_PR,0.0013,0.0072,0.0039,0.0087,0.0056,0.0003,0.0003
max_PR,0.0003,0.0019,0.0051,0.0075,0.0011,0.0018,0.0018
avg_AR,0.0,0.0056,0.005,0.0134,0.0183,0.0008,0.0008
max_AR,0.002,0.0029,0.0044,0.0049,0.0003,0.0007,0.0007


In [78]:
# Display the slope table returned together with r2 by the preceding regression call.
coef

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_PR,0.0168,0.0399,0.0294,0.0439,0.0354,-0.0076,0.0076
max_PR,-0.0202,-0.0489,-0.0791,-0.0961,-0.0376,0.0467,-0.0467
avg_AR,0.002,0.0269,0.0253,0.0418,0.0488,-0.0104,0.0104
max_AR,-0.0454,-0.0551,-0.0677,-0.0712,0.0188,0.0264,-0.0264


In [81]:
# Same call as the first linear_metrics/disrupt regression, repeated after the
# merged table was rebuilt from df_pct_collab_new.
# NOTE(review): regression() reads df_table, so the transformed frames must be
# rebuilt from the re-trimmed df_trimmed before this cell for the new data to
# be reflected.
r2, coef = regression(linear_metrics, disrupt)
# Display the score table: rows are disruption statistics, columns are predictors.
r2

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_disrupt,0.0101,0.019,0.022,0.029,0.0001,0.0119,0.0119
max_disrupt,0.0005,0.0075,0.0099,0.0208,0.0046,0.0029,0.0029
median_disrupt,0.0082,0.0139,0.0171,0.0213,0.0004,0.0098,0.0098
min_disrupt,0.0483,0.0498,0.0465,0.0426,0.0145,0.023,0.023


In [82]:
# Display the slope table returned together with r2 by the preceding regression call.
coef

Unnamed: 0,pct_collab_linauth_zscore,pct_collab_linprof_zscore,pct_collab_sqrtauth_zscore,pct_collab_sqrtprof_zscore,pct_collab_unweighted_zscore,InterClusterRatio_zscore,IntraClusterRatio_zscore
avg_disrupt,0.0299,0.041,0.0445,0.051,-0.0035,0.0331,-0.0331
max_disrupt,0.0077,0.0302,0.0349,0.0506,0.0239,0.019,-0.019
median_disrupt,0.0277,0.0359,0.0402,0.0448,-0.0061,0.0308,-0.0308
min_disrupt,0.1288,0.1304,0.1264,0.1208,-0.0713,0.0901,-0.0901
