In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
# Directory holding the exported "top42" CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = Path('C:\\Users\\Brend\\Downloads')


def _read_name_indexed(filename):
    """Read ``DATA_DIR / filename`` as a UTF-8 CSV indexed by its 'name' column."""
    return pd.read_csv(DATA_DIR / filename, index_col='name', encoding='UTF-8')


df_clusters = _read_name_indexed('clusters_top42.csv')
df_pct_collab = _read_name_indexed('pct_collaborative_pubs_top42.csv')
# NOTE(review): df_pct_collab_new is not referenced by the cells visible here;
# confirm whether the v2 file is meant to replace df_pct_collab.
df_pct_collab_new = _read_name_indexed('pct_collaborative_pubs_top42_v2.csv')
df_impact = _read_name_indexed('impact_top42.csv')
df_author_focus = _read_name_indexed('community_focus_top42.csv')
df_disrupt = _read_name_indexed('disrupt_profs_top42.csv')
df_unique_profs = _read_name_indexed('unique_profs_top42.csv')
df_orgs = _read_name_indexed('orgs_top42.csv')

In [3]:
# Outer-join every per-name table onto the collaboration percentages so that a
# name present in any input keeps a row. Only three columns are taken from the
# cluster table.
df_all = (
    df_pct_collab
    .join(df_clusters[['TotalConns', 'InterClusterRatio', 'IntraClusterRatio']], how='outer')
    .join(df_impact, how='outer')
    .join(df_author_focus, how='outer')
    .join(df_disrupt, how='outer')
    .join(df_unique_profs, how='outer')
    .join(df_orgs, how='outer')
)

In [15]:
# Apply the four row filters one after another (same order as before); a row
# must pass all of them to stay in df_trimmed.
df_trimmed = (
    df_all
    .loc[lambda frame: frame['num_pubs'] >= 50]
    .loc[lambda frame: frame['TotalConns'] >= 25]
    .loc[lambda frame: frame['num_disrupt'] >= 10]
    .loc[lambda frame: frame['orgs'] <= 2000]
)
print("Remaining after trimming: %d authors" % df_trimmed.shape[0])

Remaining after trimming: 53490 authors


In [16]:
# Column-name groups used to drive the per-column transforms and regressions.
linear_metrics = [
    'pct_collab_linauth',
    'pct_collab_linprof',
    'pct_collab_sqrtauth',
    'pct_collab_sqrtprof',
    'pct_collab_unweighted',
    'InterClusterRatio',
    'IntraClusterRatio',
]
log_metrics = [
    'IntraCommunityFocus',
    'InterCommunityFocus',
]
# Linear metrics first, then log metrics.
all_metrics = [*linear_metrics, *log_metrics]
impact = [
    'avg_PR',
    'max_PR',
    'avg_AR',
    'max_AR',
]
disrupt = [
    'avg_disrupt',
    'max_disrupt',
    'median_disrupt',
    'min_disrupt',
]
unique_coauthor_profs = [
    'unique_coauthor_profs',
    'unique_coauthor_profs_lin_damp',
    'unique_coauthor_profs_sqrt_damp',
    'unique_coauthor_profs_square_damp',
    'unique_coauthor_profs_exp_damp',
]


In [17]:
# Natural log of every column of df_trimmed, with each result column named
# '<col>_log'. Building the frame in one vectorized step means the result holds
# exactly the transformed columns: the earlier `'log' in col` substring filter
# would also have kept any raw column whose own name happens to contain "log".
# Non-positive inputs yield -inf/NaN here, as before.
df_log = np.log(df_trimmed).add_suffix('_log')

In [18]:
# log(x + 1) of every column of df_trimmed, with each result column named
# '<col>_log_adj'. Built in one vectorized step so that the result holds exactly
# the transformed columns, rather than every column whose name contains "log".
# Inputs <= -1 yield -inf/NaN here, as before.
df_log_adjusted = np.log(df_trimmed + 1).add_suffix('_log_adj')

In [19]:
# Standardize each column of df_trimmed independently with its own
# StandardScaler and store the result as '<col>_zscore'.
# df_scaled keeps the raw columns plus the z-score columns; df_zscores holds the
# z-score columns only.
df_scaled = df_trimmed.copy()
zscore_names = []
for col in df_trimmed.columns:
    zscore_name = col + '_zscore'
    df_scaled[zscore_name] = StandardScaler().fit_transform(df_trimmed[[col]])
    zscore_names.append(zscore_name)
# Select by the exact names created above: a substring test on 'zscore' would
# also pick up any raw column whose name contains that text.
df_zscores = df_scaled[zscore_names]

In [20]:
# Map each analysed column name to the label of the transform used for it.
# Groups are applied in the listed order, so a name appearing in more than one
# group would end up with the label of the last group that contains it.
processing = {
    column: transform
    for columns, transform in (
        (linear_metrics, 'zscore'),
        (log_metrics, 'log_adj'),
        (impact, 'log'),
        (disrupt, 'log_adj'),
        (unique_coauthor_profs, 'log'),
    )
    for column in columns
}
# Transform label -> frame holding the columns produced by that transform.
df_table = dict(zscore=df_zscores, log=df_log, log_adj=df_log_adjusted)

In [21]:
def regression(x_cols, y_cols):
    """Fit one single-predictor linear regression per (x, y) column pair.

    Each name is looked up in the module-level ``processing`` dict to get its
    transform label; the column ``'<name>_<label>'`` is then read from
    ``df_table[label]``. Infinite values are treated as missing, rows missing
    in either column are dropped (inner join on the index), and a
    ``LinearRegression`` is fitted on the remaining rows.

    Parameters
    ----------
    x_cols : sequence of str
        Untransformed names of the predictor columns.
    y_cols : sequence of str
        Untransformed names of the target columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(scores, coefficients)``. Both frames have one row per ``y_cols``
        entry and one column per transformed predictor name. Values are the
        R^2 from ``reg.score`` and the fitted slope, rounded to 4 decimals.

    Raises
    ------
    KeyError
        If a name is missing from ``processing`` or its transformed column is
        missing from the corresponding frame in ``df_table``.
    """
    def finite_rows(col):
        # Resolve '<col>_<label>' in its transform frame and drop the rows
        # whose value is +/-inf or NaN.
        key = col + '_' + processing[col]
        frame = df_table[processing[col]][[key]]
        frame = frame.replace([np.inf, -np.inf], np.nan).dropna(subset=[key])
        return key, frame

    scores = dict()
    slopes = dict()
    for x_col in x_cols:
        # The predictor does not depend on y_col, so prepare it once per x_col
        # instead of once per (x, y) pair.
        x_key, x_frame = finite_rows(x_col)
        score_list = []
        coef_list = []
        index_list = []
        for y_col in y_cols:
            y_key, y_frame = finite_rows(y_col)
            # Keep only the rows where both values are present and finite.
            paired = x_frame.join(y_frame, how='inner')
            X = paired[[x_key]]
            y = paired[[y_key]]
            reg = LinearRegression().fit(X, y)
            score_list.append(round(reg.score(X, y), 4))
            # y is two-dimensional, so coef_ has shape (1, 1).
            coef_list.append(round(reg.coef_[0][0], 4))
            index_list.append(y_col)
        scores[x_key] = pd.Series(score_list, index=index_list)
        slopes[x_key] = pd.Series(coef_list, index=index_list)
    return pd.DataFrame(scores), pd.DataFrame(slopes)


In [30]:
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's global plot settings; color_codes=True is passed through to seaborn.
sns.set(color_codes=True)
# IPython line magic (not Python syntax): show matplotlib figures inline in the cell output.
%matplotlib inline
def joint_plot(x_cols, y_cols):
    """Draw one scatter plot per (x, y) pair of transformed columns.

    Columns are resolved exactly as in ``regression``: each name is mapped
    through the module-level ``processing`` dict to a transform label, and
    ``'<name>_<label>'`` is read from ``df_table[label]``. Infinite values are
    treated as missing, and only rows where both columns are present are
    plotted. One 8x8 inch figure is shown per pair; no regression line is drawn
    (``fit_reg=False``).

    Parameters
    ----------
    x_cols : sequence of str
        Untransformed names of the columns plotted on the x axis.
    y_cols : sequence of str
        Untransformed names of the columns plotted on the y axis.

    Returns
    -------
    None
    """
    def finite_rows(col):
        # Resolve '<col>_<label>' in its transform frame and drop the rows
        # whose value is +/-inf or NaN.
        key = col + '_' + processing[col]
        frame = df_table[processing[col]][[key]]
        frame = frame.replace([np.inf, -np.inf], np.nan).dropna(subset=[key])
        return key, frame

    for x_col in x_cols:
        # The x column does not depend on y_col, so prepare it once per x_col.
        x_key, x_frame = finite_rows(x_col)
        for y_col in y_cols:
            y_key, y_frame = finite_rows(y_col)
            # Two-column frame (x first, then y) restricted to shared rows.
            data = x_frame.join(y_frame, how='inner')
            f, ax = plt.subplots(figsize=(8, 8))
            # x, y and data are passed by keyword: recent seaborn releases no
            # longer accept them positionally.
            sns.regplot(x=x_key, y=y_key, data=data, ax=ax, dropna=True, fit_reg=False,
                        scatter_kws={"color": "darkred", "alpha": 0.3, "s": 1})
            plt.show()
            # Release the figure so a long run of pairs does not accumulate
            # open figures.
            plt.close(f)

In [11]:
# Regress each impact column (y) on each unique_coauthor_profs column (x), one pair at a time.
# NOTE(review): this cell's execution count (11) is lower than that of the cell
# defining regression (21), so the saved output may predate the current
# definitions. Re-run the notebook top to bottom to confirm.
r2, coef = regression(unique_coauthor_profs, impact)

In [12]:
# R^2 table from the regression call above: rows are impact columns, columns are the transformed predictors.
r2

Unnamed: 0,unique_coauthor_profs_log,unique_coauthor_profs_lin_damp_log,unique_coauthor_profs_sqrt_damp_log,unique_coauthor_profs_square_damp_log,unique_coauthor_profs_exp_damp_log
avg_PR,0.0417,0.0745,0.058,0.0763,0.0018
max_PR,0.0847,0.0856,0.09,0.0492,0.0009
avg_AR,0.0579,0.0982,0.0786,0.0947,0.0022
max_AR,0.1124,0.113,0.1194,0.0629,0.0012


In [14]:
# Slope table from the same regression call: rows are impact columns, columns are the transformed predictors.
coef

Unnamed: 0,unique_coauthor_profs_log,unique_coauthor_profs_lin_damp_log,unique_coauthor_profs_sqrt_damp_log,unique_coauthor_profs_square_damp_log,unique_coauthor_profs_exp_damp_log
avg_PR,0.0716,0.1301,0.1016,0.1243,0.0007
max_PR,0.2389,0.3267,0.2965,0.2337,0.0011
avg_AR,0.0645,0.1142,0.0904,0.1058,0.0006
max_AR,0.252,0.3438,0.3127,0.2421,0.0012


In [22]:
# Regress each impact column (y) on each disrupt column (x); this rebinds r2 and coef.
r2, coef = regression(disrupt, impact)

In [23]:
# R^2 table from the regression call above: rows are impact columns, columns are the transformed disrupt predictors.
r2

Unnamed: 0,avg_disrupt_log_adj,max_disrupt_log_adj,median_disrupt_log_adj,min_disrupt_log_adj
avg_PR,0.0193,0.0393,0.0027,0.004
max_PR,0.0112,0.037,0.0012,0.0078
avg_AR,0.0024,0.0177,0.0003,0.0027
max_AR,0.0014,0.02,0.0003,0.0063


In [24]:
# Slope table from the same regression call: rows are impact columns, columns are the transformed disrupt predictors.
coef

Unnamed: 0,avg_disrupt_log_adj,max_disrupt_log_adj,median_disrupt_log_adj,min_disrupt_log_adj
avg_PR,3.8858,0.7002,5.356,-0.5635
max_PR,5.7373,1.3209,6.9871,-1.5247
avg_AR,1.107,0.3762,-1.4632,-0.3668
max_AR,1.8708,0.9076,-3.1737,-1.2787


In [25]:
# Regress each disrupt column (y) on each unique_coauthor_profs column (x); this rebinds r2 and coef.
r2, coef = regression(unique_coauthor_profs, disrupt)

In [26]:
# R^2 table from the regression call above: rows are disrupt columns, columns are the transformed predictors.
r2

Unnamed: 0,unique_coauthor_profs_log,unique_coauthor_profs_lin_damp_log,unique_coauthor_profs_sqrt_damp_log,unique_coauthor_profs_square_damp_log,unique_coauthor_profs_exp_damp_log
avg_disrupt,0.0366,0.0361,0.0404,0.0125,0.0007
max_disrupt,0.0016,0.0,0.0005,0.0052,0.0007
median_disrupt,0.0219,0.0182,0.0226,0.0041,0.0036
min_disrupt,0.0029,0.0067,0.0047,0.0077,0.0004


In [27]:
# Slope table from the same regression call: rows are disrupt columns, columns are the transformed predictors.
coef

Unnamed: 0,unique_coauthor_profs_log,unique_coauthor_profs_lin_damp_log,unique_coauthor_profs_sqrt_damp_log,unique_coauthor_profs_square_damp_log,unique_coauthor_profs_exp_damp_log
avg_disrupt,-0.0036,-0.0054,-0.0049,-0.0033,0.0001
max_disrupt,-0.0061,0.0014,-0.0044,0.017,0.0005
median_disrupt,-0.0008,-0.001,-0.001,-0.0005,0.0
min_disrupt,-0.0032,-0.0074,-0.0053,-0.0082,-0.0001
