In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
sns.set(color_codes=True)
%matplotlib inline
df_corr = pd.read_csv('C:\\Users\\Brend\\Downloads\\correlation_top42.csv', index_col = 'name', encoding = "UTF-8")
df_disrupt = pd.read_csv('C:\\Users\\Brend\\Downloads\\disrupt_top42.csv', index_col = 'name', encoding = "UTF-8")
df_all = df_corr.join(df_disrupt, how = 'outer')
print("Total: %d authors" % df_all.shape[0])

In [None]:
# Keep only rows that clear both minimum thresholds. Rows where either column
# is NaN fail the comparison and are dropped as well.
has_enough_pubs = df_all['num_pubs'] >= 50
has_enough_conns = df_all['TotalConns'] >= 25
df_trimmed = df_all[has_enough_pubs & has_enough_conns]
print("Remaining after trimming: %d authors" % len(df_trimmed))

In [None]:
# Column-name groups used by the regression cells.

# Predictors that the regression cells use via their '_zscore' columns.
linear_metrics = [
    'pct_collab_linauth',
    'pct_collab_linprof',
    'pct_collab_sqrtauth',
    'pct_collab_sqrtprof',
    'pct_collab_unweighted',
    'InterClusterRatio',
    'IntraClusterRatio',
]

# Predictors that the regression cells use in log-transformed form.
log_metrics = [
    'IntraCommunityFocus',
    'InterCommunityFocus',
]

all_metrics = [*linear_metrics, *log_metrics]

# Outcome column groups (avg/max of PR and AR; summary stats of disrupt).
impact = [
    'avg_PR',
    'max_PR',
    'avg_AR',
    'max_AR',
]
disrupt = [
    'avg_disrupt',
    'max_disrupt',
    'median_disrupt',
    'min_disrupt',
]

In [None]:
# Natural-log version of every trimmed column, stored as '<column>_log'.
# np.log yields -inf for zeros and NaN for negative values; nothing is
# filtered here.
df_log = df_trimmed.copy()
for name in df_trimmed.columns:
    df_log[name + '_log'] = np.log(df_log[name])

# Keep only columns whose name contains 'log' (the derived columns, plus any
# source column that happens to contain that substring).
keep_mask = ['log' in name for name in df_log.columns]
df_log = df_log.loc[:, keep_mask]

In [None]:
# Shifted-log version of every trimmed column: log(x + 1), stored as
# '<column>_log_adj'. The shift keeps zeros finite (log(1) == 0).
df_log_adjusted = df_trimmed.copy()
for name in df_trimmed.columns:
    df_log_adjusted[name + '_log_adj'] = np.log(df_log_adjusted[name] + 1)

# Keep only columns whose name contains 'log', i.e. the '_log_adj' columns
# (plus any source column that happens to contain that substring).
keep_mask = ['log' in name for name in df_log_adjusted.columns]
df_log_adjusted = df_log_adjusted.loc[:, keep_mask]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each trimmed column on its own (a fresh scaler per column) and
# store the result as '<column>_zscore' next to the source columns.
df_scaled = df_trimmed.copy()
for name in df_trimmed.columns:
    column_frame = df_scaled[[name]]  # 2-D input, as the scaler expects
    df_scaled[name + '_zscore'] = StandardScaler().fit_transform(column_frame)

# df_zscores holds only the standardised columns.
zscore_names = [name for name in df_scaled.columns if 'zscore' in name]
df_zscores = df_scaled[zscore_names]

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# R^2 of a one-predictor linear fit for every (z-scored metric, log impact)
# pair. Result table: rows = impact columns, columns = predictor columns.
print('r2 scores:')
d = dict()
for x_col in linear_metrics:
    x_name = x_col + '_zscore'
    X = df_zscores[[x_name]]
    r2_by_target = {}
    for y_col in impact:
        y = df_log[[y_col + '_log']]
        reg = LinearRegression().fit(X, y)
        r2_by_target[y_col] = round(reg.score(X, y), 4)
    d[x_name] = pd.Series(r2_by_target)
df_r2 = pd.DataFrame(d)
df_r2

In [None]:
# Slope of a one-predictor linear fit for every (z-scored metric, log impact)
# pair. Result table: rows = impact columns, columns = predictor columns.
# Only the coefficient is reported here, so R^2 is not computed in this cell.
print('coef_:')
d = dict()
for x_col in linear_metrics:
    x_name = x_col + '_zscore'
    X = df_zscores[[x_name]]
    coef_by_target = {}
    for y_col in impact:
        y = df_log[[y_col + '_log']]
        reg = LinearRegression().fit(X, y)
        # y is a one-column frame, so coef_ is 2-D: [target][feature].
        coef_by_target[y_col] = round(reg.coef_[0][0], 4)
    d[x_name] = pd.Series(coef_by_target)
# The name df_r2 is kept for compatibility; it holds coefficients, not R^2.
df_r2 = pd.DataFrame(d)
df_r2

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# R^2 of a one-predictor linear fit for every (log(x+1) metric, log impact)
# pair. Result table: rows = impact columns, columns = predictor columns.
#
# df_log_adjusted only contains columns named '<column>_log_adj', so the
# predictor must be looked up with that suffix; a plain '_log' suffix raises
# KeyError on this frame.
print('r2 scores:')
d = dict()
for x_col in log_metrics:
    x_name = x_col + '_log_adj'
    X = df_log_adjusted[[x_name]]
    r2_by_target = {}
    for y_col in impact:
        y = df_log[[y_col + '_log']]
        reg = LinearRegression().fit(X, y)
        r2_by_target[y_col] = round(reg.score(X, y), 4)
    # Label the result with the predictor column actually used.
    d[x_name] = pd.Series(r2_by_target)
df_r2 = pd.DataFrame(d)
df_r2

In [None]:
# Slope of a one-predictor linear fit for every (log(x+1) metric, log impact)
# pair. Result table: rows = impact columns, columns = predictor columns.
#
# df_log_adjusted only contains columns named '<column>_log_adj', so the
# predictor must be looked up with that suffix; a plain '_log' suffix raises
# KeyError on this frame. Only the coefficient is reported, so R^2 is not
# computed in this cell.
print('coef_:')
d = dict()
for x_col in log_metrics:
    x_name = x_col + '_log_adj'
    X = df_log_adjusted[[x_name]]
    coef_by_target = {}
    for y_col in impact:
        y = df_log[[y_col + '_log']]
        reg = LinearRegression().fit(X, y)
        # y is a one-column frame, so coef_ is 2-D: [target][feature].
        coef_by_target[y_col] = round(reg.coef_[0][0], 4)
    # Label the result with the predictor column actually used.
    d[x_name] = pd.Series(coef_by_target)
# The name df_r2 is kept for compatibility; it holds coefficients, not R^2.
df_r2 = pd.DataFrame(d)
df_r2

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# R^2 of a one-predictor linear fit for every (log(x+1) disrupt stat,
# log impact) pair. Result table: rows = impact columns, columns = predictors.
print('r2 scores:')
d = dict()

# Drop rows without a max_disrupt value, then attach the plain-log columns.
# NOTE(review): only max_disrupt_log_adj is checked for NaN; this presumably
# relies on the other disrupt columns being missing for the same rows — verify.
df_log_adjusted_notna = df_log_adjusted.dropna(subset=['max_disrupt_log_adj'])
df_reg = df_log_adjusted_notna.join(df_log, how = 'left')

for x_col in disrupt:
    x_name = x_col + '_log_adj'
    X = df_reg[[x_name]]
    r2_by_target = {}
    for y_col in impact:
        y = df_reg[[y_col + '_log']]
        reg = LinearRegression().fit(X, y)
        score = round(reg.score(X, y), 4)
        r2_by_target[y_col] = score
    d[x_name] = pd.Series(r2_by_target)
df_r2 = pd.DataFrame(d)
df_r2

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Slope of a one-predictor linear fit for every (log(x+1) disrupt stat,
# log impact) pair. Result table: rows = impact columns, columns = predictors.
print('coef_:')
d = dict()

# Drop rows without a max_disrupt value, then attach the plain-log columns.
# NOTE(review): only max_disrupt_log_adj is checked for NaN; this presumably
# relies on the other disrupt columns being missing for the same rows — verify.
df_log_adjusted_notna = df_log_adjusted.dropna(subset=['max_disrupt_log_adj'])
df_reg = df_log_adjusted_notna.join(df_log, how = 'left')

for x_col in disrupt:
    x_name = x_col + '_log_adj'
    X = df_reg[[x_name]]
    coef_by_target = {}
    for y_col in impact:
        y = df_reg[[y_col + '_log']]
        reg = LinearRegression().fit(X, y)
        # y is a one-column frame, so coef_ is 2-D: [target][feature].
        score = round(reg.coef_[0][0], 4)
        coef_by_target[y_col] = score
    d[x_name] = pd.Series(coef_by_target)
# df_r2 holds coefficients in this cell despite its name.
df_r2 = pd.DataFrame(d)
df_r2

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# R^2 of a one-predictor linear fit for every (z-scored metric, log disrupt
# stat) pair. Result table: rows = disrupt columns, columns = predictors.
print('r2 scores:')
d = dict()

# Drop rows without a max_disrupt log value and attach the z-scored predictors.
df_log_notna = df_log.dropna(subset=['max_disrupt_log'])
df_reg = df_log_notna.join(df_zscores, how = 'left')
# Drops rows whose min_disrupt log is -inf (log of 0) or NaN, since both fail
# the comparison.
df_reg = df_reg[df_reg['min_disrupt_log'] > -100]

for x_col in linear_metrics:
    x_name = x_col + '_zscore'
    X = df_reg[[x_name]]
    r2_by_target = {}
    for y_col in disrupt:
        y = df_reg[[y_col + '_log']]
        reg = LinearRegression().fit(X, y)
        score = round(reg.score(X, y), 4)
        r2_by_target[y_col] = score
    # Label with the '_zscore' column that was actually regressed on; a
    # '_log' label would misdescribe the predictor.
    d[x_name] = pd.Series(r2_by_target)
df_r2 = pd.DataFrame(d)
df_r2

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Slope of a one-predictor linear fit for every (z-scored metric, log disrupt
# stat) pair. Result table: rows = disrupt columns, columns = predictors.
print('coef_:')
d = dict()

# Drop rows without a max_disrupt log value and attach the z-scored predictors.
df_log_notna = df_log.dropna(subset=['max_disrupt_log'])
df_reg = df_log_notna.join(df_zscores, how = 'left')
# Drops rows whose min_disrupt log is -inf (log of 0) or NaN, since both fail
# the comparison.
df_reg = df_reg[df_reg['min_disrupt_log'] > -100]

for x_col in linear_metrics:
    x_name = x_col + '_zscore'
    X = df_reg[[x_name]]
    coef_by_target = {}
    for y_col in disrupt:
        y = df_reg[[y_col + '_log']]
        reg = LinearRegression().fit(X, y)
        # y is a one-column frame, so coef_ is 2-D: [target][feature].
        score = round(reg.coef_[0][0], 4)
        coef_by_target[y_col] = score
    # Label with the '_zscore' column that was actually regressed on; a
    # '_log' label would misdescribe the predictor.
    d[x_name] = pd.Series(coef_by_target)
# The name df_r2 is kept for compatibility; it holds coefficients, not R^2.
df_r2 = pd.DataFrame(d)
df_r2

In [None]:
import time
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC

# For each log-impact target, fit a 20-fold cross-validated Lasso on all
# z-scored linear metrics and plot the per-fold MSE along the alpha path.
for y_col in impact:
    started = time.time()
    predictor_names = [x_col + '_zscore' for x_col in linear_metrics]
    X = df_zscores[predictor_names]
    y = df_log[y_col + '_log']

    model = LassoCV(cv=20).fit(X, y)
    t_lasso_cv = time.time() - started

    # x-axis is -log10(alpha), so weaker regularisation is further right.
    m_log_alphas = -np.log10(model.alphas_)
    mean_mse = model.mse_path_.mean(axis=-1)

    fig, ax = plt.subplots()
    ax.plot(m_log_alphas, model.mse_path_, ':')  # one dotted line per fold
    ax.plot(m_log_alphas, mean_mse, 'k',
            label='Average across the folds', linewidth=2)
    # Vertical marker at the alpha selected by cross-validation.
    ax.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
               label='alpha: CV estimate')

    ax.legend()

    ax.set_xlabel('-log(alpha)')
    ax.set_ylabel('MSE ' + y_col)
    ax.set_title('Mean square error on each fold: coordinate descent '
                 '(train time: %.2fs)' % t_lasso_cv)
    ax.axis('tight')
    plt.show()
