# Section Order:
(1) Import Packages & Data

(2) Getting acquainted with the data

# Import Packages & Data

In [1]:
# Data Basics
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt # side-stepping mpl backend
from pandas.tools.plotting import scatter_matrix
import graphlab as gl  # gl.canvas / gl.SFrame are used well before the later `import graphlab` cell
%matplotlib inline

# Preprocessing
from sklearn import preprocessing as pp

# Regression / Modeling
import statsmodels.api as sm
from sklearn import cross_validation  # train_test_split is used before the later sklearn import cell

# Custom functions
from project_functions import label_polynomial_features
from project_functions import model_to_dictionary
from project_functions import results_summary_to_dataframe

In [2]:
df_raw = pd.read_csv('../Resources/Data/Raw/mypersonality_final.csv')

# Getting acquainted with the data

In [3]:
df_raw.head(3).T

Unnamed: 0,0,1,2
#AUTHID,b7b7764cfa1c523e4e93ab2a79a946c4,b7b7764cfa1c523e4e93ab2a79a946c4,b7b7764cfa1c523e4e93ab2a79a946c4
STATUS,likes the sound of thunder.,is so sleepy it's not even funny that's she ca...,is sore and wants the knot of muscles at the b...
sEXT,2.65,2.65,2.65
sNEU,3,3,3
sAGR,3.15,3.15,3.15
sCON,3.25,3.25,3.25
sOPN,4.4,4.4,4.4
cEXT,n,n,n
cNEU,y,y,y
cAGR,n,n,n


In [None]:
df_raw.shape

In [None]:
df_grouped = df_raw.groupby(by = "#AUTHID", as_index = False).first()

In [None]:
df_grouped[['sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN',"NETWORKSIZE","BETWEENNESS"]].describe()

In [None]:
df_grouped_reduced = df_grouped[['sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN',"NETWORKSIZE","BETWEENNESS"]].sort_values(by= "BETWEENNESS")


In [None]:
df_grouped.quantile([0.1,0.25,0.5,0.75,0.9])

In [None]:
df_grouped["BETWEENNESS"].quantile(0.1)

In [None]:
df_grouped["BETWEENNESS"].quantile(0.9)

In [None]:
# Users whose betweenness centrality is strictly below the 10th percentile
# (the percentile is computed over all grouped users).
is_low_user = df_grouped_reduced["BETWEENNESS"] < df_grouped["BETWEENNESS"].quantile(0.1)
low_users_df = df_grouped_reduced.loc[is_low_user]

# Column-wise average of that bottom-10% group.
average_low_user = low_users_df.mean()

# Fairly high openness... intelligence leads to a decrease in working in groups?

In [None]:
# Users whose betweenness centrality is strictly above the 90th percentile
# (the percentile is computed over all grouped users).
high_users_df = df_grouped_reduced[df_grouped_reduced["BETWEENNESS"] > df_grouped["BETWEENNESS"].quantile(0.9)]

# Average the top-10% group itself. The previous filter used `<`, which averaged
# the bottom 90% of users and reported that as the "high user" profile.
average_high_user = high_users_df.mean()

# Fairly high openness... intelligence leads to a decrease in working in groups?
# NOTE(review): this remark is identical to the one in the low-user cell; re-check
# it against the corrected top-10% averages.

In [None]:
print "Low User:"
print average_low_user
print
print "High User:"
print average_high_user
print


In [None]:
# NOTE: `gl` (GraphLab) must already be imported when this cell runs.
# Select 'ipynb' as the GraphLab Canvas target (presumably so .show() output
# renders inside the notebook — see the GraphLab canvas docs).
gl.canvas.set_target('ipynb')


# Wrap the top-10% and bottom-10% betweenness groups as SFrames so the cells
# below can call .show() on individual trait columns.
high_users_sl = gl.SFrame(high_users_df)
low_users_sl = gl.SFrame(low_users_df)



## Observation: High users clearly "all" are oriented toward greater extroversion. 
Low users are more spread out

In [None]:
high_users_sl["sEXT"].show()
low_users_sl["sEXT"].show()


## Observation: High users show greater diversity of values in conscientiousness, whereas low users tend to be centered in the middle.

In [None]:
high_users_sl["sCON"].show()
low_users_sl["sCON"].show()


# High users tend to have low values of neuroticism. Low users are more spread out.

In [None]:
high_users_sl["sNEU"].show()
low_users_sl["sNEU"].show()


## About the same in both groups..

In [None]:

high_users_sl["sAGR"].show()
low_users_sl["sAGR"].show()


# High users tend to be concentrated at the extreme positive end of openness.

In [None]:

high_users_sl["sOPN"].show()
low_users_sl["sOPN"].show()


# Extracting some time series

In [None]:
x_extroversion = df_grouped["sEXT"]
x_neuroticism = df_grouped["sNEU"]
x_agreeableness = df_grouped["sAGR"]
x_conscientiousness = df_grouped["sCON"]
x_openness = df_grouped["sOPN"]

# y_betweenness = df_grouped["BETWEENNESS"] / df_grouped["NETWORKSIZE"]   # Temporary
y_betweenness = df_grouped["BETWEENNESS"]


y_nbetweenness = df_grouped["NBETWEENNESS"]

network_size = df_grouped["NETWORKSIZE"]

variable_list = [x_extroversion,x_neuroticism,x_agreeableness,x_conscientiousness,x_openness]

# List of Basic Models

In [None]:
# Assemble each design matrix column-wise from the extracted Series.
# pd.concat(axis=1) keeps every column's own name and dtype, whereas
# DataFrame([series, ...]).T first builds one row per feature and then
# transposes, which forces all columns to a single common dtype.
big5_feature_columns = [x_extroversion, x_neuroticism, x_agreeableness, x_conscientiousness, x_openness]

X_big5 = pd.concat(big5_feature_columns, axis=1)
X_network = network_size.to_frame()
X_big5_and_network = pd.concat(big5_feature_columns + [network_size], axis=1)
# Same features plus the betweenness target; used only for the scatter matrix.
X_big5_and_network_betweenness = pd.concat(big5_feature_columns + [network_size, y_betweenness], axis=1)

# Creating some useful scatterplots

In [None]:
scatter_matrix(X_big5_and_network_betweenness, alpha=0.2, figsize=(15,15), diagonal='kde')
plt.show()

## Create a function which does a "polynomial expansion" for all the individual features

# Generate expanded models

In [None]:
X_big5.head()

# Testing out alternate solution to preprocessing code - Beginning

In [None]:
X_big5_test = X_big5
X_big5_test.head()

In [None]:
poly = pp.PolynomialFeatures(2,include_bias=True)
output_nparray = poly.fit_transform(X_big5_test)
powers_nparray = poly.powers_

In [None]:
# Build one label per generated column (e.g. "sEXT^1xsNEU^1") by pairing each
# input column with its exponent in the matching row of poly.powers_ and
# skipping zero exponents. A row whose exponents are all zero therefore yields
# an empty label.
# The loop variables deliberately avoid the name `tuple`: in Python 2 a list
# comprehension's variables leak into the enclosing scope, so the previous
# spelling replaced the builtin `tuple` for the rest of the session.
target_feature_names = [
    'x'.join('{}^{}'.format(column_name, exponent)
             for column_name, exponent in zip(X_big5_test.columns, feature_powers)
             if exponent != 0)
    for feature_powers in poly.powers_
]
output_df = pd.DataFrame(output_nparray, columns = target_feature_names)

In [None]:
output_df.head()

## It works, but the constant term doesn't show up as a labeled column (the bias column's label comes out empty). Regardless, it seems a lot faster than how I did it. Will need to digest further later.

# Testing out alternate solution to preprocessing code - End

In [None]:
# Degree-2 expansions of the three base design matrices.
# This has to come first, otherwise the constant term will get duplicated:
# the second group below overwrites X_big5 / X_network / X_big5_and_network
# with their relabeled versions, so the expansions must read the raw frames.
X_big5_polyexpanded = label_polynomial_features(X_big5,2,bias = True)
X_network_polyexpanded =  label_polynomial_features(X_network,2,bias = True)
X_big5_and_network_polyexpanded =  label_polynomial_features(X_big5_and_network,2,bias = True)

# The reason I'm recreating these with polynomial expansion (degree 1) is just so they
# will have feature names consistent with the others.
# NOTE(review): each line reassigns its own input, so this cell is not idempotent —
# re-running it feeds the already-relabeled frames back into label_polynomial_features.
X_big5= label_polynomial_features(X_big5,1,bias = True)
X_network=  label_polynomial_features(X_network,1,bias = True)
X_big5_and_network =  label_polynomial_features(X_big5_and_network,1,bias = True)


#target_feature_names = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(X_big5`.columns,p) for p in poly.powers_]]
#output_df = pd.DataFrame(output_nparray, columns = target_feature_names)


In [None]:
# Names of the six candidate design matrices: each base model is immediately
# followed by its polynomial-expanded counterpart.
list_of_model_names = [
    base_model_name + name_suffix
    for base_model_name in ('X_big5', 'X_network', 'X_big5_and_network')
    for name_suffix in ('', '_polyexpanded')
]

In [None]:
X_big5_polyexpanded.head()

In [None]:
X_big5_polyexpanded.head()

In [None]:
# Map each model name to the design matrix (X DataFrame) that model is fitted on.
dictionary_of_models_x_matricies = dict(
    X_big5=X_big5,
    X_big5_polyexpanded=X_big5_polyexpanded,
    X_network=X_network,
    X_network_polyexpanded=X_network_polyexpanded,
    X_big5_and_network=X_big5_and_network,
    X_big5_and_network_polyexpanded=X_big5_and_network_polyexpanded,
)

# Clean Data as Necessary

# Summary Statistics & Descriptive Visualizations (Pandas and Matplotlib, etc)

### Plot network size

### Printing the covariance matrix

In [None]:
X_big5_and_network.corr()

In [None]:
np.cov(np.vstack([x_extroversion,x_neuroticism,x_agreeableness,x_conscientiousness,x_openness,network_size]))

### Perform Scatter plots (y vs. Xi) for each of the X's -- this suggest relationship

### Check features for normality by using a density plot - plot normal on top of it. If it's not normal, then do some transformations on the features until they get to be normal

# Testing (Regression)

### Step 1a: (via intuition) Generate different models and provide explanations of why (avoid products, okay for squares and logs)

### Step 1b: Algorithmically - go through all the options and create them.

### Step 2: Separate Data into 3 groupings: training, cv, and test
- Create seed

In [None]:
list_of_all_possible_features = list(dictionary_of_models_x_matricies["X_big5_and_network_polyexpanded"].columns)

In [None]:
.7*250

In [None]:
# For every candidate design matrix: hold out 30% of the rows (fixed seed),
# build an OLS model on the training rows, and store whatever
# model_to_dictionary reports for it, keyed by model name.
dictionary_of_model_general_performance_metrics = {}
for model_name in dictionary_of_models_x_matricies:
    model_matrix = dictionary_of_models_x_matricies[model_name]
    print(model_name)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        model_matrix, y_betweenness, test_size=0.3, random_state=0)
    model = sm.OLS(y_train, X_train)
    dictionary_of_model_general_performance_metrics[model_name] = model_to_dictionary(model, X_test, y_test)

In [None]:
dictionary_of_model_general_performance_metrics["X_big5"]['Z_Test RSqAdj']

### Step 3: Using the Training set, determine the best coefficients for each model... (Fitting a model -- sm.fit()))

In [None]:
# Flatten the per-model metric dictionaries into one comparison table with one
# row per model.
performance_by_model = dictionary_of_model_general_performance_metrics
model_names = list(performance_by_model.keys())


def _collect_metric(metric_key):
    """Return the `metric_key` entry of every model, ordered like `model_names`.

    Done in a function so the iteration variable does not overwrite a notebook
    global; the previous loop used `model` and thereby replaced the OLS `model`
    object with a string key.
    """
    return [performance_by_model[name][metric_key] for name in model_names]


number_regressors = _collect_metric("Number Regressors")
# Collapse each model's list of feature names into a single "; "-separated string.
features_names = ["; ".join(names_for_one_model) for names_for_one_model in _collect_metric("Feature Names")]
train_rsq_adj = _collect_metric("TRAIN RSqAdj")
train_ssr = _collect_metric("TRAIN SSR")
test_rsq_adj = _collect_metric("Test RSqAdj")
test_sse = _collect_metric("Test SSE")
z_test_rsq_adj = _collect_metric("Z_Test RSqAdj")   # Added for troubleshooting purposes
z_test_sst = _collect_metric("z_test_sst")          # Added for troubleshooting purposes
#model_dict["test_sst"] = test_sst

names = ["model_names","number_regressors","feature_names","train_rsq_adj","train_ssr","test_rsq_adj","z_test_rsq_adj","test_sse","z_test_sst"]
values = [model_names,number_regressors,features_names,train_rsq_adj,train_ssr,test_rsq_adj,z_test_rsq_adj,test_sse,z_test_sst]

# `columns=names` fixes the column order at construction time.
blah = pd.DataFrame(dict(zip(names, values)), columns=names)
blah

# Although my Z_test_rsq_adj doesn't have negative values, and in general the values are smaller than Jeremy's, there is still a discrepancy; there is likely a problem here that needs to be addressed

In [None]:
dictionary_of_model_general_performance_metrics["X_big5"]

In [None]:
# This thing takes the final metrics from dictionary of model performance, extracts out the Coefficients and 
# P values for each model. It packages them up into a tuple, reofrmates them into lists, and then they get converted into 
# a pandas dataframe for "easy viewing"

# Build two lookup tables keyed by (feature, model_name): the estimated
# coefficient and the training p-value of that feature in that model.
# Features a model does not contain are recorded as None, so every
# (feature, model) pair has an entry.
dictionary_of_model_coefficients = {}
dictionary_of_model_pvalues = {}

for feature in list_of_all_possible_features:
    for model_name, model_performance_metrics_dict in dictionary_of_model_general_performance_metrics.items():
        lookup_key = (feature, model_name)
        feature_is_in_model = feature in model_performance_metrics_dict["Feature Names"]

        p_value_for_feature = model_performance_metrics_dict["TRAIN PValues"][feature] if feature_is_in_model else None
        coef_for_feature = model_performance_metrics_dict["Estimated Coefficients"][feature] if feature_is_in_model else None

        dictionary_of_model_coefficients[lookup_key] = coef_for_feature
        dictionary_of_model_pvalues[lookup_key] = p_value_for_feature
        

In [None]:
beta_big5 = []
pvalues_big5 = []
beta_big5_polyexpanded = []
pvalues_big5_polyexpanded = []
beta_network = []
pvalues_network = []
beta_network_polyexpanded = []
pvalues_network_polyexpanded = []
beta_big5_and_network = []
pvalues_big5_and_network = []
beta_big5_and_network_polyexpanded = []
pvalues_big5_and_network_polyexpanded     = []

In [None]:
def _values_for_model(lookup, model_name):
    """Return lookup[(feature, model_name)] for every feature, in feature order."""
    return [lookup[(feature, model_name)] for feature in list_of_all_possible_features]


# Rebuild every per-model column from scratch instead of appending to lists
# created in another cell; appending made this cell non-idempotent (a second
# run doubled the lists and broke the DataFrame construction below).
beta_big5 = _values_for_model(dictionary_of_model_coefficients, "X_big5")
pvalues_big5 = _values_for_model(dictionary_of_model_pvalues, "X_big5")
beta_big5_polyexpanded = _values_for_model(dictionary_of_model_coefficients, "X_big5_polyexpanded")
pvalues_big5_polyexpanded = _values_for_model(dictionary_of_model_pvalues, "X_big5_polyexpanded")
beta_network = _values_for_model(dictionary_of_model_coefficients, "X_network")
pvalues_network = _values_for_model(dictionary_of_model_pvalues, "X_network")
beta_network_polyexpanded = _values_for_model(dictionary_of_model_coefficients, "X_network_polyexpanded")
pvalues_network_polyexpanded = _values_for_model(dictionary_of_model_pvalues, "X_network_polyexpanded")
beta_big5_and_network = _values_for_model(dictionary_of_model_coefficients, "X_big5_and_network")
pvalues_big5_and_network = _values_for_model(dictionary_of_model_pvalues, "X_big5_and_network")
beta_big5_and_network_polyexpanded = _values_for_model(dictionary_of_model_coefficients, "X_big5_and_network_polyexpanded")
pvalues_big5_and_network_polyexpanded = _values_for_model(dictionary_of_model_pvalues, "X_big5_and_network_polyexpanded")

# One row per feature, one beta/p-value column pair per model.
twinkie = pd.DataFrame({"feature":list_of_all_possible_features,
                        "beta_big5":beta_big5,
                        "pvalues_big5":pvalues_big5,
                        "beta_big5_polyexpanded":beta_big5_polyexpanded,
                        "pvalues_big5_polyexpanded":pvalues_big5_polyexpanded,
                        "beta_network":beta_network,
                        "pvalues_network":pvalues_network,
                        "beta_network_polyexpanded":beta_network_polyexpanded,
                        "pvalues_network_polyexpanded":pvalues_network_polyexpanded,
                        "beta_big5_and_network":beta_big5_and_network,
                        "pvalues_big5_and_network":pvalues_big5_and_network,
                        "beta_big5_and_network_polyexpanded":beta_big5_and_network_polyexpanded,
                        "pvalues_big5_and_network_polyexpanded":pvalues_big5_and_network_polyexpanded})

twinkie2 = twinkie.set_index("feature")

In [None]:
print list(twinkie2.columns)
twinkie2.head()

In [None]:
# Reorder the columns: base models first, then their polynomial expansions.
# The previous list repeated the six base-model columns twice and never
# selected the *_polyexpanded columns.
twinkie3 = twinkie2[[
    "beta_big5", "pvalues_big5",
    "beta_network", "pvalues_network",
    "beta_big5_and_network", "pvalues_big5_and_network",
    "beta_big5_polyexpanded", "pvalues_big5_polyexpanded",
    "beta_network_polyexpanded", "pvalues_network_polyexpanded",
    "beta_big5_and_network_polyexpanded", "pvalues_big5_and_network_polyexpanded",
]]

In [None]:
twinkie3

In [None]:
blah

# Clearly there's a big need to run lasso on the biggest model! -- Radj is too big and in some cases negative (??)!

In [None]:
import pandas as pd
from sklearn import cross_validation
import statsmodels.api as sm
from sklearn.cross_validation import KFold
from sklearn.linear_model import Lasso


In [None]:
X_big5_and_network_polyexpanded.shape

In [None]:
X_big5_and_network_polyexpanded.describe()

In [None]:
type(np.array(X_big5_and_network_polyexpanded))

In [None]:
# Grid search over the Lasso penalty: for every candidate lambda, run 5-fold
# cross-validation and record the mean out-of-fold sum of squared errors.

# Convert the design matrix and target to NumPy once, so the fold loop can
# index rows by position without re-converting on every iteration.
X_polyexpanded_array = np.array(X_big5_and_network_polyexpanded)
y_betweenness_array = np.array(y_betweenness)

# Size the folds from the data rather than hard-coding the number of rows.
kf = KFold(n = len(X_polyexpanded_array), n_folds=5)
#, shuffle=True,random_state=0)

lambda_range = np.arange(10990,10995,0.01)    # Ideal to start with a coarse grid and then make it finer. E.g. (0,1,0.05) -> (0,0.05, 0.001)
#lambda_range = np.arange(0,0.033,0.0001)    # Grid used for the normalised betweenness target

error_per_lambda = []

for lambda_value in lambda_range:
    errors_per_fold_list = []
    for train_index, test_index in kf:
        X_train, X_test = X_polyexpanded_array[train_index], X_polyexpanded_array[test_index]
        y_train, y_test = y_betweenness_array[train_index], y_betweenness_array[test_index]

        # Fit on this fold's training rows; the constructor argument is the
        # lambda / alpha penalty.
        model = Lasso(lambda_value)
        model.fit(X_train,y_train)

        # Cost for this fold = sum of squared prediction errors on the held-out rows.
        y_hat = model.predict(X_test)
        errors = y_test - y_hat
        squared_errors = errors * errors
        sum_squared_errors = sum(squared_errors)

        # Alternative cost (not used): model.score(X_test, y_test) returns
        # R^2 = 1 - u/v, where u is the residual sum of squares
        # ((y_true - y_pred) ** 2).sum() and v is the total sum of squares
        # ((y_true - y_true.mean()) ** 2).sum(). Best possible score is 1.0 and
        # it can be negative; a constant model that always predicts the mean of
        # y gets 0.0.
        #r_squared = model.score(X_test,y_test)    # R_squared = 1 - (SSE/SST)
        #sse_divied_sst = 1 - r_squared

        errors_per_fold_list.append(sum_squared_errors)

    # Average cost across all folds for this lambda.
    final_lambda_cost = np.mean(errors_per_fold_list)
    error_per_lambda.append(final_lambda_cost)

In [None]:
# Rank the candidate lambdas by mean cross-validation error, best first.
# sort_values replaces the deprecated DataFrame.sort and matches the call
# already used earlier in this notebook.
lambda_error_df = pd.DataFrame({"Alpha/Lambda Value":lambda_range,"Mean Error per Fold": error_per_lambda}).sort_values(by = "Mean Error per Fold")
print(lambda_error_df.head(20))

In [None]:
# Refit the Lasso on the full data set with the chosen penalty.
# NOTE(review): best_lambda is typed in by hand — presumably read off the top
# row of the grid-search table; re-check it whenever the grid or data change.
#best_lambda =  0.32  #nbetweenness alpha
best_lambda =  10990.74  #betweenness alpha
model_final = Lasso(best_lambda)
lasso_results = model_final.fit(X_big5_and_network_polyexpanded,y_betweenness)

# One row per feature. Building the frame from named columns keeps the
# coefficients numeric; stacking the two sequences as rows and transposing
# (the previous approach) turned both columns into generic objects.
lasso_results_df = pd.DataFrame(
    {"Coeff. Estimate": lasso_results.coef_,
     "Coeff. Name": list(X_big5_and_network_polyexpanded.columns)},
    columns=["Coeff. Estimate", "Coeff. Name"])
lasso_results_df

In [None]:
for x in list(lasso_results_df["Coeff. Name"]):
    print x

In [None]:
lasso_significant_features = lasso_results_df[abs(lasso_results_df["Coeff. Estimate"]) > 0.00]
lasso_significant_features

# Now I'm going to re-run a regression using only the variables which Lasso said I should include...

## Develop the X matrix

In [None]:
X_lasso_distilled = X_big5_and_network_polyexpanded[list(lasso_significant_features["Coeff. Name"])]

In [None]:
X_lasso_distilled.head()

In [None]:
 X_big5.head()

In [None]:
X_lasso_and_big5 = pd.concat([X_big5, X_lasso_distilled], axis=1, join='inner')
X_lasso_and_big5.head()

Create function which converts model results into dataframe

In [None]:
# Hold out 30% of the users (fixed seed), build an OLS model on the training
# rows, record its metrics, then fit it and print the statsmodels summary.
# NOTE(review): this cell models X_network but files the metrics under the key
# "X_lasso_distilled", and the following cell writes to that same key again
# using X_lasso_and_big5 — confirm which design matrix / key was intended here.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_network, y_betweenness, test_size=0.3, random_state=0)
model = sm.OLS(y_train,X_train)
dictionary_of_model_general_performance_metrics["X_lasso_distilled"] = model_to_dictionary(model,X_test,y_test)
results = model.fit()
print results.summary()

In [None]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_lasso_and_big5, y_betweenness, test_size=0.3, random_state=0)
model = sm.OLS(y_train,X_train)
dictionary_of_model_general_performance_metrics["X_lasso_distilled"] = model_to_dictionary(model,X_test,y_test)
results = model.fit()
print results.summary()

In [None]:
variance_covariance_matrix = results.cov_HC0
variance_covariance_matrix.shape
variance_covariance_matrix

## Here I evaluate my lasso_model against some test data!

In [None]:
# Score the fitted model on the held-out rows: predictions, then the sum of
# squared prediction errors.
y_hat = results.predict(X_test)
X_lasso_distilled_test_ssr = sum((y_hat - y_test)**2)

# This is Jeremy's formula:
#   1 - [SSE / (n - p - 1)] / [SST / (n - 1)]
# with n = number of test rows and p = number of fitted parameters.
X_lasso_distilled_test_RSqAdj_jeremy = 1 - ((sum((y_test-y_hat)**2.0)/(len(X_test) - len(results.params) - 1))
            /(sum((y_test - np.mean(y_test))**2.0)/(len(X_test) - 1)))

# This is my formula from here: http://onlinestatbook.com/2/effect_size/images/adjusted_rsquared.gif
#   1 - (1 - R^2) * (n - 1) / (n - p - 1),  with R^2 = 1 - SSE/SST
# Algebraically this is the same expression as the formula above, so the two
# printed values should agree up to floating-point rounding.
# NOTE(review): p counts every entry of results.params; if the design matrix
# includes a constant column, the intercept is being counted as a regressor — confirm.
n = len(X_test)
p = len(results.params)
test_ssr =  X_lasso_distilled_test_ssr
test_sst =  sum((y_test - np.mean(y_test))**2.0)

print "test_ssr:", test_ssr
print "test_sst:", test_sst


test_rsq = 1 - (test_ssr/test_sst)
print "test_rsq:", test_rsq
# NOTE: rebinds `test_rsq_adj`, which the summary-table cell earlier in the
# notebook uses as a list of per-model values, to a single number.
test_rsq_adj = 1 - (((1 - test_rsq) * (n-1)) / (n-p-1))
X_lasso_distilled_test_RSqAdj_zhanna = test_rsq_adj

print "X_lasso_distilled_test_ssr:", X_lasso_distilled_test_ssr
print "X_lasso_distilled_test_RSqAdj_zhanna:", X_lasso_distilled_test_RSqAdj_zhanna
print "X_lasso_distilled_test_RSqAdj_jeremy:", X_lasso_distilled_test_RSqAdj_jeremy

In [None]:
# Clearly my R squared could be either really good or really bad...
# IT could be because my sample set is so small...

## Here I create a function to take the results summary table and turn it into a dataframe

In [None]:
peanuts = results_summary_to_dataframe(results)
peanuts

# Now if you filter for only those with a low p-value, and order by coefficient size...

Based on the results above, here are the significant coefficients:

In [None]:
# Coefficients with p-value below 0.1, largest coefficient first.
# sort_values replaces the deprecated DataFrame.sort and matches the call
# already used earlier in this notebook.
peanuts[peanuts["pvals"]<0.1].sort_values(by="coeff", ascending = False)

# Now to take the derivatives to plot the graphs

In [None]:
X_lasso_and_big5["NETWORKSIZE^1"].describe()

In [None]:
network_range = np.arange(0,1600,1)

In [None]:
# Positional copies of the coefficient estimates and of the lower / upper
# confidence bounds, in the row order of `peanuts`.
betas_list = list(peanuts["coeff"].values)
betas_list_95conf_lower = list(peanuts["conf_lower"].values)
betas_list_95conf_higher = list(peanuts["conf_higher"].values)

In [None]:
# Marginal effect of each Big Five trait on betweenness centrality as a
# function of network size:
#     d(betweenness) / d(trait) = beta_trait + beta_(trait x NETWORKSIZE) * network size
# Coefficients are looked up by feature label in `peanuts` (the same labels the
# variance/covariance cell below uses) instead of by hard-coded position, so
# the right terms are picked regardless of how many features Lasso kept or in
# which order they appear.


def _marginal_effect(trait_label, main_column, interaction_column):
    """Return main + interaction * network_range for one trait.

    trait_label        -- first-order feature label, e.g. "sEXT^1".
    main_column        -- `peanuts` column supplying the trait's own term
                          ("coeff", "conf_lower" or "conf_higher").
    interaction_column -- `peanuts` column supplying the trait x NETWORKSIZE term.
    """
    main_term = peanuts.loc[trait_label, main_column]
    interaction_term = peanuts.loc[trait_label + " x NETWORKSIZE^1", interaction_column]
    return main_term + (interaction_term * network_range)


# Actual Beta Coefficients
Chg_Betw_Centrality_from_sExt = _marginal_effect("sEXT^1", "coeff", "coeff")
Chg_Betw_Centrality_from_sNeu = _marginal_effect("sNEU^1", "coeff", "coeff")
Chg_Betw_Centrality_from_sAgr = _marginal_effect("sAGR^1", "coeff", "coeff")
Chg_Betw_Centrality_from_sCon = _marginal_effect("sCON^1", "coeff", "coeff")
Chg_Betw_Centrality_from_sOpn = _marginal_effect("sOPN^1", "coeff", "coeff")

# All lower Estimates - lower lower
Chg_Betw_Centrality_from_sExt_low_low = _marginal_effect("sEXT^1", "conf_lower", "conf_lower")
Chg_Betw_Centrality_from_sNeu_low_low = _marginal_effect("sNEU^1", "conf_lower", "conf_lower")
Chg_Betw_Centrality_from_sAgr_low_low = _marginal_effect("sAGR^1", "conf_lower", "conf_lower")
Chg_Betw_Centrality_from_sCon_low_low = _marginal_effect("sCON^1", "conf_lower", "conf_lower")
Chg_Betw_Centrality_from_sOpn_low_low = _marginal_effect("sOPN^1", "conf_lower", "conf_lower")

# All higher Estimates - higher higher
Chg_Betw_Centrality_from_sExt_high_high = _marginal_effect("sEXT^1", "conf_higher", "conf_higher")
Chg_Betw_Centrality_from_sNeu_high_high = _marginal_effect("sNEU^1", "conf_higher", "conf_higher")
Chg_Betw_Centrality_from_sAgr_high_high = _marginal_effect("sAGR^1", "conf_higher", "conf_higher")
Chg_Betw_Centrality_from_sCon_high_high = _marginal_effect("sCON^1", "conf_higher", "conf_higher")
Chg_Betw_Centrality_from_sOpn_high_high = _marginal_effect("sOPN^1", "conf_higher", "conf_higher")

# Lower on first beta, higher for second beta (the one that multiplies with network)
Chg_Betw_Centrality_from_sExt_low_high = _marginal_effect("sEXT^1", "conf_lower", "conf_higher")
Chg_Betw_Centrality_from_sNeu_low_high = _marginal_effect("sNEU^1", "conf_lower", "conf_higher")
Chg_Betw_Centrality_from_sAgr_low_high = _marginal_effect("sAGR^1", "conf_lower", "conf_higher")
Chg_Betw_Centrality_from_sCon_low_high = _marginal_effect("sCON^1", "conf_lower", "conf_higher")
Chg_Betw_Centrality_from_sOpn_low_high = _marginal_effect("sOPN^1", "conf_lower", "conf_higher")

# Higher on first beta, lower for second beta (the one that multiplies with network)
Chg_Betw_Centrality_from_sExt_high_low = _marginal_effect("sEXT^1", "conf_higher", "conf_lower")
Chg_Betw_Centrality_from_sNeu_high_low = _marginal_effect("sNEU^1", "conf_higher", "conf_lower")
Chg_Betw_Centrality_from_sAgr_high_low = _marginal_effect("sAGR^1", "conf_higher", "conf_lower")
Chg_Betw_Centrality_from_sCon_high_low = _marginal_effect("sCON^1", "conf_higher", "conf_lower")
Chg_Betw_Centrality_from_sOpn_high_low = _marginal_effect("sOPN^1", "conf_higher", "conf_lower")

In [None]:
peanuts

In [None]:
# Label both axes of the coefficient covariance matrix with the feature names
# from the results table, so entries can be looked up by name.
coefficient_labels = list(peanuts.index)
variance_covariance_matrix_df = pd.DataFrame(
    variance_covariance_matrix,
    index=coefficient_labels,
    columns=coefficient_labels,
)
variance_covariance_matrix_df

In [None]:
# This is how to account for the interactionality in creating the 95% confidence intervals.

# For each trait, pull the three covariance-matrix entries needed for the
# variance of (beta_trait + beta_interaction * network size):
#   the trait's own variance, its covariance with the trait x NETWORKSIZE
#   term, and that interaction term's variance.
# .loc[row, column] reads the same cells the chained [column][row] lookups did.

var_sExt = variance_covariance_matrix_df.loc["sEXT^1", "sEXT^1"]
cov_sExt_Net = variance_covariance_matrix_df.loc["sEXT^1 x NETWORKSIZE^1", "sEXT^1"]
var_sExtNet_sExtNet = variance_covariance_matrix_df.loc["sEXT^1 x NETWORKSIZE^1", "sEXT^1 x NETWORKSIZE^1"]

var_sNeu = variance_covariance_matrix_df.loc["sNEU^1", "sNEU^1"]
cov_sNeu_Net = variance_covariance_matrix_df.loc["sNEU^1 x NETWORKSIZE^1", "sNEU^1"]
var_sNeuNet_sNeuNet = variance_covariance_matrix_df.loc["sNEU^1 x NETWORKSIZE^1", "sNEU^1 x NETWORKSIZE^1"]

var_sAgr = variance_covariance_matrix_df.loc["sAGR^1", "sAGR^1"]
cov_sAgr_Net = variance_covariance_matrix_df.loc["sAGR^1 x NETWORKSIZE^1", "sAGR^1"]
var_sAgrNet_sAgrNet = variance_covariance_matrix_df.loc["sAGR^1 x NETWORKSIZE^1", "sAGR^1 x NETWORKSIZE^1"]

var_sCon = variance_covariance_matrix_df.loc["sCON^1", "sCON^1"]
cov_sCon_Net = variance_covariance_matrix_df.loc["sCON^1 x NETWORKSIZE^1", "sCON^1"]
var_sConNet_sConNet = variance_covariance_matrix_df.loc["sCON^1 x NETWORKSIZE^1", "sCON^1 x NETWORKSIZE^1"]

var_sOpn = variance_covariance_matrix_df.loc["sOPN^1", "sOPN^1"]
cov_sOpn_Net = variance_covariance_matrix_df.loc["sOPN^1 x NETWORKSIZE^1", "sOPN^1"]
var_sOpnNet_sOpnNet = variance_covariance_matrix_df.loc["sOPN^1 x NETWORKSIZE^1", "sOPN^1 x NETWORKSIZE^1"]

In [None]:
z_95Percent = 1.96


def _ci_half_width(var_main, cov_main_interaction, var_interaction):
    """Half-width of the 95% band around (beta_main + beta_interaction * N).

    The variance of that sum is var_main + 2*N*cov + N^2*var_interaction,
    evaluated for every network size N in network_range.
    """
    return z_95Percent * np.sqrt((var_main + (2 * network_range * cov_main_interaction) + ((network_range ** 2) * var_interaction)))


# Upper / lower 95% bands for each trait's marginal effect.
half_width_sExt = _ci_half_width(var_sExt, cov_sExt_Net, var_sExtNet_sExtNet)
Chg_Betw_Centrality_from_sExt_95_plus = Chg_Betw_Centrality_from_sExt + half_width_sExt
Chg_Betw_Centrality_from_sExt_95_minus = Chg_Betw_Centrality_from_sExt - half_width_sExt

half_width_sNeu = _ci_half_width(var_sNeu, cov_sNeu_Net, var_sNeuNet_sNeuNet)
Chg_Betw_Centrality_from_sNeu_95_plus = Chg_Betw_Centrality_from_sNeu + half_width_sNeu
Chg_Betw_Centrality_from_sNeu_95_minus = Chg_Betw_Centrality_from_sNeu - half_width_sNeu

half_width_sAgr = _ci_half_width(var_sAgr, cov_sAgr_Net, var_sAgrNet_sAgrNet)
Chg_Betw_Centrality_from_sAgr_95_plus = Chg_Betw_Centrality_from_sAgr + half_width_sAgr
Chg_Betw_Centrality_from_sAgr_95_minus = Chg_Betw_Centrality_from_sAgr - half_width_sAgr

half_width_sCon = _ci_half_width(var_sCon, cov_sCon_Net, var_sConNet_sConNet)
Chg_Betw_Centrality_from_sCon_95_plus = Chg_Betw_Centrality_from_sCon + half_width_sCon
Chg_Betw_Centrality_from_sCon_95_minus = Chg_Betw_Centrality_from_sCon - half_width_sCon

half_width_sOpn = _ci_half_width(var_sOpn, cov_sOpn_Net, var_sOpnNet_sOpnNet)
Chg_Betw_Centrality_from_sOpn_95_plus = Chg_Betw_Centrality_from_sOpn + half_width_sOpn
Chg_Betw_Centrality_from_sOpn_95_minus = Chg_Betw_Centrality_from_sOpn - half_width_sOpn

In [None]:
# Collect the point-estimate marginal-effect curves into one frame, one row per
# network size in network_range. The *_low_low / *_high_high / *_low_high /
# *_high_low variants computed earlier are intentionally not included.
marginal_effects_on_betweenness_df = pd.DataFrame({
                                                    "network_range": network_range,
                                                    "Chg_Betw_Centrality_from_sExt":Chg_Betw_Centrality_from_sExt,
                                                    "Chg_Betw_Centrality_from_sNeu":Chg_Betw_Centrality_from_sNeu,
                                                    "Chg_Betw_Centrality_from_sAgr":Chg_Betw_Centrality_from_sAgr,
                                                    "Chg_Betw_Centrality_from_sCon":Chg_Betw_Centrality_from_sCon,
                                                    "Chg_Betw_Centrality_from_sOpn":Chg_Betw_Centrality_from_sOpn,

    })

marginal_effects_on_betweenness_df.head()

In [None]:
import graphlab as gl

In [None]:
# Point GraphLab Canvas at the notebook *before* the first .show() call.
# Previously set_target() ran after .show(), so that first visualization was
# sent to whatever Canvas target happened to be active at the time.
# https://dato.com/products/create/docs/graphlab.canvas.html
gl.canvas.set_target('ipynb')
#gl.canvas.set_target('browser')

# Wrap the pandas marginal-effects frame in an SFrame so Canvas can display it.
sf_marginal_effects_on_betweenness_df = gl.SFrame(data=marginal_effects_on_betweenness_df)
sf_marginal_effects_on_betweenness_df.show()
#show(view="Summary")

In [None]:
# Open the Canvas "Summary" view of the marginal-effects SFrame.
sf_marginal_effects_on_betweenness_df.show(view="Summary")

In [None]:
# https://dato.com/products/create/docs/generated/graphlab.SFrame.show.html
# One Canvas scatter plot per trait column, always with network_range on the
# x axis; the columns are visited in the same order as before.
for effect_column in (
    "Chg_Betw_Centrality_from_sAgr",
    "Chg_Betw_Centrality_from_sCon",
    "Chg_Betw_Centrality_from_sExt",
    "Chg_Betw_Centrality_from_sNeu",
    "Chg_Betw_Centrality_from_sOpn",
):
    sf_marginal_effects_on_betweenness_df.show(
        view="Scatter Plot",
        x="network_range",
        y=effect_column,
    )
# Other useful: https://dato.com/learn/userguide/timeseries/timeseries-data.html

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt # side-stepping mpl backend
import matplotlib.gridspec as gridspec # subplots

In [None]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls

# Doing the charts for sAgr

In [None]:
# One 20x10 figure for the sAgr plot. The earlier version called plt.figure()
# twice, which left an empty extra figure and bound fig3 to that empty one.
fig3 = plt.figure(figsize = (20,10))

# plt.plot returns a list of lines; "l1, = ..." unpacks the single Line2D so
# the handle can be passed to the legend below.
l1, = plt.plot(network_range, y__sAgr,'rs-.')
# Baseline at zero. The x values are passed explicitly: without them matplotlib
# plots against 0..N-1, which need not line up with network_range.
l2, = plt.plot(network_range, np.zeros(len(network_range)))

l_95_minus, = plt.plot(network_range,Chg_Betw_Centrality_from_sAgr_95_minus)
l_95_plus,= plt.plot(network_range,Chg_Betw_Centrality_from_sAgr_95_plus)

#l4, = plt.plot(x, np.exp(-x) * np.sin(2 * np.pi * x), 'rs-.')

# Legend for the specific lines, labelled after the series that were plotted.
plt.legend(
    [l1, l2, l_95_minus, l_95_plus],
    ['y__sAgr', 'zero baseline',
     'Chg_Betw_Centrality_from_sAgr_95_minus',
     'Chg_Betw_Centrality_from_sAgr_95_plus'],
)

plt.xlabel('Network')
plt.ylabel('Ch. in Bet. Centrality')
plt.title('Change in Betweenness Centrality due to sAgr as Network Changes');


In [None]:
# One 20x10 figure for the sCon plot. The earlier version called plt.figure()
# twice, which left an empty extra figure and bound fig3 to that empty one.
fig3 = plt.figure(figsize = (20,10))

# plt.plot returns a list of lines; "l1, = ..." unpacks the single Line2D so
# the handle can be passed to the legend below.
l1, = plt.plot(network_range, y__sCon,'rs-.')
# Baseline at zero. The x values are passed explicitly: without them matplotlib
# plots against 0..N-1, which need not line up with network_range.
l2, = plt.plot(network_range, np.zeros(len(network_range)))

l_95_minus, = plt.plot(network_range,Chg_Betw_Centrality_from_sCon_95_minus)
l_95_plus,= plt.plot(network_range,Chg_Betw_Centrality_from_sCon_95_plus)

# Legend for the specific lines, labelled after the series that were plotted.
plt.legend(
    [l1, l2, l_95_minus, l_95_plus],
    ['y__sCon', 'zero baseline',
     'Chg_Betw_Centrality_from_sCon_95_minus',
     'Chg_Betw_Centrality_from_sCon_95_plus'],
)

plt.xlabel('Network')
plt.ylabel('Ch. in Bet. Centrality')
plt.title('Change in Betweenness Centrality due to sCon as Network Changes');

In [None]:

# One 20x10 figure for the sExt plot. The earlier version called plt.figure()
# twice, which left an empty extra figure and bound fig3 to that empty one.
fig3 = plt.figure(figsize = (20,10))

# plt.plot returns a list of lines; "l1, = ..." unpacks the single Line2D so
# the handle can be passed to the legend below.
l1, = plt.plot(network_range, y__sExt,'rs-.')
# Baseline at zero. The x values are passed explicitly: without them matplotlib
# plots against 0..N-1, which need not line up with network_range.
l2, = plt.plot(network_range, np.zeros(len(network_range)))

l_95_minus, = plt.plot(network_range,Chg_Betw_Centrality_from_sExt_95_minus)
l_95_plus,= plt.plot(network_range,Chg_Betw_Centrality_from_sExt_95_plus)

# Legend for the specific lines, labelled after the series that were plotted.
plt.legend(
    [l1, l2, l_95_minus, l_95_plus],
    ['y__sExt', 'zero baseline',
     'Chg_Betw_Centrality_from_sExt_95_minus',
     'Chg_Betw_Centrality_from_sExt_95_plus'],
)

plt.xlabel('Network')
plt.ylabel('Ch. in Bet. Centrality')
plt.title('Change in Betweenness Centrality due to sExt as Network Changes');


In [None]:
# One 20x10 figure for the sNeu plot. The earlier version called plt.figure()
# twice, which left an empty extra figure and bound fig3 to that empty one.
fig3 = plt.figure(figsize = (20,10))

# plt.plot returns a list of lines; "l1, = ..." unpacks the single Line2D so
# the handle can be passed to the legend below.
l1, = plt.plot(network_range, y__sNeu,'rs-.')
# Baseline at zero. The x values are passed explicitly: without them matplotlib
# plots against 0..N-1, which need not line up with network_range.
l2, = plt.plot(network_range, np.zeros(len(network_range)))

l_95_minus, = plt.plot(network_range,Chg_Betw_Centrality_from_sNeu_95_minus)
l_95_plus,= plt.plot(network_range,Chg_Betw_Centrality_from_sNeu_95_plus)

# Legend for the specific lines, labelled after the series that were plotted.
plt.legend(
    [l1, l2, l_95_minus, l_95_plus],
    ['y__sNeu', 'zero baseline',
     'Chg_Betw_Centrality_from_sNeu_95_minus',
     'Chg_Betw_Centrality_from_sNeu_95_plus'],
)

plt.xlabel('Network')
plt.ylabel('Ch. in Bet. Centrality')
plt.title('Change in Betweenness Centrality due to sNeu as Network Changes');

In [None]:

# One 20x10 figure for the sOpn plot. The earlier version called plt.figure()
# twice, which left an empty extra figure and bound fig3 to that empty one.
fig3 = plt.figure(figsize = (20,10))

# plt.plot returns a list of lines; "l1, = ..." unpacks the single Line2D so
# the handle can be passed to the legend below.
l1, = plt.plot(network_range, y__sOpn,'rs-.')
# Baseline at zero. The x values are passed explicitly: without them matplotlib
# plots against 0..N-1, which need not line up with network_range.
l2, = plt.plot(network_range, np.zeros(len(network_range)))

l_95_minus, = plt.plot(network_range,Chg_Betw_Centrality_from_sOpn_95_minus)
l_95_plus,= plt.plot(network_range,Chg_Betw_Centrality_from_sOpn_95_plus)

# Legend for the specific lines, labelled after the series that were plotted.
plt.legend(
    [l1, l2, l_95_minus, l_95_plus],
    ['y__sOpn', 'zero baseline',
     'Chg_Betw_Centrality_from_sOpn_95_minus',
     'Chg_Betw_Centrality_from_sOpn_95_plus'],
)

plt.xlabel('Network')
plt.ylabel('Ch. in Bet. Centrality')
plt.title('Change in Betweenness Centrality due to sOpn as Network Changes');


In [None]:
# Summary statistics for the X_big5_and_network frame.
X_big5_and_network.describe()

# Reflection on results

We find the biggest 

Neuroticism and Openness Combined -- Tend to greatly increase

Extroversion & Conscientiousness Combined --- interesting that in one version the effect is positive, but too much extroversion and conscientiousness leads to a negative effect.

Agreeableness -- Tends to increase the value...but why to the 4th power?

What's with the (Agreeableness + Openness) combo and why do they tend to make negative results?

### Step 4: Against the CV set, compare the cost functions for each of the models and choose which model is the best (predict... and then get the difference) + Also compare the adjusted R squared for all of them

### Step 5: Once the model is chosen, retrain that model using the combination of CV + Training

### Step 6: Test your final model against your test set.

In [None]:
Diagnostic Tool #1:

https://theclevermachine.files.wordpress.com/2013/04/bias-variance-train-test-error.png

In [None]:
Diagnostic Tool #2
### Step 7 - If you increase the size of your training set (from 0 to the full amount)
http://www.bigdataexaminer.com/wp-content/uploads/2014/11/code-9.png

# Analysis & Interpretations

# Next Steps

In [None]:
Add sentiment analysis stuff
# Create Repository called luther (?)

# Extract out #auth & status updates

# Generate some "word usage" variable per person (collapse all statuses into a single metric per person)

# Add that metric to your model
# See if that's representative for the centrality metrics.

In [None]:
#) Identify the profile for the "lowest" network users based on profile. and highest -- display the two types of profiles created.

Next Steps:
    - K means of dataset/users / networks
    - Sentiment analysis as an additional feature (word2vec?)
    - Considering alternate network theory variables (e.g. centrality, etc...)

In [None]:
import graphlab as gl

In [None]:
#df = pandas.DataFrame()
# Build a GraphLab SFrame from the per-author frame df_grouped.
sf_grouped = gl.SFrame(data=df_grouped)

In [None]:
# Display the SFrame via the notebook's rich repr.
sf_grouped

In [None]:
# Scratch calculation with hand-entered numbers.
# NOTE(review): the source of 8399.66 and 86.33 is not shown in this notebook
# section; possibly a betweenness / network-size ratio. Confirm, or replace
# with a computed value.
8399.66/86.33

In [None]:
# Scratch calculation with hand-entered numbers.
# NOTE(review): the source of 8096.6 and 92.24 is not shown in this notebook
# section; possibly a betweenness / network-size ratio. Confirm, or replace
# with a computed value.
8096.6/92.24

In [None]:
# Scratch calculation with hand-entered numbers.
# NOTE(review): the source of 18123.1 and 97.81 is not shown in this notebook
# section; possibly a betweenness / network-size ratio. Confirm, or replace
# with a computed value.
18123.1 / 97.81 

In [None]:
# Send the grouped SFrame to GraphLab Canvas, then ask Canvas to show itself.
# NOTE(review): where this renders depends on the Canvas target set earlier in
# the session (gl.canvas.set_target) - confirm it is set before this cell runs.
sf_grouped.show()
gl.canvas.show()

In [None]:
Ask Jeremey
-- Would it make sense to divide betweenness by network size?