# Predictive Validity Main Result demo

This notebook reproduces the main result behind the central hypothesis: BLTs outperform questionnaire-based factors at predicting social and demographic variables (e.g., IQ, income, Likes), while questionnaire-based factors outperform BLTs on psychological outcomes (e.g., SWL, CESD, Big5 questions).

In [1]:
import pandas as pd
import numpy as np

In [2]:
def print_summary_likes(filename, num_likes=100):
    """Summarise the ``LV`` column across a numbered series of CSV files.

    ``filename`` is a format template with one ``{}`` placeholder. It is
    filled with every index in ``0 .. num_likes - 1`` and each resulting CSV
    is read; for every file the mean and the standard error of the mean
    (SEM) of its ``LV`` column are computed.

    Returns a two-element list ``[mean of per-file means, mean of per-file
    SEMs]``, both rounded to 3 decimals. Despite the name, nothing is printed.
    """
    def _file_stats(index):
        # Mean and SEM of the LV column of one file.
        lv = pd.read_csv(filename.format(index)).LV
        return lv.mean(), lv.sem()

    per_file = [_file_stats(index) for index in np.arange(0, num_likes)]
    # The 'Std' column stores Series.sem() values, i.e. standard errors.
    summary = pd.DataFrame.from_records(per_file, columns=['Mean', 'Std'])
    avg_mean = np.round(summary.Mean.mean(), 3)
    avg_sem = np.round(summary.Std.mean(), 3)
    return [avg_mean, avg_sem]

In [3]:
def analyze_likes(filename, num_likes=100):
    """Return per-file ``LV`` statistics for a numbered series of CSV files.

    ``filename`` is a format template with one ``{}`` placeholder, filled
    with each index in ``0 .. num_likes - 1``.

    Returns a DataFrame with one row per file and two columns: ``Mean`` (mean
    of ``LV``) and ``Std`` (which holds ``LV.sem()``, the standard error of
    the mean, not a standard deviation).
    """
    rows = []
    for idx in np.arange(0, num_likes):
        lv_values = pd.read_csv(filename.format(idx)).LV
        rows.append((lv_values.mean(), lv_values.sem()))
    return pd.DataFrame.from_records(rows, columns=['Mean', 'Std'])

In [4]:
def analyze_questions(filename, qno=21, last_q=100):
    """Return per-question ``LV`` statistics for a range of question files.

    Parameters
    ----------
    filename : str
        Format template with one ``{}`` placeholder, filled with each
        question index before the CSV is read.
    qno : int, default 21
        First question index to read (inclusive).
    last_q : int, default 100
        Last question index to read (inclusive). This bound used to be
        hard-coded; the default keeps the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per question with columns ``Q`` (question index), ``Mean``
        (mean of ``LV``) and ``Std`` (``LV.sem()``, i.e. the standard error
        of the mean, not a standard deviation).
    """
    records = []
    for i in np.arange(qno, last_q + 1):
        lv = pd.read_csv(filename.format(i)).LV
        records.append((i, lv.mean(), lv.sem()))
    # from_records is a classmethod; no throwaway DataFrame instance is needed.
    return pd.DataFrame.from_records(records, columns=['Q', 'Mean', 'Std'])

In [5]:
def print_summary_questions(filename, qno=21, last_q=100):
    """Average per-question ``LV`` statistics over a range of question files.

    Parameters
    ----------
    filename : str
        Format template with one ``{}`` placeholder, filled with each
        question index before the CSV is read.
    qno : int, default 21
        First question index to read (inclusive).
    last_q : int, default 100
        Last question index to read (inclusive). This bound used to be
        hard-coded; the default keeps the previous behaviour.

    Returns
    -------
    list
        ``[mean of per-file LV means, mean of per-file LV SEMs]``. Values are
        not rounded. Despite the name, nothing is printed.
    """
    records = []
    for i in np.arange(qno, last_q + 1):
        lv = pd.read_csv(filename.format(i)).LV
        records.append((lv.mean(), lv.sem()))
    # 'Std' holds Series.sem() values (standard errors), not standard deviations.
    df_record = pd.DataFrame.from_records(records, columns=['Mean', 'Std'])
    return [df_record.Mean.mean(), df_record.Std.mean()]

In [6]:
def print_summary_questions_new(filename, qno=1, num_questions=5):
    """Average per-question ``LV`` statistics over question files.

    ``filename`` is a format template with one ``{}`` placeholder. Indices
    ``qno .. num_questions`` (both inclusive) are read, so ``num_questions``
    acts as the last index; it equals the number of files only when
    ``qno == 1``.

    Returns ``[mean of per-file LV means, mean of per-file LV SEMs]``,
    unrounded. Despite the name, nothing is printed.
    """
    def _stats(question):
        lv = pd.read_csv(filename.format(question)).LV
        return lv.mean(), lv.sem()

    stop = num_questions + 1
    per_question = [_stats(question) for question in np.arange(qno, stop)]
    # The 'Std' column stores standard errors (Series.sem()).
    table = pd.DataFrame.from_records(per_question, columns=['Mean', 'Std'])
    return [table.Mean.mean(), table.Std.mean()]

In [7]:
# Root directory of the per-method result folders. Paths in later cells are
# built with plain string concatenation (home_dir + '...'), so the trailing
# slash is required.
# NOTE(review): absolute, machine-specific path; adjust before re-running elsewhere.
home_dir="/hlab1/vvkulkarni/psychology/plos_data_private/predictive/"

In [8]:
# Accumulator for the Likes table: the following cells append one
# [label, mean, sem] row per method. Reset it here before re-running them.
records=[]

In [9]:
# Result-folder names iterated by the next cell.
# Presumably the questionnaire-based (Big Five) factor sets; confirm.
methods=['big5atleast1000','big5_10_atleast1000', 'big5_30_atleast1000']
# Method name -> number used as the `factor` suffix in the row label.
big5_method_num_map={'big5atleast1000':5, 'big5_10_atleast1000':10, 'big5_30_atleast1000':30}

In [10]:
# One row per entry in `methods`: label plus the averaged mean/SEM over the
# 20 like files (indices 0-19).
# NOTE: this appends to the notebook-global `records`; re-running the cell
# without resetting `records` first duplicates rows.
for method in methods:
    factor=big5_method_num_map[method]
    # Path prefix; the '{}' left in `filename` is filled with the file index
    # inside print_summary_likes.
    g = home_dir + '{}/grid_search_multiple_runs/likes_interpretable20_private/{}_superlike_interpretable_'.format(method, method) 
    filename = g + '{}_logistic.csv'
    records.append(['{}_{}'.format(method, factor)] + print_summary_likes(filename, num_likes=20))

In [11]:
# Same Likes summary for the 'fa_...' method at 5, 10 and 30 factors.
# Note that `methods` is rebound here, replacing the list from the earlier cell.
methods=['fa_not_residualized_ngrams_usa_home_current_null_standardized']
factors = [5, 10, 30]
for method in methods:
    for factor in factors:
        # Unlike the big5 folders, the factor count is part of the file name.
        g = home_dir + '{}/grid_search_multiple_runs/likes_interpretable20_private/{}_{}_superlike_interpretable_'.format(method, method, factor) 
        filename = g + '{}_logistic.csv'
        # Appends to the notebook-global `records`; re-running duplicates rows.
        records.append(['{}_{}'.format(method, factor)] + print_summary_likes(filename, num_likes=20))

In [12]:
# Tabulate the accumulated Likes rows. 'Std' holds the averaged standard
# error of the mean (from Series.sem()), not a standard deviation.
df = pd.DataFrame().from_records(records, columns=['Method', 'Mean', 'Std'])

In [13]:
# Display the Likes summary table (one row per method/factor-count).
df

Unnamed: 0,Method,Mean,Std
0,big5atleast1000_5,0.525,0.001
1,big5_10_atleast1000_10,0.532,0.002
2,big5_30_atleast1000_30,0.553,0.002
3,fa_not_residualized_ngrams_usa_home_current_nu...,0.601,0.002
4,fa_not_residualized_ngrams_usa_home_current_nu...,0.634,0.002
5,fa_not_residualized_ngrams_usa_home_current_nu...,0.649,0.002


In [14]:
# Start a fresh accumulator for the question-level table so rows from the
# Likes section are not carried over.
records  = []

In [15]:
# Re-establish the big5 method list; an earlier cell rebound `methods` to the
# 'fa_...' method only.
methods=['big5atleast1000','big5_10_atleast1000', 'big5_30_atleast1000']
# Method name -> number used as the `factor` suffix in the row label.
big5_method_num_map={'big5atleast1000':5, 'big5_10_atleast1000':10, 'big5_30_atleast1000':30}

In [16]:
# One row per big5 method: label plus the averaged mean/SEM over the Ridge
# question files, using print_summary_questions' default range (21-100 inclusive).
# NOTE: this appends to the notebook-global `records`; re-running duplicates rows.
for method in methods:
    factor=big5_method_num_map[method]
    # Path prefix; the '{}' left in `filename` is filled with the question index.
    g = home_dir + '{}/grid_search_multiple_runs/questions_private/{}_big5_domains_item_level_q_'.format(method,method) 
    filename = g + '{}_Ridge.csv'
    records.append(['{}_{}'.format(method, factor)] + print_summary_questions(filename))

In [17]:
# Same question-level summary for the 'fa_...' method at 5, 10 and 30 factors.
# Note that `methods` is rebound here, replacing the big5 list.
methods=['fa_not_residualized_ngrams_usa_home_current_null_standardized']
factors = [5, 10, 30]
for method in methods:
    for factor in factors:
        # Unlike the big5 folders, the factor count is part of the file name.
        g = home_dir + '{}/grid_search_multiple_runs/questions_private/{}_{}_big5_domains_item_level_q_'.format(method,method, factor) 
        filename = g + '{}_Ridge.csv'
        # Appends to the notebook-global `records`; re-running duplicates rows.
        records.append(['{}_{}'.format(method, factor)] + print_summary_questions(filename))

In [18]:
# Tabulate the accumulated question-level rows; this rebinds `df`, replacing
# the Likes table. 'Std' holds the averaged standard error of the mean.
df = pd.DataFrame().from_records(records, columns=['Method', 'Mean', 'Std'])

In [19]:
# Display the question-level summary table (values are not rounded).
df

Unnamed: 0,Method,Mean,Std
0,big5atleast1000_5,0.179047,0.002122
1,big5_10_atleast1000_10,0.627048,0.003155
2,big5_30_atleast1000_30,0.766599,0.002199
3,fa_not_residualized_ngrams_usa_home_current_nu...,0.179851,0.00268
4,fa_not_residualized_ngrams_usa_home_current_nu...,0.192607,0.002498
5,fa_not_residualized_ngrams_usa_home_current_nu...,0.215548,0.002675


In [20]:
def print_target(filename):
    """Summarise the ``LV`` column of a single results CSV.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read; it must contain an ``LV`` column.

    Returns
    -------
    list
        ``[mean, sem]`` of ``LV``, each rounded to 3 decimals. The second
        value is the standard error of the mean (``Series.sem()``), not a
        standard deviation. Despite the name, nothing is printed.
    """
    # numpy comes from the notebook's import cell; no local re-import is needed.
    lv = pd.read_csv(filename).LV
    return [np.round(lv.mean(), 3), np.round(lv.sem(), 3)]

In [21]:
def get_df_regress_variable(variable):
    """Collect Ridge results for one target variable across all factor sets.

    For each of the three ``big5*`` methods, and for the
    ``fa_not_residualized_ngrams_usa_home_current_null_standardized`` method
    at 5, 10 and 30 factors, the matching ``*_private_Ridge.csv`` file under
    ``home_dir`` is summarised with ``print_target``.

    Returns a DataFrame with columns ``Method``, ``Mean`` and ``Std``, one
    row per file in the order above. ``Mean`` and ``Std`` are the two values
    returned by ``print_target`` for that file.
    """
    rows = []

    # big5 files carry no factor count in their name, so the bare method name
    # is used as the row label.
    for name in ('big5atleast1000', 'big5_10_atleast1000', 'big5_30_atleast1000'):
        path = home_dir + '{}/grid_search_multiple_runs/{}_{}_private_Ridge.csv'.format(name, name, variable)
        rows.append([name] + print_target(path))

    # The 'fa_...' method has one file per factor count; label rows as <method>_<k>.
    fa_name = 'fa_not_residualized_ngrams_usa_home_current_null_standardized'
    for k in (5, 10, 30):
        path = home_dir + '{}/grid_search_multiple_runs/{}_{}_{}_private_Ridge.csv'.format(fa_name, fa_name, k, variable)
        label = '{}_{}'.format(fa_name, k)
        rows.append([label] + print_target(path))

    return pd.DataFrame.from_records(rows, columns=['Method', 'Mean', 'Std'])

In [22]:
# Ridge results per method for the 'network' target.
# Presumably a social-network measure; confirm against the data files.
get_df_regress_variable('network')

Unnamed: 0,Method,Mean,Std
0,big5atleast1000,0.187,0.003
1,big5_10_atleast1000,0.209,0.006
2,big5_30_atleast1000,0.238,0.005
3,fa_not_residualized_ngrams_usa_home_current_nu...,0.119,0.003
4,fa_not_residualized_ngrams_usa_home_current_nu...,0.18,0.004
5,fa_not_residualized_ngrams_usa_home_current_nu...,0.322,0.003


In [23]:
# Ridge results per method for the 'SWL' target, one of the psychological
# outcomes named in the notebook introduction.
get_df_regress_variable('SWL')

Unnamed: 0,Method,Mean,Std
0,big5atleast1000,0.507,0.012
1,big5_10_atleast1000,0.46,0.013
2,big5_30_atleast1000,0.571,0.012
3,fa_not_residualized_ngrams_usa_home_current_nu...,0.143,0.014
4,fa_not_residualized_ngrams_usa_home_current_nu...,0.227,0.016
5,fa_not_residualized_ngrams_usa_home_current_nu...,0.255,0.01


In [24]:
# Ridge results per method for the 'iq' target, one of the social/demographic
# variables named in the notebook introduction.
get_df_regress_variable('iq')

Unnamed: 0,Method,Mean,Std
0,big5atleast1000,0.233,0.013
1,big5_10_atleast1000,0.206,0.02
2,big5_30_atleast1000,0.264,0.016
3,fa_not_residualized_ngrams_usa_home_current_nu...,0.371,0.019
4,fa_not_residualized_ngrams_usa_home_current_nu...,0.356,0.018
5,fa_not_residualized_ngrams_usa_home_current_nu...,0.442,0.013


In [25]:
# Ridge results per method for the 'cesd' target, one of the psychological
# outcomes named in the notebook introduction.
get_df_regress_variable('cesd')

Unnamed: 0,Method,Mean,Std
0,big5atleast1000,0.462,0.02
1,big5_10_atleast1000,0.272,0.02
2,big5_30_atleast1000,0.457,0.029
3,fa_not_residualized_ngrams_usa_home_current_nu...,0.227,0.02
4,fa_not_residualized_ngrams_usa_home_current_nu...,0.136,0.025
5,fa_not_residualized_ngrams_usa_home_current_nu...,0.245,0.016


In [26]:
def get_df_income_variable_better(variable, add_age_gender=True):
    """Collect Ridge results for one target across the ``sandra_*`` result sets.

    Summarises, via ``print_target``, the ``*_private_Ridge.csv`` files under
    ``home_dir`` for the two ``sandra_big5*`` methods and for the
    ``sandra_fa_not_residualized_ngrams_usa_home_current_null_standardized``
    method at 5 and 10 factors.

    NOTE: ``add_age_gender`` is accepted but never read by this function, so
    it has no effect on the result.

    Returns a DataFrame with columns ``Method``, ``Mean`` and ``Std``, one
    row per file in the order above. ``Mean`` and ``Std`` are the two values
    returned by ``print_target`` for that file.
    """
    rows = []

    # big5 files carry no factor count in their name, so the bare method name
    # is used as the row label.
    for name in ('sandra_big5atleast1000', 'sandra_big5_10_atleast1000'):
        path = home_dir + '{}/grid_search_multiple_runs/{}_{}_private_Ridge.csv'.format(name, name, variable)
        rows.append([name] + print_target(path))

    # One file per factor count for the 'fa_...' method; label rows as <method>_<k>.
    fa_name = 'sandra_fa_not_residualized_ngrams_usa_home_current_null_standardized'
    for k in (5, 10):
        path = home_dir + '{}/grid_search_multiple_runs/{}_{}_{}_private_Ridge.csv'.format(fa_name, fa_name, k, variable)
        label = '{}_{}'.format(fa_name, k)
        rows.append([label] + print_target(path))

    return pd.DataFrame.from_records(rows, columns=['Method', 'Mean', 'Std'])

In [27]:
# Ridge results per method for the 'li_p1_gt1k' target.
# Presumably an income variable, given the function name; confirm.
# The second argument (add_age_gender=False) is ignored by the function as written.
get_df_income_variable_better('li_p1_gt1k', False)

Unnamed: 0,Method,Mean,Std
0,sandra_big5atleast1000,0.141,0.042
1,sandra_big5_10_atleast1000,0.061,0.05
2,sandra_fa_not_residualized_ngrams_usa_home_cur...,0.363,0.01
3,sandra_fa_not_residualized_ngrams_usa_home_cur...,0.352,0.01
