This notebook provides a summary of the predictive analyses using task or survey data to predict a set of factors based on demographic/health measures.  

#### Provenance:
- data generated for each run using behav_prediction.py via Singularity container
- multiple runs generated on lonestar5 using singularity_analyses/mk_singularity_script_factor.py
- individual data files combined on ls5 using singularity_analyses/ls5/check_completion.py which generates singularity_analyses/ls5/lasso_data.pkl (which is copied to mac for next step)
- data structures further collapsed using export_data_for_R_factor.py



In [2]:
import os,glob,sys
import pickle
import numpy,pandas
pandas.options.display.max_colwidth = 0
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display, HTML
import seaborn as sns
import scipy.stats
from statsmodels.sandbox.stats.multicomp import multipletests
%load_ext rpy2.ipython
from scipy.cluster.hierarchy import dendrogram,ward,cut_tree,leaves_list
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler

import selfregulation.prediction.behavpredict as behavpredict
from prediction_notebook_utils import get_pval, get_importances,get_importance_list,plotvars

# Classifier whose collapsed results are loaded below.
clf='lasso'
# Load the accuracy scores (acc) and feature importances (features) for all datasets.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('singularity_analyses/ls5/%s_data_collapsed.pkl'%clf,'rb') as f:
    acc,features=pickle.load(f)
cont_measure='r2' # use r^2 or MAE for non-binary variables

ImportError: dlopen(/Users/poldrack/anaconda3/envs/py36/lib/python3.6/site-packages/rpy2/rinterface/_rinterface.cpython-36m-darwin.so, 2): Library not loaded: @rpath/libiconv.2.dylib
  Referenced from: /Users/poldrack/anaconda3/envs/py36/lib/python3.6/site-packages/rpy2/rinterface/_rinterface.cpython-36m-darwin.so
  Reason: Incompatible library version: _rinterface.cpython-36m-darwin.so requires version 9.0.0 or later, but libiconv.2.dylib provides version 8.0.0

Check all variables to make sure they have the correct number of observations (120), and create tables summarizing data.

In [None]:

# Collect the datasets that have results and the variables predicted in them.
allvars={}
datasets=[]
for k in acc.keys():
    if len(acc[k])==0:
        print('no data for',k)
        continue
    datasets.append(k)
    for v in acc[k][cont_measure]['scores_cv']:
        if v=='tmp':
            continue
        # map each variable to the accuracy measure used for it
        allvars[v]=cont_measure

alldata={'r2':pandas.DataFrame(),'MAE':pandas.DataFrame(),
        'r2_pval':pandas.DataFrame()}
target_n={}
goodcount={}
for d in datasets:
    # Guard on the dataset actually being processed; the loop variable k left
    # over from the loop above is stale at this point.
    if len(acc[d])==0:
        print('no data for',d)
        continue

    goodcount[d]={}
    target_n[d]=120  # expected number of CV runs per variable
    examplefeature=list(features[d].keys())[0]
    print(d,features[d][examplefeature].shape[1])

    # count the finite CV scores per variable and report any that fall short
    for v in acc[d]['r2']['scores_cv']:
        if not v in acc[d][allvars[v]]['scores_cv']:
            goodcount[d][v]=0
        else:
            goodcount[d][v]=numpy.isfinite(acc[d][allvars[v]]['scores_cv'][v]).sum()
        if goodcount[d][v]<target_n[d]:
            print(d,v,goodcount[d][v],features[d][v].shape[1])

# Summary tables: mean CV score per variable (rows) and dataset (columns).
# Built in one step rather than appending one row at a time (DataFrame.append
# was removed in pandas 2.0); rows follow the order of allvars.
for measure in ('r2','MAE'):
    means=pandas.DataFrame({k:acc[k][measure]['scores_cv'].mean()
                            for k in datasets if measure in acc[k]})
    alldata[measure]=means.reindex(list(allvars))
   


In [None]:
# Show the keys of the survey feature dict (presumably one entry per predicted variable - TODO confirm)
features['survey'].keys()

Compute p values

In [None]:
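# NOTE: the factors were exported from R in the wrong order.
# A renaming dict (factor_renaming_dict) was meant to correct this,
# but no such dict is defined in this notebook.
# TODO: fix the factor order in the final run.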


def get_pval(target,null,allvars,datasets,acc,verbose=False):
    """Compare cross-validated scores of a target dataset against a null dataset.

    For every variable in ``allvars`` (except 'tmp', in sorted order) the mean
    CV score of ``target`` is located within the distribution of CV scores of
    ``null``; the uncorrected p-value is one minus that percentile.

    Parameters
    ----------
    target, null : keys into ``acc`` for the real and the null dataset
    allvars : dict mapping variable name -> accuracy measure name
    datasets : unused; kept for interface compatibility
    acc : nested dict, acc[dataset][measure]['scores_cv'] and
          acc[dataset][measure]['scores_insample_unbiased'] indexed by variable
    verbose : if True, print each (target, null, variable) triple

    Returns
    -------
    pandas.DataFrame indexed by variable with columns Measure, Target mean,
    Null Mean, Effect size, In-sample and p_unc. Variables missing from either
    dataset get NaN in every column except Measure.
    """
    varnames=sorted(name for name in allvars.keys() if name!='tmp')
    rows=[]
    for name in varnames:
        if verbose:
            print(target,null,name)
        measure=allvars[name]
        target_scores=acc[target][measure]['scores_cv']
        if not (name in target_scores and name in acc[null][measure]['scores_cv']):
            rows.append([measure]+[numpy.nan]*5)
            continue
        target_dist=target_scores[name].dropna()
        null_dist=acc[null][measure]['scores_cv'][name].dropna()
        target_mean=target_dist.mean()
        null_mean=null_dist.mean()
        pval=1-scipy.stats.percentileofscore(null_dist,target_mean)/100.
        # effect size is the raw mean difference (not standardized); it is only
        # reported when the target scores actually vary
        effect=target_mean-null_mean if target_dist.std()>0 else numpy.nan
        insample=acc[target][measure]['scores_insample_unbiased'][name].mean()
        rows.append([measure,target_mean,null_mean,effect,insample,pval])
    # (disabled) renaming of the index via factor_renaming_dict
    return pandas.DataFrame(rows,index=varnames,
                            columns=['Measure','Target mean','Null Mean',
                                     'Effect size','In-sample','p_unc'])



# Compare each real dataset against its shuffled (null) counterpart.
pvals={}
for d in datasets:
    if d.find('shuffle')>-1 or len(acc[d])==0:
        continue
    null=d+'_shuffle'
    # without results for the shuffled dataset there is no null distribution
    if null not in acc or len(acc[null])==0:
        print('no shuffled null for',d)
        continue
    print(d)
    pvals[(d,null)]=get_pval(d,null,allvars,datasets,acc)

# Multiple-comparison correction within each comparison.
# multipletests defaults to Holm-Sidak ('hs'); the column is named and used as
# an FDR-corrected p-value, so Benjamini-Hochberg is requested explicitly.
# NaN p-values (variables missing from either dataset) are left out of the
# correction so they neither inflate the test count nor propagate NaN.
pvals_fdr={}  # unused; retained so the notebook namespace is unchanged
for k in pvals:
    tested=pvals[k]['p_unc'].notnull()
    pvals[k]['p_fdr']=numpy.nan
    if tested.any():
        pvals[k].loc[tested,'p_fdr']=multipletests(pvals[k].loc[tested,'p_unc'],
                                                   method='fdr_bh')[1]

## Example of overfitting

In [None]:
# Scatter of out-of-sample vs. in-sample R^2 for the survey predictions
k=('survey','survey_shuffle')
r2data=pvals[k].query('Measure == "r2"')
fig=plt.figure(figsize=(8,8))
ax=plt.gca()
ax.scatter(r2data['Target mean'],r2data['In-sample'])
ax.plot([0,1],[0,1])  # identity line
ax.axis([0,0.1,0,0.1])
ax.set_xlabel('Out-of-sample R^2',fontsize=20)
ax.set_ylabel('In-sample R^2',fontsize=20)
fig.savefig('overfitting.png',dpi=300)

## Show variables with greater prediction for survey vs. baseline

In [None]:
# Threshold on corrected p-values and container for the significant results
pthresh=0.05
sigp={}
plot_sep_vars=False
k=('survey','survey_shuffle')
# keep variables passing the threshold, largest effect first
fdr_query='p_fdr <= %f'% pthresh
ranked=pvals[k].query(fdr_query).sort_values(by='Effect size',ascending=False)
# attach the feature-importance summary for the survey dataset
imp=get_importance_list(ranked,k[0],features)
sigp[k]=ranked.join(imp)
display(sigp[k])
if plot_sep_vars:
    for v in sigp[k].index:
        plotvars(v,pvals,datasets,allvars)

In [None]:
# Print the importances (called with 10) for each significant survey-predicted variable
k=('survey','survey_shuffle')
dataset=k[0]
for varname in sigp[k].index:
    top_features=get_importances(varname,dataset,features,10)
    print(varname,top_features)
    print('')

## Show variables with greater prediction for task vs. baseline

In [None]:
k=('task','task_shuffle')
# keep variables passing the threshold, largest effect first
fdr_query='p_fdr <= %f'% pthresh
ranked=pvals[k].query(fdr_query).sort_values(by='Effect size',ascending=False)
# attach the feature-importance summary for the task dataset
imp=get_importance_list(ranked,k[0],features)
sigp[k]=ranked.join(imp)
display(sigp[k])
if plot_sep_vars:
    for v in sigp[k].index:
        plotvars(v,pvals,datasets,allvars)

In [None]:
# Print the importances for each significant task-predicted variable
k=('task','task_shuffle')
dataset=k[0]
for varname in sigp[k].index:
    top_features=get_importances(varname,dataset,features)
    print(varname,top_features)
    print('')

In [None]:
# Full p-value table for task vs. shuffled task. The key is set explicitly so
# the cell does not depend on whichever cell last assigned k.
k=('task','task_shuffle')
pvals[k]

### Make figure for paper

In [None]:

def plot_paper_vars(vars,pvals,datasets,allvars):
    """Draw a 2x2 grid of bar plots, one panel per variable.

    Each panel shows, per dataset, the mean CV score of the variable with the
    standard deviation across CV runs as error bar. Scores are read from the
    notebook-level ``acc`` dict.

    Parameters
    ----------
    vars : iterable of variable names; the grid has room for four
    pvals : unused; kept for interface compatibility
    datasets : dataset names to include as bars
    allvars : dict mapping variable name -> accuracy measure name

    Returns
    -------
    The matplotlib figure (the original returned None).
    """
    f, axarr = plt.subplots(2, 2,figsize=(18,16))
    xy=[[0,0],[0,1],[1,0],[1,1]]
    ctr=0
    for v in vars:
        means=[]
        errors=[]
        ds=[]
        for k in datasets:
            if not allvars[v] in acc[k]:
                continue
            if not v in acc[k][allvars[v]]['scores_cv']:
                continue
            targdist=acc[k][allvars[v]]['scores_cv'][v].dropna()
            means.append(targdist.mean())
            ds.append(k)
            errors.append(targdist.std())
        df=pandas.DataFrame({'mean':means},index=ds)
        errors=pandas.DataFrame({'mean':errors},index=ds)
        ax=axarr[xy[ctr][0],xy[ctr][1]]
        if allvars[v]=='AUROC':
            df.plot.bar(yerr=errors,legend=False,
                    ylim=(0.45,numpy.max(df.values)*1.1),
                        ax=ax,
                       title=v)
        else:
            df.plot.bar(yerr=errors,legend=False,
                        ax=ax,
                       title=v)
        if xy[ctr][0]==0:
            # top row: hide the x axis, the bottom row carries the labels
            ax.get_xaxis().set_visible(False)
        if xy[ctr][1]==0:
            # label the panel being drawn; plt.ylabel acts on the current axes,
            # which is not necessarily this panel
            ax.set_ylabel(allvars[v]+' +/- SE across CV runs')
        ctr+=1
    return f
                                                                  
# Grouped bar plot for the paper: mean CV score per dataset (x axis) and
# variable (bar colour), with the std across CV runs as error bars.
vars=list(pvals[('survey','survey_shuffle')].index)
f= plt.figure(figsize=(18,12))
ax=plt.gca()
data=None
errors=None
for v in vars:
    means=[]
    err=[]
    ds=[]
    for k in datasets:
        # shuffled (null) datasets are not shown
        if k.find('_shuffle')>-1:
            continue
        if not allvars[v] in acc[k]:
            continue
        if not v in acc[k][allvars[v]]['scores_cv']:
            continue
        targdist=acc[k][allvars[v]]['scores_cv'][v].dropna()
        means.append(targdist.mean())
        ds.append(k)
        err.append(targdist.std())
    if data is None:
        data=pandas.DataFrame({v:means},index=ds)
        errors=pandas.DataFrame({v:err},index=ds)
    else:
        # assign a Series (aligned on dataset name) rather than a DataFrame
        data[v]=pandas.Series(means,index=ds)
        errors[v]=pandas.Series(err,index=ds)

# the measure of the last variable decides the axis treatment
if allvars[v]=='AUROC':
    # the upper limit comes from the plotted table; the original read .values
    # from a plain list and raised AttributeError
    data.plot.bar(yerr=errors,legend=True,
            ylim=(0.45,numpy.nanmax(data.values)*1.1),
               title=v,ax=ax,fontsize=20)
else:
    data.plot.bar(yerr=errors,
               ax=ax,fontsize=20)
plt.ylabel('R-squared +/- SE across CV runs',fontsize=20)
plt.legend(fontsize=20)

#plot_paper_vars(figurevars,pvals,datasets,allvars)
plt.savefig('barplots.png',dpi=300)

## Show variables with greater prediction for DDM parameters vs. baseline

### drift

In [None]:
k=('drift','drift_shuffle')
# keep variables passing the threshold, largest effect first
fdr_query='p_fdr <= %f'% pthresh
ranked=pvals[k].query(fdr_query).sort_values(by='Effect size',ascending=False)
imp=get_importance_list(ranked,k[0],features)
sigp[k]=ranked.join(imp)
display(sigp[k])
# per-variable plot followed by its feature importances
for v in sigp[k].index:
    plotvars(v,pvals,datasets,allvars)
    print(v)
    importances=get_importances(v,k[0],features)
    print(importances)
    print('')


### Threshold

In [None]:
k=('thresh','thresh_shuffle')
# keep variables passing the threshold, largest effect first
fdr_query='p_fdr <= %f'% pthresh
ranked=pvals[k].query(fdr_query).sort_values(by='Effect size',ascending=False)
imp=get_importance_list(ranked,k[0],features)
sigp[k]=ranked.join(imp)
display(sigp[k])
# per-variable plot followed by its feature importances
for v in sigp[k].index:
    plotvars(v,pvals,datasets,allvars)
    print(v)
    importances=get_importances(v,k[0],features)
    print(importances)
    print('')

### Nondecision time

In [None]:
k=('nondecision','nondecision_shuffle')
# keep variables passing the threshold, largest effect first
fdr_query='p_fdr <= %f'% pthresh
ranked=pvals[k].query(fdr_query).sort_values(by='Effect size',ascending=False)
imp=get_importance_list(ranked,k[0],features)
sigp[k]=ranked.join(imp)
display(sigp[k])
# per-variable plot followed by its feature importances
for v in sigp[k].index:
    plotvars(v,pvals,datasets,allvars)
    print(v)
    importances=get_importances(v,k[0],features)
    print(importances)
    print('')

## Show variables with greater prediction for intelligence vs. baseline

In [None]:
k=('intelligence','intelligence_shuffle')
# keep variables passing the threshold, largest effect first
fdr_query='p_fdr <= %f'% pthresh
ranked=pvals[k].query(fdr_query).sort_values(by='Effect size',ascending=False)
imp=get_importance_list(ranked,k[0],features)
sigp[k]=ranked.join(imp)
display(sigp[k])
for v in sigp[k].index:
    plotvars(v,pvals,datasets,allvars)

## Show variables with greater prediction for stopping vs. baseline


In [None]:
k=('stopping','stopping_shuffle')
# keep variables passing the threshold, largest effect first
fdr_query='p_fdr <= %f'% pthresh
ranked=pvals[k].query(fdr_query).sort_values(by='Effect size',ascending=False)
imp=get_importance_list(ranked,k[0],features)
sigp[k]=ranked.join(imp)
display(sigp[k])
for v in sigp[k].index:
    plotvars(v,pvals,datasets,allvars)

## Show variables with greater prediction for discounting vs. baseline


In [None]:
k=('discounting','discounting_shuffle')
# keep variables passing the threshold, largest effect first
fdr_query='p_fdr <= %f'% pthresh
ranked=pvals[k].query(fdr_query).sort_values(by='Effect size',ascending=False)
imp=get_importance_list(ranked,k[0],features)
sigp[k]=ranked.join(imp)
display(sigp[k])
for v in sigp[k].index:
    plotvars(v,pvals,datasets,allvars)

### Assess survey variables in terms of their overall predictive utility


In [None]:
# Summarize how often each survey feature receives a non-zero weight
k=('survey','baseline')
survey_features=features['survey']
df=pandas.DataFrame()
absfeat=pandas.DataFrame()

for v in survey_features:
    weights=survey_features[v]
    df[v]=weights.mean(0)                  # mean weight per feature
    absfeat[v]=(weights.abs()>0).mean()    # proportion of non-zero weights per feature

mean_imp=df.mean(1)
nonzero_rate=absfeat.mean(1)
meanabs_survey=pandas.DataFrame({'meanabs':nonzero_rate}).sort_values(by='meanabs',ascending=False)


In [None]:
# Summarize how often each task feature receives a non-zero weight
task_features=features['task']
df=pandas.DataFrame()
absfeat=pandas.DataFrame()

for v in task_features:
    weights=task_features[v]
    df[v]=weights.mean(0)                  # mean weight per feature
    absfeat[v]=(weights.abs()>0).mean()    # proportion of non-zero weights per feature

mean_imp=df.mean(1)
nonzero_rate=absfeat.mean(1)
meanabs_task=pandas.DataFrame({'meanabs':nonzero_rate}).sort_values(by='meanabs',ascending=False)


## Visualize structure of demographic target variables

In [None]:
# Variables passed to BehavPredict as skip_vars
excluded_targets=['RetirementPercentStocks',
                  'HowOftenFailedActivitiesDrinking',
                  'HowOftenGuiltRemorseDrinking',
                  'AlcoholHowOften6Drinks']
bp=behavpredict.BehavPredict(verbose=True,
                             drop_na_thresh=100,
                             skip_vars=excluded_targets,
                             add_baseline_vars=True,
                             freq_threshold=0.1)
bp.load_demog_data()
bp.get_demogdata_vartypes()


In [None]:
# Keep only demographic variables that have features for task or survey
demogdata=bp.demogdata.copy()
for i in list(demogdata.columns):
    if not (i in features['task'] or i in features['survey']):
        del demogdata[i]
        print('removing',i)

# Transpose so variables are rows, flag variables with fewer than 10 missing
# values, keep those, then transpose back.
demogdata=demogdata.T
demogdata['goodvar']=demogdata.isnull().sum(1)<10
print(demogdata.shape)
demogdata_clean=demogdata.query('goodvar==True').drop('goodvar',axis=1).T

# these are bad vars that don't have features
dropvars=['HowOftenCantStopDrinking',
          'HowOftenFailedActivitiesDrinking',
          'HowOftenGuiltRemorseDrinking',
          'AlcoholHowOften6Drinks']

for v in dropvars:
    if v not in demogdata_clean:
        continue
    del demogdata_clean[v]
    print('removing',v)
from sklearn.linear_model import LinearRegression
from fancyimpute import SimpleFill

def residualize_baseline(df):
    """Regress Age and Sex out of every other column of ``df``.

    Each remaining column is replaced by the residual of a linear regression
    on the Age and Sex columns. Columns with missing values are first filled
    with SimpleFill. Returns a new frame without Age and Sex; ``df`` itself is
    not modified.
    """
    baseline=df[['Age','Sex']]
    data=df.drop(['Age','Sex'],axis=1)
    lr=LinearRegression()
    for v in data:
        has_missing=data[v].isnull().sum()>0
        # SimpleFill needs a 2-D array, hence the added axis
        y=SimpleFill().complete(data[v].values[:,numpy.newaxis]) if has_missing else data[v]
        lr.fit(baseline,y)
        data[v]=y - lr.predict(baseline)
    return data

# residualized demographic targets used by the clustering cells
df_resid=residualize_baseline(demogdata_clean)


In [None]:
# Hierarchical clustering of the residualized targets on 1 - |Spearman r|
dthresh=2.0
dist=1-numpy.abs(df_resid.corr(method='spearman'))
# NOTE(review): ward() is given a 2-D array here; scipy documents a 2-D input
# as observation vectors, not a distance matrix - confirm this is intended
# (dthresh and matchdesc are tuned to the current output).
k=ward(numpy.triu(dist))
c=cut_tree(k,height=dthresh)
ll=leaves_list(k)

# cluster id -> member variable names / column positions, and the reverse map
matches={}
matchnums={}
clustdict={}
for i in numpy.unique(c):
    members=numpy.where(c==i)[0]
    matchnums[i]=list(members)
    matches[i]=[df_resid.columns[j] for j in members]
    for name in matches[i]:
        clustdict[name]=i

# hand-assigned labels for the clusters
matchdesc={0:'education/height/weight',1:'relationships',2:'domestic',3:'financial/coffee',
          4:'caffeine',5:'legal problems',6:'smoking',7:'alcohol use',
          8:'alcohol/drug problems',9:'mental health',10:'obesity'}

matches

In [None]:
# get top predictive features for each cluster
impdata={}
binarize_features=True
feature_thresh=1e-5
for i in matchdesc.keys():
    print(matchdesc[i])
    print(matches[i])
    df_subset=df_resid

    df_tmp=pandas.DataFrame()
    absfeat_tmp=pandas.DataFrame()

    for v in matches[i]:
        # drop variables that don't have features for both task and survey
        if not v in features['task'] or not v in features['survey']:
            print('ooop',v)
            continue
        minsize=numpy.min([features['survey'][v].shape[0],features['task'][v].shape[0]]).astype('int')
        sfeatures=features['survey'][v].copy()
        sfeatures=sfeatures.iloc[:minsize,:]
        #print(sfeatures.shape)
        tfeatures=features['task'][v].copy()
        tfeatures=tfeatures.iloc[:minsize,:]
        #print(tfeatures.shape)
        all_features=pandas.concat([sfeatures,tfeatures],axis=1) #sfeatures.join(tfeatures)
        if binarize_features:
            all_features[all_features>feature_thresh]=1
            all_features[all_features<-1*feature_thresh]=-1

        del all_features['Sex']
        del all_features['Age']
        df_tmp[v]=all_features.mean(0)
        absfeat_tmp[v]=(all_features.abs()>0).mean(0)

    mean_imp=pandas.DataFrame({'meanimp':df_tmp.mean(1),
                               'meanabs':absfeat_tmp.mean(1)})
    impdata[i]=mean_imp.sort_values(by='meanabs',ascending=False)

 
    display(impdata[i].iloc[:10,:])

In [None]:
# Dendrogram of the target variables, annotated per cluster with its label and
# its most consistently selected features.
fig=plt.figure(figsize=(14,12))
plt.subplot(1,2,1)
d=dendrogram(k,orientation='left',
             labels=list(df_resid.T.index),
             color_threshold=dthresh,leaf_font_size=12)
plt.plot([dthresh,dthresh],[0,500],'k--')  # cut height

omult=10  # multiplier converting leaf position to y coordinate
breaks=[]

# leaf positions at which a new cluster starts, plus the end position
for i in range(len(d['ivl'])):
    if i>0:
        if clustdict[d['ivl'][i]]==clustdict[d['ivl'][i-1]]:
            continue
    breaks.append(i)
breaks.append(len(d['ivl']))
# cosmetic replacements applied to feature names before display
replace_sets=[('.',':'),('_',' '),('selection optimization compensation','SOC'),
             ('theories of ','')]

for i in range(1,len(breaks)):
    cluster=clustdict[d['ivl'][breaks[i-1]]]
    cluster_imp=impdata[cluster]
    ycenter=0.5*(breaks[i]+breaks[i-1])*omult
    plt.plot([0,20],[breaks[i]*omult,breaks[i]*omult],'k--',linewidth=0.5)
    # list every feature with meanabs==1, but at least two
    nperfect=int(numpy.sum(cluster_imp['meanabs']==1))
    nfeats=nperfect if nperfect>2 else 2
    # never ask for more features than the cluster table holds
    nfeats=min(nfeats,len(cluster_imp))
    for j in range(nfeats):
        vartitle=cluster_imp.index[j]
        for r in replace_sets:
            vartitle=vartitle.replace(r[0],r[1])
        # .iloc: the Series are indexed by feature name, so integer [] access
        # relied on the deprecated positional fallback
        plt.text(-11.5,ycenter-j*8+nfeats,
             '%s (%0.2f/%0.2f)'%(vartitle,
                       cluster_imp['meanimp'].iloc[j],
                        cluster_imp['meanabs'].iloc[j]))
    plt.text(14,ycenter-4+nfeats,
             matchdesc.get(cluster,'cluster %d'%cluster),fontsize=12)

plt.savefig('dendrogram.png',dpi=300,pad_inches=8)


## Clustering on predictor loadings


looks pretty crappy

In [None]:
# Build a features x variables table of mean loadings from the exported files.
# NOTE(review): absolute, user-specific path - consider making it configurable.
surveyfiles=glob.glob('/Users/poldrack/code/Self_Regulation_Ontology/prediction_analyses/R_exports_lasso/features/survey*')
dropvars=['Age','Sex']
loadingdata={'survey':None}
include_task=False

for f in surveyfiles:
    varname=f.split('survey')[1].split('_')[1]
    # Skip the baseline variables. The original inner loop's `continue` only
    # affected that inner loop, so nothing was ever skipped.
    if varname in dropvars:
        continue
    sdata=pandas.read_csv(f).mean(0)
    if include_task:
        tf=f.replace('features/survey_','features/task_')
        if not os.path.exists(tf):
            print('skipping',varname)
            continue
        tdata=pandas.read_csv(tf).mean(0)
        # local name: the notebook-level `alldata` dict is left untouched
        loadings_mean=pandas.concat((tdata,sdata))
    else:
        loadings_mean=sdata
    if loadingdata['survey'] is None:
        loadingdata['survey']=pandas.DataFrame({varname:loadings_mean})
    else:
        loadingdata['survey'][varname]=loadings_mean

if loadingdata['survey'] is None:
    raise IOError('no survey feature files could be loaded')

# remove the baseline variables both as features (rows) and as targets (columns)
loadingdata['survey']=loadingdata['survey'].drop(dropvars,errors='ignore')
for baseline_var in dropvars:
    if baseline_var in loadingdata['survey']:
        del loadingdata['survey'][baseline_var]
# drop the '.binarized' duplicate of any variable that is also present as-is;
# named loadingvars so the notebook-level `allvars` dict is not overwritten
loadingvars=[i for i in list(loadingdata['survey'].columns) if not i.find('.binarized')>-1]
for c in loadingvars:
    if '%s.binarized'%c in loadingdata['survey']:
        del loadingdata['survey']['%s.binarized'%c]

In [None]:
# Hierarchical clustering of the target variables on their survey loadings
survey_loadings=loadingdata['survey']
dist=1-survey_loadings.corr(method='spearman')
# NOTE(review): ward() is given a 2-D array here; scipy documents a 2-D input
# as observation vectors, not a distance matrix - confirm this is intended.
k=ward(numpy.triu(dist))
c=cut_tree(k,height=dthresh)
ll=leaves_list(k)

fig=plt.figure(figsize=(14,12))
plt.subplot(1,2,1)
d=dendrogram(k,orientation='left',
             labels=list(survey_loadings.T.index),
             color_threshold=dthresh,leaf_font_size=12)

# cluster id -> member variable names / column positions, and the reverse map
matches={}
matchnums={}
clustdict={}
for i in numpy.unique(c):
    members=numpy.where(c==i)[0]
    matchnums[i]=list(members)
    matches[i]=[loadingdata['survey'].columns[j] for j in members]
    for name in matches[i]:
        clustdict[name]=i


## task-specific prediction analyses - for Aim 2 task selection

In [None]:
# Per-task prediction tables (uncorrected p-value threshold)
tasks=['attention_network_task',
       'columbia_card_task_hot', 'discount_titrate',
       'dot_pattern_expectancy',
       'kirby', 'motor_selective_stop_signal', 'stop_signal',
       'stroop',
       'threebytwo', 'tower_of_london']

for t in tasks:
    k=(t,t+'_shuffle')
    if k not in pvals:
        print('skipping',k)
        continue
    # keep variables passing the threshold, largest effect first
    ranked=pvals[k].query('p_unc <= %f'% pthresh).sort_values(by='Effect size',ascending=False)
    imp=get_importance_list(ranked,k[0],features)
    sigp[k]=ranked.join(imp)
    display(sigp[k])
    #for v in sigp[k].index:
        #plotvars(v,pvals,datasets,allvars)

## Factor analysis on outcome measures 
Exploratory - don't use this

In [None]:
%%R -i df_resid -o scores,loadings,varnames
# Exploratory factor analysis of the residualized outcome measures.

# exclude these variables from the factor analysis
dropvars <- names(df_resid) %in% c("HeightInches", "WeightPounds", "CigsPerDay") 
print(dropvars)
df <- df_resid[,!dropvars]
# varnames is requested via -o above and must therefore exist in R
varnames <- names(df)

library(psych)
# choose the number of factors (up to 16) with the lowest BIC
vss.result=VSS(df,16,fm='mle',plot=FALSE)
#print(vss.result)
nfactor=which.min(vss.result$vss.stats$BIC)
fa.result=fa(df,nfactors=nfactor,fm='mle')
loadings=fa.result$loadings
print(fa.result,cut=0.2,sort=TRUE)
scores=factor.scores(df,fa.result,method='tenBerge')$scores
#clst=iclust(df_resid)
#clst=iclust(df_resid)

In [None]:
# Label the factor scores returned by the R cell and save them.
# NOTE(review): six labels are hard-coded while the number of factors is chosen
# by BIC in the R cell - confirm they still match.
factor_labels=['smoking severity','mental illness',
               'smoking','obesity',
               'alcohol','domestic']
scores_df=pandas.DataFrame(scores,columns=factor_labels,index=df_resid.index)
scores_df.to_csv("../Data/Derived_Data/Complete_10-08-2017/factor_scores.csv")