In [1]:
from __future__ import division
import os
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
import statsmodels.api as sm
% matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn import linear_model

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: license expired

    the package numpy 1.10.2 was published on 2015-12-14,
    however the mkl license expiration date is 2015-08-05.
    You may be able to run earlier versions of numpy using your
    current license.  A new license can be purchased at: http://continuum.io
    To revert to an earlier set of conda packages, use:
    $ conda list --revisions
    ...
    $ conda install --revision <REVISION NUMBER>

    
Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: license expired

    the package numpy 1.10.2 was published on 2015-12-14,
    however the mkl license expiration date is 2015-08-05.
    You may be able to run earlier versions of numpy using your
    current license.  A new license can be purchased at: http://continuum.io
    To revert to an earlier set of conda packages, use:
    $ conda list --revisions
    ...
    $ conda install --revision <REVISION NUMBER>

    


In [2]:
# Root directory of the concatenated trial-by-trial CSV files; the input
# paths and the summary output path in this notebook are joined onto it.
# NOTE(review): absolute, machine-specific path -- adjust it before running
# the notebook on another system.
base_dir = '/om/user/zqi/projects/slbeh/cogscigame/data_structured/concatenated'

In [7]:
def lm(X, y):
    """Fit a linear regression of ``y`` on standardized ``X``.

    A one-dimensional ``X`` is treated as a single feature and reshaped
    into one column before fitting.

    Returns the ``coef_`` attribute of the fitted LinearRegression step,
    i.e. the coefficients with respect to the standardized predictors.
    """
    # A flat vector is one feature observed many times -> make it a column.
    features = X.reshape(-1, 1) if X.ndim == 1 else X
    steps = [
        ('scale', StandardScaler()),
        ('lm', linear_model.LinearRegression()),
    ]
    model = Pipeline(steps)
    model.fit(features, y)
    regression = model.named_steps["lm"]
    return regression.coef_

def poly(X, y):
    """Fit a degree-3 polynomial regression of ``y`` on ``X``.

    ``X`` is expanded with PolynomialFeatures(degree=3), the expanded
    columns are standardized, and an ordinary linear regression is fitted
    on the result. A one-dimensional ``X`` is reshaped to a single column.

    Returns the ``coef_`` attribute of the fitted LinearRegression step,
    with one entry per expanded column. With scikit-learn's default
    ``include_bias=True`` the first column is the constant term, so for a
    single feature the entries at positions 1-3 belong to x, x**2, x**3.
    """
    # A flat vector is one feature observed many times -> make it a column.
    features = X.reshape(-1, 1) if X.ndim == 1 else X
    steps = [
        ('poly', PolynomialFeatures(degree=3)),
        ('scale', StandardScaler()),
        ('lm', linear_model.LinearRegression()),
    ]
    model = Pipeline(steps)
    model.fit(features, y)
    regression = model.named_steps["lm"]
    return regression.coef_
    
def group_familiarization_summary(path):
    """Summarize reaction times per participant from a CSV file.

    The CSV at ``path`` must provide the columns ``id``, ``trial_index``
    and ``rt``. For each ``id`` group the result holds:

    * ``lm_coef_rt`` -- first coefficient returned by ``lm`` when ``rt``
      is regressed on ``trial_index``
    * ``mean_rt``    -- mean of ``rt``
    * ``poly1``..``poly3`` -- entries 1, 2 and 3 of the coefficient array
      returned by ``poly`` for the same regression

    Returns a DataFrame (object-dtype columns) with one row per id.
    """
    columns = ['id','lm_coef_rt', 'mean_rt', 'poly1', 'poly2', 'poly3']
    by_participant = pd.read_csv(path).groupby('id')
    table = np.zeros((len(by_participant), len(columns)), dtype=object)

    for row, (participant, trials) in enumerate(by_participant):
        trial_no = trials.trial_index.values.astype(float)
        reaction_times = trials.rt.values.astype(float)

        linear_slope = lm(trial_no, reaction_times)[0]
        mean_rt = trials.rt.mean()
        cubic = poly(trial_no, reaction_times)

        # Entry 0 of the cubic fit is skipped; only entries 1-3 are stored.
        record = (participant, linear_slope, mean_rt,
                  cubic[1], cubic[2], cubic[3])
        for col, value in enumerate(record):
            table[row, col] = value

    return pd.DataFrame(table, columns=columns)

def group_fam_target_summary(path):
    """Compute per-participant familiarization target accuracy.

    The CSV at ``path`` must provide the columns ``id``, ``targ`` and
    ``stimulus``. A trial counts as a hit when its ``targ`` value is
    contained in its ``stimulus`` value (Python ``in``; for string
    stimuli this is a substring test -- assumes both columns hold
    strings, TODO confirm against the data files).

    Parameters
    ----------
    path : str
        Location of the CSV file to summarize.

    Returns
    -------
    pandas.DataFrame
        One row per id with the columns ``id`` and ``fam_acc`` (hits
        divided by the number of trials for that id), object dtype.
    """
    # Read the file that was asked for. The previous version ignored
    # ``path`` and always loaded the module-level ``vslfamacc``, so every
    # call summarized the same file regardless of the argument.
    data = pd.read_csv(path)
    pars = data.groupby('id')
    summary_mat = np.zeros((len(pars), 2), dtype=object)
    for idx, (name, group) in enumerate(pars):
        hits = sum(
            targ in stimulus
            for targ, stimulus in zip(group.targ.values, group.stimulus.values)
        )
        summary_mat[idx, 0] = name
        # float() keeps this a true division even where
        # ``from __future__ import division`` is not in effect.
        summary_mat[idx, 1] = float(hits) / len(group)
    return pd.DataFrame(summary_mat, columns=['id','fam_acc'])

def group_test_summary(path):
    """Compute per-participant test-phase accuracy from a CSV file.

    The CSV at ``path`` must provide the columns ``id``, ``par_answer``
    and ``expected_answer``. Accuracy is the fraction of rows within an
    id group whose ``par_answer`` equals ``expected_answer``.

    Returns a DataFrame (object-dtype columns ``id`` and ``acc``) with
    one row per id.
    """
    by_participant = pd.read_csv(path).groupby('id')
    table = np.zeros((len(by_participant), 2), dtype=object)
    row = 0
    for participant, trials in by_participant:
        is_correct = trials.par_answer == trials.expected_answer
        table[row, 0] = participant
        # Mean of a boolean series = proportion of True values.
        table[row, 1] = is_correct.mean()
        row += 1
    return pd.DataFrame(table, columns=['id','acc'])

def combine_summary(rt, acc, famacc):
    """Merge the three per-participant summaries into one table.

    ``rt`` is re-ordered to follow the id order of ``acc``; the columns
    of ``acc`` and ``famacc`` are then attached by position (not by id).
    The id columns of all three inputs are kept as ``id_rt``, ``id_acc``
    and ``id_famacc``, and "IDs DO NOT MATCH" is printed once for each of
    ``id_acc`` / ``id_famacc`` that differs from ``id_rt``.

    Returns the combined DataFrame with a fresh 0..n-1 index.
    """
    # Look the rt rows up in the order in which the ids appear in ``acc``.
    merged = rt.set_index('id').loc[acc['id']]
    merged['id_rt'] = merged.index.values
    merged.index = list(range(merged.shape[0]))

    # Positional attachment: raw values are used, so no index alignment
    # takes place here.
    attached = (
        ('id_acc', acc['id'].values),
        ('id_famacc', famacc['id'].values),
        ('acc', acc['acc'].values),
        ('fam_acc', famacc['fam_acc'].values),
    )
    for column, values in attached:
        merged[column] = values

    # Report when the positionally attached ids disagree with the rt ids.
    for id_column in ('id_acc', 'id_famacc'):
        ids_agree = np.all(merged[id_column] == merged.id_rt)
        if not ids_agree:
            print("IDs DO NOT MATCH")
    return merged

In [8]:
# Input CSV files, all located directly under base_dir.
# "vsl" files:
vslfamphase = os.path.join(base_dir, 'retest_vsl_tbt_famphase.csv')
vsltest = os.path.join(base_dir, 'retest_vsl_tbt_testphase.csv')
vslfamacc = os.path.join(base_dir, 'retest_vsl_tbt_famphase_falsePos.csv')
# "tsl" files:
tslfamphase = os.path.join(base_dir, 'retest_tsl_tbt_famphase.csv')
tsltest = os.path.join(base_dir, 'retest_tsl_tbt_testphase.csv')
tslfamacc = os.path.join(base_dir, 'retest_tsl_tbt_famphase_falsePos.csv')

In [6]:
# Quick check of the familiarization target accuracy for the file in
# ``vslfamacc``: per id, the share of trials whose ``targ`` value is
# contained in the ``stimulus`` value.
data = pd.read_csv(vslfamacc)
pars = data.groupby('id')
summary_mat = np.zeros((len(pars), 2), dtype=object)
for idx, (name, group) in enumerate(pars):
    trials = len(group.stimulus)
    hit = sum(
        targ in stimulus
        for targ, stimulus in zip(group.targ.values, group.stimulus.values)
    )
    summary_mat[idx, 0] = name
    # float() keeps this a true division even where
    # ``from __future__ import division`` is not in effect.
    summary_mat[idx, 1] = float(hit) / trials
# print is called as a function: a single-argument call parses and behaves
# the same on Python 2 and Python 3, unlike the old print statement.
print(pd.DataFrame(summary_mat, columns=['id','fam_acc']))

      id    fam_acc
0   3064          1
1   3067   0.958333
2   3068   0.666667
3   3130          1
4   3144   0.857143
5   3161          1
6   3181   0.916667
7   3182   0.821429
8   3213          1
9   3219   0.913043
10  3223          1
11  3236  0.0989011
12  3237   0.923077
13  3266       0.96
14  3267   0.923077
15  3268          1
16  3311  0.0909091


In [9]:
# Build one combined per-participant summary for each file triple
# (familiarization phase, test phase, familiarization accuracy) and write
# it to <base_dir>/summary/summary_retest_<task>.csv.
for g in [
    (vslfamphase, vsltest, vslfamacc),
    (tslfamphase, tsltest, tslfamacc)
]:
    rt = group_familiarization_summary(g[0])
    acc = group_test_summary(g[1])
    famacc = group_fam_target_summary(g[2])
    # The input files are named 'retest_<task>_tbt_...csv'. Take the task
    # tag from the file name itself; the earlier fixed character offsets
    # into the absolute path silently pick the wrong characters whenever
    # the length of base_dir changes.
    task = os.path.basename(g[0]).split('_')[1]
    combine_summary(rt, acc, famacc).to_csv(
        os.path.join(base_dir, 'summary/summary_retest_{}.csv'.format(task))
    )

print('')


