In [1]:
import os, sys
lib_path = os.path.abspath(os.path.join('..','_Libraries'))
sys.path.append(lib_path)
from rlabs_libutils import DataStruct, select_data, create_outlier_df, find_nearest_above
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from itertools import izip
#%matplotlib inline

#Read raw data
**Note**: a window will apear with which you can select the data, this window can be hidden by the browser

In [2]:
path = select_data()
ds = DataStruct(path)

Plaid-15.06.11_16.14_jl_nofix_replication_eyetracker_data.txt: eyetracker data (38 colums format)
Trial 1 - 1.5 % of data was lost. SNR: 0.928862926542. CV: 1.07658511436
Trial 2 - 0.7 % of data was lost. SNR: 0.407201998552. CV: 2.45578362473
Trial 3 - 1.3 % of data was lost. SNR: 0.761812301826. CV: 1.31265929626
Trial 4 - 2.2 % of data was lost. SNR: 0.823482607092. CV: 1.21435473122
Trial 5 - 3.1 % of data was lost. SNR: 0.806924052454. CV: 1.23927400225


**Create outlier DataFrame**

In [59]:
outlier_threshold = 100  # velocity values over 100 will be outliers.
ambiguousoutlier_th = 80 # velocity values between 80 and 100 will be ambiguous outliers.
filter_samples = 5       # the samples following an outlier will not be outliers.

df = create_outlier_df(ds,outlier_threshold = outlier_threshold, 
                      ambiguousoutlier_th = ambiguousoutlier_th, filter_samples = filter_samples)

#Compute linear regression of Gaze position between outliers

In [64]:
outlier_idx = np.where(df['Outlierfiltered'])[0]
slope_array = []
intercept_array = []
r_value_array = []
n = len(outlier_idx)-1
for i in range(n):
    lr_idx = np.arange(outlier_idx[i],outlier_idx[i+1])   # linear regression idx. outlier and next outlier
    slope, intercept, r_value, p_value, std_err = stats.linregress(df['time'][lr_idx], df['LEpos_int'][lr_idx])
    
    slope_array.append(slope)
    intercept_array.append(intercept)
    r_value_array.append(r_value)
    
r_squared_array = np.power(r_value_array,2)

#Plot: 
a) gaze position with linear regression

b) velocity with outliers and ambiguous outliers

In [None]:
f, ax = plt.subplots(2, sharex = True)
ax[0].plot(df['time'], df['LEpos_int'])

for i in range(n):
    
    ax[0].plot(df['time'][outlier_idx[i:i+2]], slope_array[i]*df['time'][outlier_idx[i:i+2]] + intercept_array[i], 'r')

    # compute annotation coordinates
    x = df['time'][outlier_idx[i]] + np.diff(df['time'][outlier_idx[i:i+2]])[0]/2.0
    y = np.diff(slope_array[i]*df['time'][outlier_idx[i:i+2]] + intercept_array[i])[0]/2.0
    ax[0].annotate('{0}'.format("%.2f" % r_squared_array[i]), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')
    
ax[1].plot(df['time'], df['velocity'])
ax[1].scatter(df['time'][df['Outlierfiltered']], df['velocity'][df['Outlierfiltered']],color ='r')
ax[1].scatter(df['time'][df['isAmbiguousOutlier']], df['velocity'][df['isAmbiguousOutlier']],color ='y')

f.suptitle(ds.filename)
ax[0].set_title('eye gaze with linear regression and r squared values')
ax[1].set_title('velocity with outliers. outlier threshold = {0}'.format(outlier_threshold))
plt.show()

#Refine linear regression

**method 1:** Using ambiguous outliers

**method 2:** Spliting linear regression interval into segments

*General note*: the functions are defined first, scroll down to use each method.

**To do**: weight the linear regression with the number of points used

In [90]:
def regressionbtwpoints(df, start, end, xlabel = 'time', ylabel = 'LEpos_int'):
    """
        Compute the linear regression of df between start and end points.
        
        Input:
        - df: Pandas.DataFrame containing eyetracker data
        - start: starting point for the linear regression
        - end: ending point for the linear regression
        - xlabel: df data to use as horizontal axis of the linear regression. Default as 'time'
        - ylabel: df data to use as vertical axis of the linear regression. Default as 'LEpos_int'

        Output:
        - tmp: dictionary containing linear regression results

    """
    lr_idx = np.arange(start, end)                                                                                  # indices array

    # compute linear regression
    slope, intercept, r_value, p_value, std_err = stats.linregress(df[xlabel][lr_idx], df[ylabel][lr_idx])

    # add results to temporal struct
    tmp = {'start':df['time'][start], 'end': df['time'][end], 'start_idx': start, 'end_idx':end,
                 'slope':slope, 'intercept': intercept, 'r_value': r_value, 'p_value': p_value, 'std_err': std_err}

    return tmp

# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 

def method1_useamboutls(df, itvl_start, itvl_end, btwpoints):
    """
        Refinement method 1: Using ambiguous outliers.

        Input:
        - df: data frame
        - itvl_start: interval start (first outlier)
        - itvl_end: interval end (second outlier)
        - btwpoints: samples between points (threshold)
        
        Output:
        - m1_struct: list of dictionaries containing the regression data.
    """

    # get the indices of the ambiguous outliers in the interval
    amb_outlier_idx = np.where(df['isAmbiguousOutlier'])[0]
    amb_btwn_outlrs = amb_outlier_idx[(np.where((amb_outlier_idx >= itvl_start+btwpoints) & (amb_outlier_idx <= itvl_end-btwpoints)))]
    
    # get the ambiguous outliers that are separated by btwpoints samples
    nwlridx = [amb_btwn_outlrs[0]] # always use first ambiguous outlier
    for item in amb_btwn_outlrs: # for all the points
        if item > (nwlridx[-1]+btwpoints): # if point is greater than last point in nwlridx,
            nwlridx.append(item) # get it
    nwlridx.insert(0,itvl_start)                    # prepend interval start
    nwlridx.insert(len(nwlridx),itvl_end)           # append interval end
    
    m1_struct = []                                  # define struct with regression data

    for j in range(len(nwlridx)-1):                         # for all the points
        sgm_start = nwlridx[j]                              # get the segment start
        sgm_end = nwlridx[j+1]                              # get the segment end
        tmp = regressionbtwpoints(df, sgm_start, sgm_end)   # compute linear regression
        m1_struct.append(tmp)                               # append results to struct
        
    return m1_struct

# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 

def method2_splitintrvl(df, itvl_start, itvl_end, maxdivisions, minsamples, fs = 120.0):
    """
        Refinement method 2: split interval in segments

        Input:
        - df: data frame
        - itvl_start: interval start (first outlier)
        - itvl_end: interval end (second outlier)
        - maxdivisions: maximum number of divisions allowed
        - minsamples: minimum number of samples in a segment
        - fs: sampling frequency of the data (120 Hz for Tobii X120)

        Output:
        - m2_struct1: list of dictionaries containing the regression data of the left segments.
        - m2_struct2: list of dictionaries containing the regression data of the right segments.
    """

    bfdur = df['time'][itvl_end] - df['time'][itvl_start]                   # get bad fit duration

    nsamples = bfdur/1000.0 * fs                                            # number of samples in segment

    while (nsamples / maxdivisions) < minsamples:                           # if shortest segment is shorter than minsamples, 
        maxdivisions -= 1                                                   # decrease number of divisions
    
    m2_struct1 = []                                                         # define struct that will contain regression data
    m2_struct2 = []                                                         # define struct that will contain regression data
    for it in range(1, maxdivisions):
        
        # segment 1 -----------------------------------------------------------------------------------------------------------
        # start and end indices segment 1
        s1_start = itvl_start                                               # segment 1 always starts where interval starts
        end   = df['time'][s1_start] + it * (bfdur / maxdivisions)          # end of segment 1 in seconds
        _, s1_end = find_nearest_above(df['time'].values, end)              # index of the end of segment 1

        tmp1 = regressionbtwpoints(df, s1_start, s1_end)                    # compute linear regression

        # segment 2 ----------------------------------------------------------------------------------------------------------
        # start and end indices segment 1
        s2_start = s1_end
        s2_end = itvl_end

        tmp2 = regressionbtwpoints(df, s2_start, s2_end)                        # compute linear regression
        
        # append to structs
        m2_struct1.append(tmp1)
        m2_struct2.append(tmp2)

    # get best fit index
    bestfit = getbestjointfit(m2_struct1, m2_struct2)


    return m2_struct1, m2_struct2, bestfit

# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 

def getbestjointfit(segm1, segm2):
    """
        Sum the r squared value of each corresponding segment
        and return the index of the maximum.
        
        Input:
            - segm1, segm2: element of refined linear regression struct. (not rf_struct1, but rf_struct[i])
        Output:
            - max_idx: the index where the refinement segments have the greater joint r squared value.
    """
    
    r_squared1 = np.power(np.array([item['r_value'] for item in segm1]),2)
    r_squared2 = np.power(np.array([item['r_value'] for item in segm2]),2)

    joint_rsquared = r_squared1 + r_squared2
    max_idx = np.where(joint_rsquared == np.max(joint_rsquared))[0]
   
    return max_idx

# Use method 1

In [141]:
minintervallen = 50 # minimum interval length (samples)
minsamples = 10      # minimum number of samples for a segment of the interval
fs = 120.0           # sampling frequency of the tobii eyetracker
badfitth = 0.3       # bad fit threshold. a regresison fit with a r_squared value below this, will be refined
maxdivisions = 12    # maximum number of divisions to split the bad fits

outlier_idx = np.where(df['Outlierfiltered'])[0]
n = len(outlier_idx)-1

rf_struct1 = []
rf_struct2 = []
rf_bfidx = []

m1_struct = []

for i in range(n):
    if r_squared_array[i] < badfitth:

        # get bad fit number of samples
        nsamples = (df['time'][outlier_idx[i+1]] - df['time'][outlier_idx[i]])/1000.0 * fs
        
        if nsamples > minintervallen:
            itvl_start = outlier_idx[i]
            itvl_end = outlier_idx[i+1]
            
            # method 1: if there are ambiguous outliers within:
            amb_outlier_idx = np.where(df['isAmbiguousOutlier'])[0]
            rthere = np.sum((amb_outlier_idx >= itvl_start+minsamples)&(amb_outlier_idx <= itvl_end-minsamples))
            use_m1 = rthere > 0
            if use_m1:
                tmp_m1 = method1_useamboutls(df, itvl_start, itvl_end, minsamples)
                m1_struct.append(tmp_m1)

# Plot method 1 results

In [None]:
f, ax = plt.subplots(2, sharex = True)
ax[0].plot(df['time'], df['LEpos_int'])  # plot gaze

for i in range(n):
    
    # plot linear regression
    ax[0].plot(df['time'][outlier_idx[i:i+2]], slope_array[i]*df['time'][outlier_idx[i:i+2]] + intercept_array[i], 'r')
    
    # compute annotation coordinates
    x = df['time'][outlier_idx[i]] + np.diff(df['time'][outlier_idx[i:i+2]])[0]/2.0
    y = np.diff(slope_array[i]*df['time'][outlier_idx[i:i+2]] + intercept_array[i])[0]/2.0
    ax[0].annotate('{0}'.format("%.2f" % r_squared_array[i]), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')


# plot refined linear regression --------------------------------------------------------------------------------------------
for fit in (m1_struct):
    for j in range(len(fit)):
        a = fit[j]
        axaxis = np.array([a['start'],a['end']])
        ax[0].plot(axaxis, a['slope']*axaxis + a['intercept'], 'y')

        # compute annotation coordinates
        x = axaxis[0] + np.diff(axaxis)[0]/2.0
        y = np.diff(a['slope']*axaxis + a['intercept'])[0]/2.0 + 2
        ax[0].annotate('{0}'.format("%.2f" % a['r_value']**2), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')
    
# ------------------------------------------------------------------------------------------------------------
    
ax[1].plot(df['time'], df['velocity'])
ax[1].scatter(df['time'][df['Outlierfiltered']], df['velocity'][df['Outlierfiltered']],color ='r')
ax[1].scatter(df['time'][df['isAmbiguousOutlier']], df['velocity'][df['isAmbiguousOutlier']],color ='y')

f.suptitle(ds.filename)
ax[0].set_title('eye gaze with linear regression and r squared values')
ax[1].set_title('velocity with outliers. outlier threshold = {0}'.format(outlier_threshold))
plt.show()

# Use method 2

In [91]:
minintervallen = 50 # minimum interval length (samples)
minsamples = 25      # minimum number of samples for a segment of the interval
fs = 120.0           # sampling frequency of the tobii eyetracker
badfitth = 0.3       # bad fit threshold. a regresison fit with a r_squared value below this, will be refined
maxdivisions = 12    # maximum number of divisions to split the bad fits

outlier_idx = np.where(df['Outlierfiltered'])[0]
n = len(outlier_idx)-1

rf_struct1 = []
rf_struct2 = []
rf_bfidx = []

for i in range(n):
    if r_squared_array[i] < badfitth:

        # get bad fit number of samples
        nsamples = (df['time'][outlier_idx[i+1]] - df['time'][outlier_idx[i]])/1000.0 * fs
        
        if nsamples > minintervallen:
            itvl_start = outlier_idx[i]
            itvl_end = outlier_idx[i+1]
            sgmts1, sgmts2, bfidx = method2_splitintrvl(df, itvl_start, itvl_end, maxdivisions, minsamples, fs = 120.0)
            
            rf_struct1.append(sgmts1)
            rf_struct2.append(sgmts2)
            rf_bfidx.append(bfidx)

# Plot method 2

In [None]:
f, ax = plt.subplots(2, sharex = True)
ax[0].plot(df['time'], df['LEpos_int'])  # plot gaze

for i in range(n):
    
    # plot linear regression
    ax[0].plot(df['time'][outlier_idx[i:i+2]], slope_array[i]*df['time'][outlier_idx[i:i+2]] + intercept_array[i], 'r')
    
    # compute annotation coordinates
    x = df['time'][outlier_idx[i]] + np.diff(df['time'][outlier_idx[i:i+2]])[0]/2.0
    y = np.diff(slope_array[i]*df['time'][outlier_idx[i:i+2]] + intercept_array[i])[0]/2.0
    ax[0].annotate('{0}'.format("%.2f" % r_squared_array[i]), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')


# plot refined linear regression --------------------------------------------------------------------------------------------

for i in range(len(rf_struct1)):
    a = rf_struct1[i][rf_bfidx[i]]

    axaxis = np.array([a['start'],a['end']])
    ax[0].plot(axaxis, a['slope']*axaxis + a['intercept'], 'y')

    # compute annotation coordinates
    x = axaxis[0] + np.diff(axaxis)[0]/2.0
    y = np.diff(a['slope']*axaxis + a['intercept'])[0]/2.0 + 2
    ax[0].annotate('{0}'.format("%.2f" % a['r_value']**2), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')
    
    ax[0].plot([a['end'],a['end']], [-np.mean(df['LEpos_int']), np.mean(df['LEpos_int'])], 'y')

for i in range(len(rf_struct2)):
    a = rf_struct2[i][rf_bfidx[i]]

    axaxis = np.array([a['start'],a['end']])
    ax[0].plot(axaxis, a['slope']*axaxis + a['intercept'], 'y')

    # compute annotation coordinates
    x = axaxis[0] + np.diff(axaxis)[0]/2.0
    y = np.diff(a['slope']*axaxis + a['intercept'])[0]/2.0 + 2
    ax[0].annotate('{0}'.format("%.2f" % a['r_value']**2), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')
# ------------------------------------------------------------------------------------------------------------
    
ax[1].plot(df['time'], df['velocity'])
ax[1].scatter(df['time'][df['Outlierfiltered']], df['velocity'][df['Outlierfiltered']],color ='r')
ax[1].scatter(df['time'][df['isAmbiguousOutlier']], df['velocity'][df['isAmbiguousOutlier']],color ='y')

f.suptitle(ds.filename)
ax[0].set_title('eye gaze with linear regression and r squared values')
ax[1].set_title('velocity with outliers. outlier threshold = {0}'.format(outlier_threshold))
plt.show()

# Plot both methods together

In [None]:
f, ax = plt.subplots(2, sharex = True)
ax[0].plot(df['time'], df['LEpos_int'])  # plot gaze

for i in range(n):
    
    # plot linear regression
    ax[0].plot(df['time'][outlier_idx[i:i+2]], slope_array[i]*df['time'][outlier_idx[i:i+2]] + intercept_array[i], 'r')
    
    # compute annotation coordinates
    x = df['time'][outlier_idx[i]] + np.diff(df['time'][outlier_idx[i:i+2]])[0]/2.0
    y = np.diff(slope_array[i]*df['time'][outlier_idx[i:i+2]] + intercept_array[i])[0]/2.0
    ax[0].annotate('{0}'.format("%.2f" % r_squared_array[i]), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')

# METHOD 1 --------------------------------------------------------------------------------------------
for fit in (m1_struct):
    for j in range(len(fit)):
        a = fit[j]
        axaxis = np.array([a['start'],a['end']])
        ax[0].plot(axaxis, a['slope']*axaxis + a['intercept'], 'g')

        # compute annotation coordinates
        x = axaxis[0] + np.diff(axaxis)[0]/2.0
        y = np.diff(a['slope']*axaxis + a['intercept'])[0]/2.0 + 2
        ax[0].annotate('{0}'.format("%.2f" % a['r_value']**2), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')
        
        ax[0].plot([a['end'],a['end']], [-2*np.mean(df['LEpos_int']), 2*np.mean(df['LEpos_int'])], 'm')
    
# METHOD 2 --------------------------------------------------------------------------------------------

for i in range(len(rf_struct1)):
    a = rf_struct1[i][rf_bfidx[i]]

    axaxis = np.array([a['start'],a['end']])
    ax[0].plot(axaxis, a['slope']*axaxis + a['intercept'], 'y')

    # compute annotation coordinates
    x = axaxis[0] + np.diff(axaxis)[0]/2.0
    y = np.diff(a['slope']*axaxis + a['intercept'])[0]/2.0 + 2
    ax[0].annotate('{0}'.format("%.2f" % a['r_value']**2), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')
    
    ax[0].plot([a['end'],a['end']], [-np.mean(df['LEpos_int']), np.mean(df['LEpos_int'])], 'y')

for i in range(len(rf_struct2)):
    a = rf_struct2[i][rf_bfidx[i]]

    axaxis = np.array([a['start'],a['end']])
    ax[0].plot(axaxis, a['slope']*axaxis + a['intercept'], 'y')

    # compute annotation coordinates
    x = axaxis[0] + np.diff(axaxis)[0]/2.0
    y = np.diff(a['slope']*axaxis + a['intercept'])[0]/2.0 + 2
    ax[0].annotate('{0}'.format("%.2f" % a['r_value']**2), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')
# ------------------------------------------------------------------------------------------------------------
    
ax[1].plot(df['time'], df['velocity'])
ax[1].scatter(df['time'][df['Outlierfiltered']], df['velocity'][df['Outlierfiltered']],color ='r')
ax[1].scatter(df['time'][df['isAmbiguousOutlier']], df['velocity'][df['isAmbiguousOutlier']],color ='y')

f.suptitle(ds.filename)
ax[0].set_title('eye gaze with linear regression and r squared values')
ax[1].set_title('velocity with outliers. outlier threshold = {0}'.format(outlier_threshold))
plt.show()

# Stats

In [119]:
print 'Number of linear regression intervals with a r_squared < 0.3:\t{2}\nNumber of intervals refined with ambiguous outliers: \t{1}\nNumber of intervals refined with splitting window: \t{0}'.format(
    len(rf_struct2), len(m1_struct),np.sum((r_squared_array < badfitth)!=0))

Number of linear regression intervals with a r_squared < 0.3:	1079
Number of intervals refined with ambiguous outliers: 	98
Number of intervals refined with splitting window: 	125


In [131]:
badfit_idx = np.where(r_squared_array < badfitth)[0]

for bfidx in badfit_idx:
#     r_squared_array[bfidx]
    print bfidx, outlier_idx[bfidx],outlier_idx[bfidx+1]
    
    print m1_struct[]
    
    
    

IndentationError: expected an indented block (<ipython-input-131-a35b8409788e>, line 6)