In [1]:
import os, sys
lib_path = os.path.abspath(os.path.join('../..','_Libraries'))
sys.path.append(lib_path)
from rlabs_libutils import DataStruct, select_data, create_outlier_df, find_nearest_above
from rlabs_liblinreg import * # new library for the linear regression functions
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from itertools import izip
#%matplotlib inline

# Read raw data
**Note**: a window will appear in which you can select the data file; this window may be hidden behind the browser.

In [2]:
# Let the user pick the eyetracker data file (select_data opens the selection
# window mentioned in the note above), then load it into a DataStruct.
# `ds` is the object every later cell reads (e.g. ds.filename for plot titles).
path = select_data()
ds = DataStruct(path)

Plaid-15.06.11_16.14_jl_nofix_replication_eyetracker_data.txt: eyetracker data (38 colums format)
Trial 1 - 1.5 % of data was lost. SNR: 0.928862926542. CV: 1.07658511436
Trial 2 - 0.7 % of data was lost. SNR: 0.407201998552. CV: 2.45578362473
Trial 3 - 1.3 % of data was lost. SNR: 0.761812301826. CV: 1.31265929626
Trial 4 - 2.2 % of data was lost. SNR: 0.823482607092. CV: 1.21435473122
Trial 5 - 3.1 % of data was lost. SNR: 0.806924052454. CV: 1.23927400225


**Create outlier DataFrame**

In [3]:
# Thresholds handed to create_outlier_df (velocity units are not stated here).
outlier_threshold = 100  # velocity values over 100 will be outliers.
ambiguousoutlier_th = 80 # velocity values between 80 and 100 will be ambiguous outliers.
filter_samples = 5       # the samples following an outlier will not be outliers.

# Build the per-sample DataFrame used by the rest of the notebook. Later cells
# read its columns 'time', 'velocity', 'LEpos_int', 'Outlierfiltered' and
# 'isAmbiguousOutlier'.
# NOTE(review): presumably filter_samples is the number of samples after an
# outlier that are exempt from being flagged -- confirm in create_outlier_df.
df = create_outlier_df(ds,outlier_threshold = outlier_threshold, 
                      ambiguousoutlier_th = ambiguousoutlier_th, filter_samples = filter_samples)

The minimum supported version is 2.1



# Compute linear regression of gaze position between outliers

In [4]:
# Positions of the samples flagged as (filtered) outliers. Every pair of
# consecutive outliers delimits one interval, hence n = number of intervals.
outlier_idx = np.where(df['Outlierfiltered'])[0]
n = len(outlier_idx)-1

# One regression result per interval, in interval order.
lr_struct = [regressionbtwpoints(df, first, last)
             for first, last in zip(outlier_idx[:-1], outlier_idx[1:])]

# Plot:
a) gaze position with linear regression

b) velocity with outliers and ambiguous outliers

In [None]:
# Two stacked panels sharing the time axis: gaze position on top, velocity below.
f, ax = plt.subplots(2, sharex = True)
gaze_ax, vel_ax = ax

gaze_ax.plot(df['time'], df['LEpos_int'])

# Overlay the regression line of every inter-outlier interval, labelled with
# its r squared value.
for idx in range(n):
    fit = lr_struct[idx]
    t_ends = np.array([fit['start'], fit['end']])
    fitted = fit['slope']*t_ends + fit['intercept']
    gaze_ax.plot(t_ends, fitted, 'r')

    # label position: x halfway along the interval, y at half the rise of the
    # fitted line plus a fixed offset of 2
    label_x = t_ends[0] + np.diff(t_ends)[0]/2.0
    label_y = np.diff(fitted)[0]/2.0 + 2
    label = '{0}'.format("%.2f" % fit['r_value']**2)
    gaze_ax.annotate(label, xy=(label_x, label_y),
                     horizontalalignment='center', verticalalignment='bottom')

# Velocity trace with outliers in red and ambiguous outliers in yellow.
is_outlier = df['Outlierfiltered']
is_ambiguous = df['isAmbiguousOutlier']
vel_ax.plot(df['time'], df['velocity'])
vel_ax.scatter(df['time'][is_outlier], df['velocity'][is_outlier], color ='r')
vel_ax.scatter(df['time'][is_ambiguous], df['velocity'][is_ambiguous], color ='y')

f.suptitle(ds.filename)
gaze_ax.set_title('eye gaze with linear regression and r squared values')
vel_ax.set_title('velocity with outliers. outlier threshold = {0}'.format(outlier_threshold))
plt.show()

# Refine linear regression

**method 1:** Using ambiguous outliers

**method 2:** Splitting linear regression interval into segments

**To do**: weight the linear regression with the number of points used

# Use method 1

In [24]:
minintervallen = 50  # minimum interval length (samples)
minsamples = 10      # minimum number of samples for a segment of the interval
fs = 120.0           # sampling frequency of the tobii eyetracker
badfitth = 0.3       # bad fit threshold. a regression fit with an r_squared value below this will be refined
maxdivisions = 12    # maximum number of divisions to split the bad fits

outlier_idx = np.where(df['Outlierfiltered'])[0]
n = len(outlier_idx)-1

# Positions of the ambiguous outliers. This does not depend on the interval,
# so it is computed once instead of on every loop iteration.
amb_outlier_idx = np.where(df['isAmbiguousOutlier'])[0]

# m1_struct[i] holds the method 1 result for interval i, or None when the
# interval was not refined (good fit, too short, or no usable ambiguous outlier).
m1_struct = [None]*n

for i in range(n):
    if lr_struct[i]['r_squared'] < badfitth:

        # length of the badly fitted interval in samples
        # NOTE(review): the /1000.0 assumes df['time'] is in milliseconds -- confirm
        nsamples = (df['time'][outlier_idx[i+1]] - df['time'][outlier_idx[i]])/1000.0 * fs

        if nsamples > minintervallen:
            itvl_start = outlier_idx[i]
            itvl_end = outlier_idx[i+1]

            # method 1 applies only if at least one ambiguous outlier lies at
            # least minsamples away from both ends of the interval
            use_m1 = np.any((amb_outlier_idx >= itvl_start+minsamples) &
                            (amb_outlier_idx <= itvl_end-minsamples))
            if use_m1:
                m1_struct[i] = method1_useamboutls(df, itvl_start, itvl_end, minsamples)

# Plot method 1 results

In [None]:
# Gaze position with the original regression (red) and the method 1 refinement
# (yellow) on top; velocity with outliers below.
f, ax = plt.subplots(2, sharex = True)
ax[0].plot(df['time'], df['LEpos_int'])  # plot gaze

for i in range(n):

    # plot linear regression
    # The slope / intercept / r squared come from lr_struct: the slope_array,
    # intercept_array and r_squared_array names used here before are not
    # defined by any cell shown in this notebook.
    orig_fit = lr_struct[i]
    t_pair = np.asarray(df['time'][outlier_idx[i:i+2]])  # times of the two bounding outliers
    fitted = orig_fit['slope']*t_pair + orig_fit['intercept']
    ax[0].plot(t_pair, fitted, 'r')

    # compute annotation coordinates
    x = t_pair[0] + np.diff(t_pair)[0]/2.0
    y = np.diff(fitted)[0]/2.0
    ax[0].annotate('{0}'.format("%.2f" % orig_fit['r_squared']), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')

    # plot refined linear regression (only for intervals method 1 refined)
    if m1_struct[i] is not None:
        for a in m1_struct[i]:
            axaxis = np.array([a['start'],a['end']])
            ax[0].plot(axaxis, a['slope']*axaxis + a['intercept'], 'y')

            # compute annotation coordinates
            x = axaxis[0] + np.diff(axaxis)[0]/2.0
            y = np.diff(a['slope']*axaxis + a['intercept'])[0]/2.0 + 2
            ax[0].annotate('{0}'.format("%.2f" % a['r_value']**2), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')

ax[1].plot(df['time'], df['velocity'])
ax[1].scatter(df['time'][df['Outlierfiltered']], df['velocity'][df['Outlierfiltered']],color ='r')
ax[1].scatter(df['time'][df['isAmbiguousOutlier']], df['velocity'][df['isAmbiguousOutlier']],color ='y')

f.suptitle(ds.filename)
ax[0].set_title('eye gaze with linear regression and r squared values')
ax[1].set_title('velocity with outliers. outlier threshold = {0}'.format(outlier_threshold))
plt.show()

# Use method 2

In [26]:
minintervallen = 50  # minimum interval length (samples)
minsamples = 25      # minimum number of samples for a segment of the interval
fs = 120.0           # sampling frequency of the tobii eyetracker
badfitth = 0.3       # bad fit threshold. a regression fit with an r_squared value below this will be refined
maxdivisions = 12    # maximum number of divisions to split the bad fits

outlier_idx = np.where(df['Outlierfiltered'])[0]
n = len(outlier_idx)-1

# Per-interval results of method 2; an entry stays None when the interval was
# not refined. The plotting cells read rf_struct1[i][rf_bfidx[i]] and
# rf_struct2[i][rf_bfidx[i]] as the two fits of the chosen split.
rf_struct1 = [None]*n
rf_struct2 = [None]*n
rf_bfidx = [None]*n

for i in range(n):
    if lr_struct[i]['r_squared'] < badfitth:

        # length of the badly fitted interval in samples
        # NOTE(review): the /1000.0 assumes df['time'] is in milliseconds -- confirm
        nsamples = (df['time'][outlier_idx[i+1]] - df['time'][outlier_idx[i]])/1000.0 * fs

        if nsamples > minintervallen:
            itvl_start = outlier_idx[i]
            itvl_end = outlier_idx[i+1]
            # pass the cell's sampling frequency instead of a second hard-coded 120.0
            sgmts1, sgmts2, bfidx = method2_splitintrvl(df, itvl_start, itvl_end, maxdivisions, minsamples, fs = fs)

            rf_struct1[i] = sgmts1
            rf_struct2[i] = sgmts2
            rf_bfidx[i] = bfidx

# Plot method 2

In [None]:
# Gaze position with the original regression (red) and the method 2 refinement
# (yellow) on top; velocity with outliers below.
f, ax = plt.subplots(2, sharex = True)
ax[0].plot(df['time'], df['LEpos_int'])  # plot gaze

# half-height of the vertical marker drawn where an interval was split
gaze_mean = np.mean(df['LEpos_int'])

for i in range(n):

    # plot linear regression
    # The slope / intercept / r squared come from lr_struct: the slope_array,
    # intercept_array and r_squared_array names used here before are not
    # defined by any cell shown in this notebook.
    orig_fit = lr_struct[i]
    t_pair = np.asarray(df['time'][outlier_idx[i:i+2]])  # times of the two bounding outliers
    fitted = orig_fit['slope']*t_pair + orig_fit['intercept']
    ax[0].plot(t_pair, fitted, 'r')

    # compute annotation coordinates
    x = t_pair[0] + np.diff(t_pair)[0]/2.0
    y = np.diff(fitted)[0]/2.0
    ax[0].annotate('{0}'.format("%.2f" % orig_fit['r_squared']), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')

    # plot refined linear regression (only for intervals method 2 refined)
    if rf_struct1[i] is not None:
        # first segment of the chosen split
        a = rf_struct1[i][rf_bfidx[i]]

        axaxis = np.array([a['start'],a['end']])
        ax[0].plot(axaxis, a['slope']*axaxis + a['intercept'], 'y')

        # compute annotation coordinates
        x = axaxis[0] + np.diff(axaxis)[0]/2.0
        y = np.diff(a['slope']*axaxis + a['intercept'])[0]/2.0 + 2
        ax[0].annotate('{0}'.format("%.2f" % a['r_value']**2), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')

        # vertical marker at the end of the first segment (the split point)
        ax[0].plot([a['end'],a['end']], [-gaze_mean, gaze_mean], 'y')

        # second segment of the chosen split
        a = rf_struct2[i][rf_bfidx[i]]

        axaxis = np.array([a['start'],a['end']])
        ax[0].plot(axaxis, a['slope']*axaxis + a['intercept'], 'y')

        # compute annotation coordinates
        x = axaxis[0] + np.diff(axaxis)[0]/2.0
        y = np.diff(a['slope']*axaxis + a['intercept'])[0]/2.0 + 2
        ax[0].annotate('{0}'.format("%.2f" % a['r_value']**2), xy=(x,y), horizontalalignment='center', verticalalignment='bottom')

ax[1].plot(df['time'], df['velocity'])
ax[1].scatter(df['time'][df['Outlierfiltered']], df['velocity'][df['Outlierfiltered']],color ='r')
ax[1].scatter(df['time'][df['isAmbiguousOutlier']], df['velocity'][df['isAmbiguousOutlier']],color ='y')

f.suptitle(ds.filename)
ax[0].set_title('eye gaze with linear regression and r squared values')
ax[1].set_title('velocity with outliers. outlier threshold = {0}'.format(outlier_threshold))
plt.show()

# Stats

In [119]:
# Count the bad fits straight from lr_struct (r_squared_array is not defined in
# any cell shown here), and count only the intervals that were actually refined:
# m1_struct and rf_struct2 are pre-filled with None, so len() would report n.
n_badfits = sum(1 for fit in lr_struct if fit['r_squared'] < badfitth)
n_refined_m1 = sum(1 for item in m1_struct if item is not None)
n_refined_m2 = sum(1 for item in rf_struct2 if item is not None)

# Parenthesised single-argument print works as a Python 2 statement and as a
# Python 3 function call.
print('Number of linear regression intervals with a r_squared < 0.3:\t{2}\nNumber of intervals refined with ambiguous outliers: \t{1}\nNumber of intervals refined with splitting window: \t{0}'.format(
    n_refined_m2, n_refined_m1, n_badfits))

Number of linear regression intervals with a r_squared < 0.3:	1079
Number of intervals refined with ambiguous outliers: 	98
Number of intervals refined with splitting window: 	125


# Get the best fit from the two methods

In [10]:
# Indices of the intervals whose original regression is a bad fit.
badfit_idx = np.where(np.array([fit['r_squared'] for fit in lr_struct]) < badfitth)[0]

# refinedout[i] is the list of refined fits chosen for interval i, or None when
# neither method refined it. Entries are assigned by index so the list keeps
# exactly n elements (list.insert would lengthen it on every bad fit).
refinedout = [None] * n

for badidx in badfit_idx:
    m1_fits = m1_struct[badidx]

    # method 2 candidate: the two segments of the chosen split, if any
    if rf_struct1[badidx] is not None:
        best = rf_bfidx[badidx]
        m2_fits = [rf_struct1[badidx][best], rf_struct2[badidx][best]]
    else:
        m2_fits = None

    if m1_fits is not None and m2_fits is not None:
        # keep whichever method has the larger summed r squared
        # NOTE(review): the sums are not weighted by the number of segments or
        # samples (see the "To do" in the markdown above).
        cumulative_rsqrd_m1 = np.sum(np.power(np.array([item['r_value'] for item in m1_fits]),2))
        cumulative_rsqrd_m2 = np.sum(np.power(np.array([item['r_value'] for item in m2_fits]),2))

        if cumulative_rsqrd_m1 < cumulative_rsqrd_m2:
            refinedout[badidx] = m2_fits
        else:
            refinedout[badidx] = m1_fits

    elif m1_fits is not None:
        refinedout[badidx] = m1_fits

    elif m2_fits is not None:
        refinedout[badidx] = m2_fits

# Plot the two methods

In [None]:
def _draw_fit(axis, fit, color):
    """
        Draw one regression segment on `axis` and annotate it.

        Input:
        - axis: matplotlib Axes to draw on
        - fit: dict with keys 'start', 'end', 'slope', 'intercept', 'r_value', 'RSS'
        - color: matplotlib format string for the line

        Returns the y coordinate used for the annotation (half the rise of the
        line plus 2), which callers reuse to size the end-of-segment marker.
    """
    axaxis = np.array([fit['start'], fit['end']])
    fitted = fit['slope']*axaxis + fit['intercept']
    axis.plot(axaxis, fitted, color)

    # compute annotation coordinates
    x = axaxis[0] + np.diff(axaxis)[0]/2.0
    y = np.diff(fitted)[0]/2.0 + 2
    axis.annotate('r^2: {0}\nRSS: {1}\nslp: {2}'.format(
                      "%.2f" % fit['r_value']**2, "%.2f" % fit['RSS'], "%.4f" % fit['slope']),
                  xy=(x,y), horizontalalignment='center', verticalalignment='bottom')
    return y


f, ax = plt.subplots(3, sharex = True,sharey = True)

# SUBPLOT 1: gaze, outliers and original linear regression  -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

ax[0].plot(df['time'], df['LEpos_int'])  # plot gaze
ax[0].scatter(df['time'][df['Outlierfiltered']], df['LEpos_int'][df['Outlierfiltered']],color ='r')
ax[0].scatter(df['time'][df['isAmbiguousOutlier']], df['LEpos_int'][df['isAmbiguousOutlier']],color ='y')

for i in range(n):
    _draw_fit(ax[0], lr_struct[i], 'r')


# SUBPLOT 2: gaze and refined linear regression method 1 (ambiguous outliers) -- -- -- -- -- -- -- -- -- -- -- -- -- --
ax[1].plot(df['time'], df['LEpos_int'])  # plot gaze

for fits in m1_struct:
    if fits is not None:
        for fit in fits:
            y = _draw_fit(ax[1], fit, 'g')
            # magenta marker at the end of each refined segment
            ax[1].plot([fit['end'],fit['end']], [-2*y,2*y], 'm')


# SUBPLOT 3: gaze and refined linear regression method 2 (splitting window) -- -- -- -- -- -- -- -- -- -- -- -- -- --
ax[2].plot(df['time'], df['LEpos_int'])  # plot gaze

gaze_mean = np.mean(df['LEpos_int'])  # half-height of the split marker

for i in range(len(rf_struct1)):
    if rf_struct1[i] is not None:
        # first segment of the chosen split, with a yellow marker at the split point
        first = rf_struct1[i][rf_bfidx[i]]
        _draw_fit(ax[2], first, 'g')
        ax[2].plot([first['end'],first['end']], [-gaze_mean, gaze_mean], 'y')

        # second segment, with a magenta marker at its end
        second = rf_struct2[i][rf_bfidx[i]]
        y = _draw_fit(ax[2], second, 'g')
        ax[2].plot([second['end'],second['end']], [-2*y,2*y], 'm')


# # SUBPLOT 4 (disabled): gaze and refined linear regression with best method -- -- -- -- -- -- -- -- -- -- -- -- -- --
# # needs plt.subplots(4, ...) above
# ax[3].plot(df['time'], df['LEpos_int'])  # plot gaze

# for fits in refinedout:
#     if fits is not None:
#         for fit in fits:
#             y = _draw_fit(ax[3], fit, 'g')
#             ax[3].plot([fit['end'],fit['end']], [-2*y,2*y], 'm')


f.suptitle(ds.filename)
ax[0].set_title('eye gaze with linear regression and r squared values')
ax[1].set_title('refined linear regression: method 1 (ambiguous outliers)')
ax[2].set_title('refined linear regression: method 2 (split interval)')
# ax[3].set_title('greater summed r squared')

plt.show()

**Plot original linear regression with r squared, RSS and slope values**

In [None]:
# Single panel: gaze trace, outlier markers and the original per-interval regression.
f, ax = plt.subplots(1, sharex = True,sharey = True)

is_outlier = df['Outlierfiltered']
is_ambiguous = df['isAmbiguousOutlier']

ax.plot(df['time'], df['LEpos_int'])  # gaze
ax.scatter(df['time'][is_outlier], df['LEpos_int'][is_outlier], color ='r')
ax.scatter(df['time'][is_ambiguous], df['LEpos_int'][is_ambiguous], color ='y')

for idx in range(n):
    fit = lr_struct[idx]

    # only fits whose absolute slope is below the cut-off are drawn
    if not (abs(fit['slope']) < 10.0008):
        continue

    t_ends = np.array([fit['start'], fit['end']])
    fitted = fit['slope']*t_ends + fit['intercept']
    ax.plot(t_ends, fitted, 'r')

    # label position: x halfway along the interval, y at half the rise plus 2
    label_x = t_ends[0] + np.diff(t_ends)[0]/2.0
    label_y = np.diff(fitted)[0]/2.0 + 2
    label = 'r^2: {0}\nRSS: {1}\nslp: {2}'.format(
        "%.2f" % fit['r_value']**2, "%.2f" % fit['RSS'], "%.4f" % fit['slope'])
    ax.annotate(label, xy=(label_x, label_y),
                horizontalalignment='center', verticalalignment='bottom')

f.suptitle(ds.filename)
ax.set_title('eye gaze with linear regression and r squared values')

plt.show()

# NEXT STEP: CLASSIFY PERCEPTS

1 - Create new struct and fill it with good fits of the original linear regression and the best from the refined

In [129]:
# Flat list of fits: each interval contributes its refined fits when it was a
# bad fit and a refinement exists, otherwise its original regression. The list
# can therefore be longer than n.
bad_intervals = set(int(k) for k in badfit_idx)  # constant-time membership test

newlrstruct = []
for i in range(n):                     # for all the outlier-intervals
    refined = refinedout[i]
    if i in bad_intervals and refined is not None:   # bad original fit that was refined
        newlrstruct.extend(refined)
    else:                              # good fit, or bad fit without refinement
        newlrstruct.append(lr_struct[i])

2 - Define threshold of length of interval, slope and r squared

In [76]:
# Thresholds for classifying a fit as a percept. The values match the default
# arguments in classifyfit's signature (30, 0.0007, 0.3).
threslen = 30       # minimal segment length
thresslo = 0.0007   # slope threshold
thresrsq = badfitth # r squared threshold

3 - For each fit, get the length, the slope and the r squared

In [137]:
# Label every fit with its percept ('A', 'B' or 'ambg'), stored in place under
# the 'percept' key. The thresholds defined in the cell above are passed
# explicitly so that changing them there takes effect here.
# NOTE: classifyfit is defined in a cell further down this notebook; that cell
# has to be executed before this one.
for fit in newlrstruct:
    fit['percept'] = classifyfit(fit, threslen = threslen, thresslo = thresslo, thresrsq = thresrsq)

In [140]:
# Percept label of every fit, in the same order as newlrstruct.
p = [fit['percept'] for fit in newlrstruct]
# np.where(p=='B')
# p

In [96]:
def classifyfit(fit, threslen = 30, thresslo = 0.0007, thresrsq = 0.3):
    """
        Compute the percept for a given fit.
        Three possibilities: 'A', 'B' or 'ambg' (ambiguous)

        A fit is classified as 'A' or 'B' only if it meets all three
        conditions below; otherwise it is 'ambg'.

        Input:
        - fit: dict containing the result of a linear regression using regressionbtwpoints().
               The keys read here are 'start_idx', 'end_idx', 'slope' and 'r_squared'.
        - threslen: minimum interval length, measured as end_idx - start_idx.
                    Shorter intervals are ambiguous.
        - thresslo: the absolute slope must be strictly larger than this.
        - thresrsq: r squared must be strictly larger than this.

        Output:
        - 'B' if the slope is positive, 'A' if it is negative, 'ambg' if any
          condition fails.
    """

    # length of the interval in index units (samples)
    fit_len = fit['end_idx'] - fit['start_idx']

    long_enough = fit_len >= threslen
    steep_enough = np.abs(fit['slope']) > thresslo
    good_fit = fit['r_squared'] > thresrsq

    if not (long_enough and steep_enough and good_fit):
        return 'ambg'

    # the sign of the slope decides the percept: slope<0: A, slope>0: B
    if fit['slope'] > 0:
        return 'B'
    return 'A'

In [None]:
# Single panel: gaze trace, outlier markers and every fit of newlrstruct with
# its statistics and percept label.
f, ax = plt.subplots(1, sharex = True,sharey = True)

ax.plot(df['time'], df['LEpos_int'])  # plot gaze
ax.scatter(df['time'][df['Outlierfiltered']], df['LEpos_int'][df['Outlierfiltered']],color ='r')
ax.scatter(df['time'][df['isAmbiguousOutlier']], df['LEpos_int'][df['isAmbiguousOutlier']],color ='y')

# Iterate over newlrstruct itself: refined intervals contribute several fits,
# so the list can be longer than n and range(n) would skip the trailing fits.
for a in newlrstruct:
    # NOTE(review): the 10.0008 slope cut-off looks like a leftover filter
    # value -- confirm the intended threshold.
    if abs(a['slope']) < 10.0008:
        axaxis = np.array([a['start'],a['end']])
        ax.plot(axaxis, a['slope']*axaxis + a['intercept'], 'r')

        # compute annotation coordinates
        x = axaxis[0] + np.diff(axaxis)[0]/2.0
        y = np.diff(a['slope']*axaxis + a['intercept'])[0]/2.0 + 2
        ax.annotate('r^2: {0}\nRSS: {1}\nslp: {2}\n{3}'.format(
                        "%.2f" % a['r_value']**2,"%.2f" % a['RSS'], "%.4f" % a['slope'], a['percept']),
                               xy=(x,y), horizontalalignment='center', verticalalignment='bottom')

f.suptitle(ds.filename)
ax.set_title('eye gaze with linear regression and r squared values')

plt.show()