In [None]:
%matplotlib
#%matplotlib inline
import os
import csv
import fnmatch
import numpy as np
import datetime
import re 
import pandas as pd
import matplotlib.pyplot as plt
import math
import xlsxwriter
import scipy.stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels
from statsmodels.graphics.regressionplots import abline_plot
from statsmodels.stats.api import anova_lm
import seaborn as sns

# Treat +/-inf as missing values, so they are handled like NaN by the pandas operations below.
# NOTE(review): this option is deprecated in recent pandas releases - confirm the installed version still accepts it.
pd.options.mode.use_inf_as_na = True

Obtain data in form of pandas dataframe

In [None]:
# Location of the CSV file that holds the saved summary data
fileName = r"C:\DTU\Data\201805_HealthnRehab\data_summary.csv"

df = pd.read_csv(fileName, sep=',')

# Distinct typing-mechanism labels found in the raw file
typingMechanismDictKeys = set(df['typing_mechanism'])

# Reference frame that maps the integer codes back to readable labels
# (row 0: Less than 30 / Male / Dwell-Time, row 1: Greater than 30 / Female / Multi-Key Selection)
labels = ['age_bins', 'gender', 'typing_mechanism']
dataReference = [
    ['Less than 30', 'Male', 'Dwell-Time'],
    ['Greater than 30', 'Female', 'Multi-Key Selection'],
]
df_refAgeGender = pd.DataFrame.from_records(dataReference, columns=labels)

# Swap the string labels in df for their integer codes
df = df.replace({
    'Less than 30': 0, 'Greater than 30': 1,
    'Male': 0, 'Female': 1,
    'DT' : 0, 'MS' : 1,
})

# Keep only the rows where gender, age bin and gaze experience are all present
df_woNaGenderAgeExperience = df.dropna(
    subset=['gender','age_bins', 'gaze_interaction_experience'],
    how='any',
)

# Work on an independent copy so the column assignments below do not hit a view of df
df_ToAnalyze = df_woNaGenderAgeExperience.copy()

# Store each categorical factor as its integer category code
for columnName in ('gender', 'typing_mechanism', 'age_bins', 'gaze_interaction_experience'):
    df_ToAnalyze[columnName] = pd.Categorical(df_woNaGenderAgeExperience[columnName]).codes

In [None]:
# Display the subject identifiers of the rows kept for analysis
df_ToAnalyze.subject_name

In [None]:
# Number of subjects at each level of every categorical factor (levels are integer codes)
genderCodes = df_ToAnalyze['gender']
n_Males = (genderCodes == 0).sum()
n_Females = (genderCodes == 1).sum()

ageCodes = df_ToAnalyze['age_bins']
n_Below30 = (ageCodes == 0).sum()
n_Above30 = (ageCodes == 1).sum()

mechanismCodes = df_ToAnalyze['typing_mechanism']
n_DT = (mechanismCodes == 0).sum()
n_MS = (mechanismCodes == 1).sum()

# Experience codes as used by the original analysis: 0 = multiple times, 1 = never, 2 = once
experienceCodes = df_ToAnalyze['gaze_interaction_experience']
n_ExperienceNever = (experienceCodes == 1).sum()
n_ExperienceMultiple = (experienceCodes == 0).sum()
n_ExperienceOnce = (experienceCodes == 2).sum()

print('Males:', n_Males, 'Females:', n_Females)
print('Below 30:', n_Below30, 'Above 30', n_Above30)
print('Dwell time', n_DT, 'Multi-key selection:', n_MS)
print('No previous experience with gaze:', n_ExperienceNever, 'Tried once before:', n_ExperienceOnce, 'Multiple times', n_ExperienceMultiple)

In [None]:
# Boolean masks for each factor level, combined below into typing-mechanism x age / gender cell counts
isDT = df_ToAnalyze['typing_mechanism'] == 0
isMS = df_ToAnalyze['typing_mechanism'] == 1
isBelow30 = df_ToAnalyze['age_bins'] == 0
isAbove30 = df_ToAnalyze['age_bins'] == 1
isMale = df_ToAnalyze['gender'] == 0
isFemale = df_ToAnalyze['gender'] == 1

# Typing mechanism x age
n_DT_Below30 = (isDT & isBelow30).sum()
n_DT_Above30 = (isDT & isAbove30).sum()
n_MS_Below30 = (isMS & isBelow30).sum()
n_MS_Above30 = (isMS & isAbove30).sum()

# Typing mechanism x gender
n_DT_Males = (isDT & isMale).sum()
n_DT_Females = (isDT & isFemale).sum()
n_MS_Males = (isMS & isMale).sum()
n_MS_Females = (isMS & isFemale).sum()

print(n_DT_Below30, n_DT_Above30, n_MS_Below30, n_MS_Above30)
print(n_DT_Males, n_DT_Females, n_MS_Males, n_MS_Females)

In [None]:
# Experience ratings (comfort, challenge level, fun): display the reported challenge level for each subject
df_ToAnalyze.how_challenging_was_the_task

# Analysis of typing speed 

### OLS regression

In [None]:
# OLS regression of typing speed on the three factors, used to inspect the
# residuals before running ANOVA.
#
# sm.OLS does not add an intercept by itself. Without one, the baseline group
# (all codes 0) is forced to a predicted typing speed of 0, the reported
# R-squared is the uncentred one, and the residuals are distorted. The constant
# column is therefore added explicitly.
# NOTE: R-squared and the coefficients differ from a no-intercept fit, so the
# written commentary below this cell must be re-checked against the new summary.
X = sm.add_constant(df_ToAnalyze[['age_bins', 'typing_mechanism', 'gender']]) # independent variables + intercept
y_typingSpeed = df_ToAnalyze.typing_speed # dependent variable

model_typingSpeed = sm.OLS(y_typingSpeed, X)
model_fit_typingSpeed = model_typingSpeed.fit()

# Fitted coefficients (const, age_bins, typing_mechanism, gender)
p_typingSpeed = model_fit_typingSpeed.params

# Q-Q plot of the residuals to judge their normality
residuals_typingSpeed = model_fit_typingSpeed.resid
fig = sm.qqplot(residuals_typingSpeed)
plt.show()

model_fit_typingSpeed.summary()

Comments:
About 73.5% of the variance is explained by the model (R² = 0.735), which is a high value.
Prob(F-statistic) --> The null hypothesis (that the variability is random) is rejected, and the alternative hypothesis (that the
variability can be explained by the model) is accepted.

From the Q-Q plot, we see that the residuals are approximately normally distributed.

### ANOVA 

In [None]:
# Two-way ANOVA (type II sums of squares) of typing speed on age, gender and
# typing mechanism, including all pairwise interactions.
# The formula refers to column names and the frame is passed through `data`,
# so the model no longer depends on a notebook variable being called df_ToAnalyze.
# The rows of the ANOVA table are labelled with the plain column names.
formula_typingSpeed = (
    'typing_speed ~ age_bins + gender + typing_mechanism'
    ' + age_bins:gender + age_bins:typing_mechanism + gender:typing_mechanism'
)
model_typingSpeed = ols(formula_typingSpeed, data=df_ToAnalyze).fit()
aov_table_typingSpeed = anova_lm(model_typingSpeed, typ=2)
print(aov_table_typingSpeed)

### Plots of typing speed wrt independent variables

In [None]:
# Box plots of typing speed, one panel per independent variable.

figBoxPlot = plt.figure()

# y-axis ticks shared by all panels. They are derived from the frame that is
# actually plotted (df_ToAnalyze) rather than from `df`, which still contains
# the dropped rows and is reassigned further down the notebook.
typingSpeedTicks = np.arange(0, float(df_ToAnalyze.typing_speed.max())+2, step = 2)

# (subplot position, grouping column, tick labels for codes 0 and 1)
boxPlotPanels = [
    (131, 'age_bins', ['Less than 30', 'Greater than 30']),
    (132, 'gender', ['Male', 'Female']),
    (133, 'typing_mechanism', ['Dwell time', 'MultiKey selection']),
]

for position, groupColumn, groupLabels in boxPlotPanels:
    ax = figBoxPlot.add_subplot(position)
    df_ToAnalyze.boxplot(column = 'typing_speed', by = groupColumn, ax = ax, grid = False)
    # pandas draws the two boxes at x = 1 and x = 2
    ax.set_xticks([1, 2])
    ax.set_xticklabels(groupLabels)
    ax.set_yticks(typingSpeedTicks)
    ax.set_ylabel('Typing speed [in wpm]')



In [None]:
# Typing speed per typing mechanism (code 0: dwell time, code 1: multi-key selection)
dwellSpeed = df_ToAnalyze.loc[df_ToAnalyze['typing_mechanism'] == 0, 'typing_speed']
multiKeySpeed = df_ToAnalyze.loc[df_ToAnalyze['typing_mechanism'] == 1, 'typing_speed']

print('Average speed and standard deviation for dwell time selection are:', dwellSpeed.mean(), dwellSpeed.std())
print('Average speed and standard deviation for multi-key selection are:', multiKeySpeed.mean(), multiKeySpeed.std())


In [None]:
# Histograms of typing speed: overall, and overlaid per level of each factor.
figHist = plt.figure()
# Tick positions for the x axis (every 2 wpm up to just past the maximum)
bins = np.arange(0, float(df_ToAnalyze.typing_speed.max()+2), step = 2)

# Overall distribution
ax = figHist.add_subplot(141)
df_ToAnalyze.hist(column = 'typing_speed', ax = ax, grid = False)
ax.set_xticks(bins)
ax.set_xlabel('Typing speed [in wpm]')
ax.set_yticks(np.arange(0, 11, step = 2))
ax.set_ylabel('Frequency')

# (subplot position, grouping column, legend labels for codes 0 and 1, panel title)
overlayPanels = [
    (142, 'age_bins', ['Below 30 years', 'Above 30 years'], 'Typing speed and Age'),
    (143, 'gender', ['Male', 'Female'], 'Typing speed and Gender'),
    (144, 'typing_mechanism', ['Dwell time', 'Multi-key selection'], 'Typing speed and Typing Mechanism'),
]

for position, groupColumn, groupLabels, panelTitle in overlayPanels:
    # Create each panel once and draw both groups on the same axes. Calling
    # add_subplot a second time with the same position creates a new axes on
    # top of the first one in current Matplotlib, which breaks the overlay
    # and leaves the first group out of the legend.
    ax = figHist.add_subplot(position)
    for code, groupLabel in enumerate(groupLabels):
        df_ToAnalyze.typing_speed[df_ToAnalyze[groupColumn] == code].hist(ax = ax, alpha=0.5, grid = False, label = groupLabel)
    ax.set_xticks(bins)
    ax.set_xlabel('Typing speed [in wpm]')
    ax.set_ylabel('Frequency')
    ax.set_title(panelTitle)
    ax.legend()

With the new calculations of typing speed, the histograms of typing speed look normal when split by age and by gender,
but not when split by typing mechanism. Therefore, a Mann-Whitney U test will be performed for typing mechanism (the group
variances are still more or less equal) and t-tests for the others.

### Typing speed statistical analysis

In [None]:
# Age


# Gender


# Typing mechanism
# The Mann-Whitney test in scipy.stats states that it is reliable only for a sample size of 20 in each group. Since we do not
# have that, the test is also performed manually, based on - http://psych.unl.edu/psycrs/handcomp/hcmann.PDF
n_DT = df_ToAnalyze.typing_mechanism[df_ToAnalyze['typing_mechanism'] == 0].count()
n_MS = df_ToAnalyze.typing_mechanism[df_ToAnalyze['typing_mechanism'] == 1].count()

typingSpeed_DT = df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism'] == 0]
typingSpeed_MS = df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism'] == 1]

# Rank the data of BOTH groups together, then add up the ranks that belong to
# the dwell-time group. Ranking inside the dwell-time group alone always gives
# a rank sum of n_DT*(n_DT + 1)/2, which makes U_MS equal to 0 whatever the data.
combinedRanks = pd.concat([typingSpeed_DT, typingSpeed_MS]).rank(method='average')
RankedSumDT = combinedRanks.iloc[:len(typingSpeed_DT)].sum()

U_DT = n_DT*n_MS + n_DT*(n_DT + 1)/2 - RankedSumDT
U_MS = n_DT*n_MS - U_DT
U_select = min(U_MS, U_DT)
# Critical U taken from a table.
# NOTE(review): the value depends on n_DT, n_MS and alpha - confirm it matches the group sizes printed by this notebook.
U_critical = 59
if U_select < U_critical:
    print('reject Ho')
else:
    print('fail to reject Ho')

# Using the library function; the alternative is stated explicitly because the default has changed between SciPy versions
U_stat, p_val = scipy.stats.mannwhitneyu(typingSpeed_DT, typingSpeed_MS, alternative='two-sided')
print(U_stat, p_val)

In [None]:
# Group sizes used for the Mann-Whitney test above: dwell time, multi-key selection
print(n_DT, n_MS)

In [None]:
# Kruskal-Wallis H test of typing speed between the two levels (codes 0 and 1)
# of each factor: age, gender and typing mechanism.
kruskalPrefixByFactor = {
    'age_bins': 'Kruskal result for age: H-statistic=',
    'gender': 'Kruskal result for gender: H-statistic=',
    'typing_mechanism': 'Kruskal result for typing mechanism: H-statistic=',
}

for factorColumn, resultPrefix in kruskalPrefixByFactor.items():
    a = df_ToAnalyze.loc[df_ToAnalyze[factorColumn] == 0, 'typing_speed']
    b = df_ToAnalyze.loc[df_ToAnalyze[factorColumn] == 1, 'typing_speed']
    h, p = scipy.stats.kruskal(a.values, b.values)
    print(resultPrefix, h, ', pvalue=', p)

In [None]:
# Mean error rate of the rows with typing_mechanism code 1 (multi-key selection)
df_ToAnalyze.error_rate[df_ToAnalyze.typing_mechanism==1].mean()

# Analysis of error rate

### OLS regression

In [None]:
# OLS regression of error rate on the three factors, used to inspect the
# residuals before deciding whether ANOVA is applicable.
#
# sm.OLS does not add an intercept by itself. Without one, the baseline group
# (all codes 0) is forced to a predicted error rate of 0, the reported
# R-squared and F-statistic are the uncentred ones, and the residuals are
# distorted. The constant column is therefore added explicitly.
# NOTE: the summary differs from a no-intercept fit, so the written commentary
# below this cell must be re-checked against the new output.
X = sm.add_constant(df_ToAnalyze[['age_bins', 'typing_mechanism', 'gender']]) # independent variables + intercept
y_errorRate = df_ToAnalyze.error_rate # dependent variable

model_errorRate = sm.OLS(y_errorRate, X)
model_fit_errorRate = model_errorRate.fit()

# Fitted coefficients (const, age_bins, typing_mechanism, gender)
p_errorRate = model_fit_errorRate.params

# Q-Q plot of the residuals to judge their normality
residuals_errorRate = model_fit_errorRate.resid
fig = sm.qqplot(residuals_errorRate)
plt.show()

model_fit_errorRate.summary()

Comments: The residual (Q-Q) plot is slightly skewed, so the normality assumption of ANOVA is not met.
Prob(F-statistic) is low enough to reject the null hypothesis.
Still, ANOVA is not performed because of the skewed residual plot.

In [None]:
## TODO: Perform the Scheirer-Ray-Hare test

# For now, it is performed in R

## Equivalence testing of error rate for typing mechanisms
Here, the null hypothesis is that the error rates of the two typing mechanisms are not equivalent.
For a practical equivalence margin $\delta$: $H_0: |\mu_1 - \mu_2| > \delta$ and $H_a: |\mu_1 - \mu_2| < \delta$.
So, to reject the null hypothesis, both of the following one-sided hypotheses must be rejected:
$\mu_1 - \mu_2 > \delta$ and $\mu_1 - \mu_2 < -\delta$,
or, in terms of the two one-sided tests (TOST):

$$\frac{(\bar{y}_1 - \bar{y}_2) + \delta}{\sigma\sqrt{1/n_1 + 1/n_2}} > z_{1-\alpha} \quad \text{and} \quad \frac{(\bar{y}_1 - \bar{y}_2) - \delta}{\sigma\sqrt{1/n_1 + 1/n_2}} < -z_{1-\alpha}$$

In [None]:
# Error rates of the two typing-mechanism groups (code 0: dwell time, code 1: multi-key selection)
errorRate_DT = df_ToAnalyze.loc[df_ToAnalyze['typing_mechanism'] == 0, 'error_rate']
errorRate_MS = df_ToAnalyze.loc[df_ToAnalyze['typing_mechanism'] == 1, 'error_rate']

sd_errorRate_DT = errorRate_DT.std()
sd_errorRate_MS = errorRate_MS.std()

# Group sizes
n_DT = (df_ToAnalyze['typing_mechanism'] == 0).sum()
n_MS = (df_ToAnalyze['typing_mechanism'] == 1).sum()

mean_errorRate_DT = errorRate_DT.mean()
mean_errorRate_MS = errorRate_MS.mean()

# Pooled standard deviation of the two groups
pooledVariance = ((n_DT - 1)*(sd_errorRate_DT)**2 + (n_MS - 1)*(sd_errorRate_MS)**2)/(n_DT + n_MS - 2)
sd = math.sqrt(pooledVariance)

# Candidate equivalence margin: 0.3 pooled standard deviations
print(sd*0.3)

In [None]:
# Two one-sided tests (TOST) for equivalence of the error rates of the two typing mechanisms.
delError = 10 # As per Lakens 2017 Equivalence testing, del = d*sd, where d is Cohen's d = 0.3. For the given data, sd = ~36

# Difference of the group means and its (unpooled) standard error
meanDiff_errorRate = mean_errorRate_DT - mean_errorRate_MS
stdErr_errorRate = math.sqrt(((sd_errorRate_DT**2)/n_DT) + (sd_errorRate_MS**2)/n_MS)

# Test statistics against the lower (-del) and upper (+del) equivalence bounds
tL = (meanDiff_errorRate - (-delError))/stdErr_errorRate
tU = (meanDiff_errorRate - (delError))/stdErr_errorRate

# Degrees of freedom of the two-sample t statistic: n1 + n2 - 2.
# The value is kept under its own name: assigning it to `df` would replace the
# DataFrame loaded at the top of the notebook, which later cells still index.
# NOTE(review): the standard error above is the unpooled (Welch) form; the
# Welch-Satterthwaite degrees of freedom would be the matching choice if the
# group variances differ noticeably.
dof_errorRate = (n_DT + n_MS - 2)

print('tLower:', tL, ',tUpper:', tU,'with', dof_errorRate, ' degrees of freedom')

#For t(28, 0.05) = 1.701
To reject the null hypothesis, both of the following conditions must hold:
tU < -t(28, 0.05) AND tL > t(28, 0.05)
As they do not both hold, the null hypothesis of non-equivalence cannot be rejected.
Therefore, the two error rates cannot be claimed to be equivalent.

In [None]:
# Error rate per typing mechanism (code 0: dwell time, code 1: multi-key selection)
dwellErrorRate = df_ToAnalyze.loc[df_ToAnalyze['typing_mechanism'] == 0, 'error_rate']
multiKeyErrorRate = df_ToAnalyze.loc[df_ToAnalyze['typing_mechanism'] == 1, 'error_rate']

print('Average error rate and standard deviation for dwell time selection are:', dwellErrorRate.mean(), dwellErrorRate.std())
print('Average error rate and standard deviation for multi-key selection are:', multiKeyErrorRate.mean(), multiKeyErrorRate.std())

### Plots of error rate wrt independent variables

In [None]:
# Box plots of error rate, one panel per independent variable.
fig = plt.figure()

# (subplot position, grouping column, tick labels for codes 0 and 1)
errorRatePanels = [
    (131, 'age_bins', ['Less than 30', 'Greater than 30']),
    (132, 'gender', ['Male', 'Female']),
    (133, 'typing_mechanism', ['Dwell time', 'MultiKey selection']),
]

for position, groupColumn, groupLabels in errorRatePanels:
    ax = fig.add_subplot(position)
    df_ToAnalyze.boxplot(column = 'error_rate', by = groupColumn, ax = ax, grid = False)
    # pandas draws the two boxes at x = 1 and x = 2
    ax.set_xticks([1, 2])
    ax.set_xticklabels(groupLabels)
    # The plotted quantity is the error rate, so the axis is labelled as such
    # (the label was previously copied over from the typing-speed plots).
    ax.set_ylabel('Error rate [in %]')


### Error rate statistical analysis

In [None]:
# Kruskal-Wallis H test of error rate between the two levels (codes 0 and 1)
# of each factor: age, gender and typing mechanism.
kruskalPrefixByFactor = {
    'age_bins': 'Kruskal result for age: H-statistic=',
    'gender': 'Kruskal result for gender: H-statistic=',
    'typing_mechanism': 'Kruskal result for typing mechanism: H-statistic=',
}

for factorColumn, resultPrefix in kruskalPrefixByFactor.items():
    a = df_ToAnalyze.loc[df_ToAnalyze[factorColumn] == 0, 'error_rate']
    b = df_ToAnalyze.loc[df_ToAnalyze[factorColumn] == 1, 'error_rate']
    h, p = scipy.stats.kruskal(a.values, b.values)
    print(resultPrefix, h, ', pvalue=', p)

In [None]:
# Quick overall histogram of typing speed.
# The column is taken as a NumPy array directly; np.hstack over a Series
# stacks its values one scalar at a time to produce the same array.
k = df_ToAnalyze.typing_speed.to_numpy()
plt.hist(k)

In [None]:
# Plot of error rate and typing speed

In [None]:
# Scatter plot of error rate against typing speed, each point annotated with the subject name.
n = df_ToAnalyze.subject_name.values
typingSpeedValues = df_ToAnalyze.typing_speed.values
errorRateValues = df_ToAnalyze.error_rate.values

plt.plot(typingSpeedValues, errorRateValues, 'o')

plt.xlabel('Typing speed [in wpm]')
plt.ylabel('Error rate [in %]')
# No legend is requested: nothing in this plot carries a label, so a legend
# call would only emit a warning and draw nothing.
for txt, xValue, yValue in zip(n, typingSpeedValues, errorRateValues):
    plt.annotate(txt, (xValue, yValue))
# Pass a real boolean: a string such as 'on' only works because it is truthy
plt.grid(True)

In [None]:
# Display the subject names that were used as point labels in the scatter plot above
n

### Correlation between typing speed and error rate

In [None]:
# Typing speed (left axis, blue) and error rate (right axis, red) of every subject,
# with the subjects ordered by increasing typing speed.
typingSpeedColumn = df_ToAnalyze['typing_speed']
errorRateColumn = df_ToAnalyze['error_rate']

# Correlation coefficient, as text for the title
c = str(typingSpeedColumn.corr(errorRateColumn))

fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
ax2 = ax1.twinx()

# Subjects sorted by typing speed; the index labels give the plotting order
a = df_ToAnalyze.typing_speed.sort_values()

i = 0
for i, ind in enumerate(a.keys(), start=1):
    ax1.plot(i, typingSpeedColumn.loc[ind], 'bo')
    ax2.plot(i, errorRateColumn.loc[ind], 'ro')

ax1.set_title('Correlation between typing speed and error rate is: %s' %c)
ax1.set_ylabel('Typing speed [in wpm]', color = 'b')
ax1.set_yticks(np.arange(0, 15, 2))
ax2.set_ylabel('Error rate [in %]', color = 'r')

In [None]:
# Bar chart of mean typing speed (left axis) and mean error rate (right axis)
# per typing mechanism, with one-sided error bars of one standard deviation.
label_fontSize = 60
others_fontSize = 54

fig = plt.figure(figsize=(24,24))
ax1 = fig.add_subplot(1,1,1)
ax2 = ax1.twinx()

isDT = df_ToAnalyze.typing_mechanism == 0
isMS = df_ToAnalyze.typing_mechanism == 1

# Typing speed bars at x = 1, 2 on the left axis
typingSpeed_DT = df_ToAnalyze.typing_speed[isDT]
typingSpeed_MS = df_ToAnalyze.typing_speed[isMS]
ax1.bar(1, typingSpeed_DT.mean(), yerr = [[0], [typingSpeed_DT.std()]], label = 'Dwell time')
ax1.bar(2, typingSpeed_MS.mean(), yerr = [[0], [typingSpeed_MS.std()]], label = 'Multi-key selection')
ax1.set_ylabel('Typing speed [in wpm]', fontsize = label_fontSize)
ax1.set_yticks(np.arange(0, 11, 2))
ax1.legend(bbox_to_anchor=(1, 0.848), loc = 'lower right', fontsize = label_fontSize)
ax1.tick_params(axis='both', which='major', labelsize= others_fontSize)

# Error rate bars at x = 5, 6 on the right axis
errorRate_DT = df_ToAnalyze.error_rate[isDT]
errorRate_MS = df_ToAnalyze.error_rate[isMS]
ax2.bar(5, errorRate_DT.mean(), yerr = [[0], [errorRate_DT.std()]])
ax2.bar(6, errorRate_MS.mean(), yerr = [[0], [errorRate_MS.std()]])
ax2.set_ylabel('Error rate [in %]', fontsize = label_fontSize)
# Hide the x ticks (the x axis is shared by both axes)
ax2.set_xticks([])
ax2.set_ylim([0, 100])
ax2.tick_params(axis='both', which='major', labelsize=others_fontSize)
plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig option; current Matplotlib
# raises a TypeError for it, so it is not passed here.
fig.savefig('typingSpeed_errorRate_DTandMS.png', dpi = 300)

In [None]:
# Bar chart of mean typing speed (left axis) and mean error rate (right axis)
# per typing mechanism, with one-sided error bars of one standard deviation
# and significance brackets above each pair of bars.
label_fontSize = 98
others_fontSize = 90

fig = plt.figure(figsize=(30,30))
ax1 = fig.add_subplot(1,1,1)
ax2 = ax1.twinx()

isDT = df_ToAnalyze.typing_mechanism == 0
isMS = df_ToAnalyze.typing_mechanism == 1

# --- Typing speed (left axis) ---
x1_TypingSpeed = 1
x2_TypingSpeed = 2
typingSpeedBars = [
    (x1_TypingSpeed, df_ToAnalyze.typing_speed[isDT], 'Dwell time'),
    (x2_TypingSpeed, df_ToAnalyze.typing_speed[isMS], 'Multi-key selection'),
]
for xPosition, groupValues, barLabel in typingSpeedBars:
    # Upper-only error bar: zero below the mean, one standard deviation above it
    upperStd = [[0], [groupValues.std()]]
    ax1.bar(xPosition, groupValues.mean(), yerr = upperStd, label = barLabel)
    ax1.errorbar(xPosition, groupValues.mean(), yerr = upperStd, elinewidth=3, ecolor='k')
ax1.set_ylabel('Typing speed [in wpm]', fontsize = label_fontSize)
ax1.set_yticks(np.arange(0, 11, 2))
ax1.legend(bbox_to_anchor=(0.27, 0.802), loc = 'lower left', fontsize = others_fontSize, frameon=False)
ax1.tick_params(axis='both', which='major', labelsize= others_fontSize)
ax1.set_xticks([1.5])
ax1.set_xticklabels(['Typing speed'])
ax1.set_ylim([0, 10.4])

# Significance bracket above the typing speed bars.
# maxTypingSpeed is the hand-picked height at which the bracket starts.
maxTypingSpeed = 9.65
y_TypingSpeed, h, col = maxTypingSpeed, 0.2, 'k'
ax1.plot([x1_TypingSpeed, x1_TypingSpeed, x2_TypingSpeed, x2_TypingSpeed], [y_TypingSpeed, y_TypingSpeed+h, y_TypingSpeed+h, y_TypingSpeed], lw=2, c=col)
ax1.text((x1_TypingSpeed+x2_TypingSpeed)*.5, y_TypingSpeed+h+0.02, "p<0.001", ha='center', va='bottom', color=col, fontsize = others_fontSize)

# --- Error rate (right axis) ---
x1_ErrorRate = 5
x2_ErrorRate = 6
errorRateBars = [
    (x1_ErrorRate, df_ToAnalyze.error_rate[isDT]),
    (x2_ErrorRate, df_ToAnalyze.error_rate[isMS]),
]
for xPosition, groupValues in errorRateBars:
    ax2.bar(xPosition, groupValues.mean())
    # elinewidth is used for both groups so the two error bars are specified the same way
    ax2.errorbar(xPosition, groupValues.mean(), yerr = [[0], [groupValues.std()]], elinewidth=3, ecolor='k')
ax2.set_ylabel('Error rate [in %]', fontsize = label_fontSize)
ax2.set_ylim([0, 104])
ax2.tick_params(axis='both', which='major', labelsize=others_fontSize)
ax2.set_xticks([1.5, 5.5])
ax2.set_xticklabels(['Typing speed', 'Error rate'])
for side in ('bottom', 'left', 'right', 'top'):
    ax2.spines[side].set_linewidth(4)

# Significance bracket above the error rate bars.
# maxErrorRate is the hand-picked height at which the bracket starts.
maxErrorRate = 76
y_ErrorRate, h, col = maxErrorRate, 2, 'k'
ax2.plot([x1_ErrorRate, x1_ErrorRate, x2_ErrorRate, x2_ErrorRate], [y_ErrorRate, y_ErrorRate+h, y_ErrorRate+h, y_ErrorRate], lw=2, c=col)
ax2.text((x1_ErrorRate+x2_ErrorRate)*.5, y_ErrorRate+h, "n.s.", ha='center', va='bottom', color=col, fontsize = others_fontSize)

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig option; current Matplotlib
# raises a TypeError for it, so it is not passed here.
fig.savefig('typingSpeed_errorRate_DTandMS.png', dpi = 300)


#plt.show()
#plt.ioff()

In [None]:
# Build a stand-alone legend image ('legend.png') for the two typing mechanisms.
# Two throw-away lines are drawn on a scratch figure only to obtain legend handles.
fig = plt.figure()
# Separate small figure that holds nothing but the legend; it has to exist
# before figlegend.legend(...) is called.
figlegend = plt.figure(figsize=(3,2))
ax = fig.add_subplot(111)
lines = ax.plot(range(10), np.random.randn(10), range(10), np.random.randn(10))
figlegend.legend(lines, ('Dwell time', 'Multi-key selection'), loc='center', prop={'size': 6})
figlegend.show()
figlegend.savefig('legend.png', dpi = 300)

# Typing Mechanism

## Typing Mechanism and other Independent variables

### Pie charts

In [None]:
# Pie charts of the overall distribution of gender, age and typing mechanism.
fig = plt.figure()

nSubj = len(df_ToAnalyze)

# (subplot position, column, slice labels for codes 0 and 1, title prefix)
piePanels = [
    (131, 'gender', ['Male', 'Female'], 'Gender distribution'),
    (132, 'age_bins', ['Less than 30', 'Greater than 30'], 'Age distribution'),
    (133, 'typing_mechanism', ['Dwell time', 'MultiKey selection'], 'Typing mechanism distribution'),
]

for position, columnName, sliceLabels, titlePrefix in piePanels:
    ax = fig.add_subplot(position)
    # Put the counts in code order (0, then 1) so each slice gets the label that
    # belongs to it; value_counts(sort=False) does not promise that order.
    levelCounts = df_ToAnalyze[columnName].value_counts().reindex([0, 1], fill_value=0)
    levelCounts.plot.pie(ax = ax, labels = sliceLabels, autopct='%1.1f%%')
    ax.set_title(titlePrefix + ' \n Total count = ' + str(nSubj))


In [None]:
# Pie charts of gender (top row) and age (bottom row) within each typing
# mechanism: dwell time in the left column, multi-key selection in the right.
fig = plt.figure()

# (subplot position, typing mechanism code, column, slice labels for codes 0 and 1, title or None)
mechanismPiePanels = [
    (221, 0, 'gender', ['Male', 'Female'], 'Dwell time'),
    (223, 0, 'age_bins', ['Less than 30', 'Greater than 30'], None),
    (222, 1, 'gender', ['Male', 'Female'], 'MultiKey Selection'),
    (224, 1, 'age_bins', ['Less than 30', 'Greater than 30'], None),
]

for position, mechanismCode, columnName, sliceLabels, panelTitle in mechanismPiePanels:
    ax = fig.add_subplot(position)
    # The mask comes from df_ToAnalyze itself. `df` is a different object (the
    # unfiltered frame, and an integer once the equivalence-test cells have run).
    inMechanism = df_ToAnalyze['typing_mechanism'] == mechanismCode
    # Put the counts in code order (0, then 1) so each slice gets the label that
    # belongs to it; value_counts(sort=False) does not promise that order.
    levelCounts = df_ToAnalyze.loc[inMechanism, columnName].value_counts().reindex([0, 1], fill_value=0)
    levelCounts.plot.pie(ax = ax, labels = sliceLabels, autopct='%1.1f%%')
    if panelTitle is not None:
        ax.set_title(panelTitle)


The minimum count in the pie charts is 5 (Female, MultiKey Selection). This is the number of subjects that will be chosen
randomly from every category, and the statistical tests will then be performed again.

# Attended But Not Selected Rate

In [None]:
# Box plot of the attended-but-not-selected time ratio for each typing mechanism

figBoxPlot = plt.figure()
ax = figBoxPlot.add_subplot(111)

df_ToAnalyze.boxplot(
    column = 'attended_but_not_selected_rate_time',
    by = 'typing_mechanism',
    ax = ax,
    grid = False,
)
# pandas draws the two boxes at x = 1 and x = 2
ax.set_xticks([1, 2])
ax.set_xticklabels(['Dwell Time', 'Multi-key Selection'])
ax.set_ylabel('Attended but not selected ratio of time')

In [None]:
# Equivalence testing (two one-sided tests) on the attended-but-not-selected rate (ansr)

ansr_DT = df_ToAnalyze.attended_but_not_selected_rate_time[df_ToAnalyze['typing_mechanism']==0]
ansr_MS = df_ToAnalyze.attended_but_not_selected_rate_time[df_ToAnalyze['typing_mechanism']==1]

sd_ansr_DT = ansr_DT.std()
sd_ansr_MS = ansr_MS.std()

mean_ansr_DT = ansr_DT.mean()
mean_ansr_MS = ansr_MS.mean()

n_DT = df_ToAnalyze.typing_mechanism[df_ToAnalyze['typing_mechanism'] == 0].count()
n_MS = df_ToAnalyze.typing_mechanism[df_ToAnalyze['typing_mechanism'] == 1].count()

# Pooled standard deviation of the two groups
sd = math.sqrt(((n_DT - 1)*(sd_ansr_DT)**2 + (n_MS - 1)*(sd_ansr_MS)**2)/(n_DT + n_MS - 2))

# Candidate equivalence margin: 0.3 pooled standard deviations
print(sd*0.3)

print(mean_ansr_DT, sd_ansr_DT)
print(mean_ansr_MS, sd_ansr_MS)

delError = 0.005 # As per Lakens 2017 Equivalence testing, del = d*sd, where d is Cohen's d = 0.3. For the given data, sd = ~0.006

# Difference of the group means and its (unpooled) standard error
meanDiff_ansr = mean_ansr_DT - mean_ansr_MS
stdErr_ansr = math.sqrt(((sd_ansr_DT**2)/n_DT) + (sd_ansr_MS**2)/n_MS)

# Test statistics against the lower (-del) and upper (+del) equivalence bounds
tL = (meanDiff_ansr - (-delError))/stdErr_ansr
tU = (meanDiff_ansr - (delError))/stdErr_ansr

# Degrees of freedom of the two-sample t statistic: n1 + n2 - 2.
# The value is kept under its own name: assigning it to `df` would replace the
# DataFrame loaded at the top of the notebook, which other cells still index.
# NOTE(review): the standard error above is the unpooled (Welch) form; the
# Welch-Satterthwaite degrees of freedom would be the matching choice if the
# group variances differ noticeably.
dof_ansr = (n_DT + n_MS - 2)

print('tLower:', tL, ',tUpper:', tU,'with', dof_ansr, ' degrees of freedom')

 For t(28, 0.05) = 1.701: to reject the null hypothesis, both of the following must hold: tU < -t(28, 0.05) AND
tL > t(28, 0.05). Here the former is true but the latter is not. So the null hypothesis cannot be rejected, and the two
cannot be said to be equivalent.


In [None]:
# Overlaid histograms of the attended-but-not-selected time ratio for the two typing mechanisms

figHist = plt.figure()
# Tick positions for the x axis (every 0.01 up to just past the maximum)
bins = np.arange(0, float(df_ToAnalyze.attended_but_not_selected_rate_time.max())+0.01, step = 0.01)

# Create the axes once and draw both groups on it. Calling add_subplot(111) a
# second time creates a new axes on top of the first one in current
# Matplotlib, which breaks the overlay and leaves the first group out of the legend.
ax = figHist.add_subplot(111)
for mechanismCode, mechanismLabel in enumerate(['Dwell time', 'Multi-key selection']):
    df_ToAnalyze.attended_but_not_selected_rate_time[df_ToAnalyze['typing_mechanism'] == mechanismCode].hist(ax = ax, alpha=0.5, grid = False, label = mechanismLabel)

ax.set_xticks(bins)
ax.set_xlabel('Attended but Not Selected Ratio of Time')
ax.set_ylabel('Frequency')
# The groups compared here are the typing mechanisms, not typing speed
ax.set_title('Typing mechanism and Attended but Not Selected Ratio')
ax.legend()

 Since the histograms are more or less normal and the means are also more or less the same, t-test can be performed

In [None]:
# Independent two-sample t-test of the attended-but-not-selected time ratio
# between dwell time (code 0) and multi-key selection (code 1).
# The masks are built from df_ToAnalyze, the frame being indexed; `df` is a
# different object (the unfiltered frame, and an integer once the
# equivalence-test cells have run).
scipy.stats.ttest_ind(
    df_ToAnalyze.attended_but_not_selected_rate_time[df_ToAnalyze.typing_mechanism == 0],
    df_ToAnalyze.attended_but_not_selected_rate_time[df_ToAnalyze.typing_mechanism == 1],
)

### Correlation between ansr and error rate

In [None]:

# Attended-but-not-selected time ratio (left axis, blue) and error rate (right axis, red)
# of every subject, with the subjects ordered by increasing ratio.
ansrColumn = df_ToAnalyze['attended_but_not_selected_rate_time']
errorRateColumn = df_ToAnalyze['error_rate']

# Correlation coefficient, as text for the title
c = str(ansrColumn.corr(errorRateColumn))

fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
ax2 = ax1.twinx()

# Subjects sorted by the ratio; the index labels give the plotting order
a = df_ToAnalyze.attended_but_not_selected_rate_time.sort_values()

i = 0
for i, ind in enumerate(a.keys(), start=1):
    ax1.plot(i, ansrColumn.loc[ind], 'bo')
    ax2.plot(i, errorRateColumn.loc[ind], 'ro')

ax1.set_title('Correlation between ansr and error rate is: %s' %c)
ax1.set_ylabel('Attended but not selected time ratio', color = 'b')
ax1.set_yticks(np.arange(0,0.13,0.02))
ax2.set_ylabel('Error rate [in %]', color = 'r')