In [1]:
%matplotlib
#%matplotlib inline
import os
import csv
import fnmatch
import numpy as np
import datetime
import re 
import pandas as pd
import matplotlib.pyplot as plt
import math
import xlsxwriter
import scipy.stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels
from statsmodels.graphics.regressionplots import abline_plot
from statsmodels.stats.api import anova_lm
import seaborn as sns

pd.options.mode.use_inf_as_na = True

Using matplotlib backend: Qt5Agg


  from pandas.core import datetools


Obtain data in form of pandas dataframe

In [6]:
# csv file with the saved data
fileName = r"C:\DTU\Data\201805_HealthnRehab\data_summary.csv"

df = pd.read_csv(fileName, delimiter=',')

typingMechanismDictKeys = set(df['typing_mechanism'])

# create reference data frame for age bins and gender (0:Less than 30/Male, 1:Greater than 30/Female)
labels = ['age_bins', 'gender', 'typing_mechanism']
dataReference = [['Less than 30', 'Male', 'Dwell-Time'], ['Greater than 30', 'Female', 'Multi-Key Selection']]
df_refAgeGender = pd.DataFrame.from_records(dataReference, columns=labels)

# replace data in df in string or category
df = df.replace({'Less than 30': 0, 'Greater than 30': 1, 'Male': 0, 'Female': 1, 'DT' : 0, 'MS' : 1})

# Create dataframe without Nan values
df_woNaGenderAgeExperience = df.dropna(subset=['gender','age_bins', 'gaze_interaction_experience'], how='any') 

# Create a copy of the dataframe to avoid loc errors.
df_ToAnalyze = df_woNaGenderAgeExperience.copy()

# Convert columns of age_bins, typing_mechanism, gender into categorical columns
df_ToAnalyze['gender'] = pd.Categorical(df_woNaGenderAgeExperience.gender).codes
df_ToAnalyze['typing_mechanism'] = pd.Categorical(df_woNaGenderAgeExperience.typing_mechanism).codes
df_ToAnalyze['age_bins'] = pd.Categorical(df_woNaGenderAgeExperience.age_bins).codes
df_ToAnalyze['gaze_interaction_experience'] = pd.Categorical(df_woNaGenderAgeExperience.gaze_interaction_experience).codes

In [7]:
df_ToAnalyze.head

<bound method NDFrame.head of           timestamp subject_name  typing_mechanism    age  age_bins  gender  \
1   5/15/2018 14:56        be_DT                 0     40         1       1   
3   5/15/2018 12:35       KEA_MS                 1  20-25         0       0   
4   5/15/2018 11:11      lone_DT                 0  50-55         1       1   
5   5/15/2018 12:12       mcc_MS                 1  20-25         0       0   
6   5/15/2018 14:44        MK_DT                 0  45-50         1       1   
7   5/15/2018 10:58        MT_MS                 1     25         0       0   
8   5/15/2018 12:28        ok_MS                 1  20-25         0       0   
9   5/15/2018 12:51        pt_DT                 0     27         0       0   
11  5/15/2018 15:42        sh_MS                 1  20-25         0       1   
12  5/15/2018 11:48       slh_DT                 0     10         0       1   
13  5/16/2018 15:13        ae_DT                 0     68         1       0   
14  5/16/2018 13:12   

In [8]:
n_Males = df_ToAnalyze.gender[df_ToAnalyze['gender'] == 0].count()
n_Females = df_ToAnalyze.gender[df_ToAnalyze['gender'] == 1].count()

n_Below30 = df_ToAnalyze.age_bins[df_ToAnalyze['age_bins'] == 0].count()
n_Above30 = df_ToAnalyze.age_bins[df_ToAnalyze['age_bins'] == 1].count()

n_DT = df_ToAnalyze.typing_mechanism[df_ToAnalyze['typing_mechanism'] == 0].count()
n_MS = df_ToAnalyze.typing_mechanism[df_ToAnalyze['typing_mechanism'] == 1].count()

n_ExperienceNever = df_ToAnalyze.gaze_interaction_experience[df_ToAnalyze['gaze_interaction_experience'] == 1].count()
n_ExperienceMultiple = df_ToAnalyze.gaze_interaction_experience[df_ToAnalyze['gaze_interaction_experience'] == 0].count()
n_ExperienceOnce = df_ToAnalyze.gaze_interaction_experience[df_ToAnalyze['gaze_interaction_experience'] == 2].count()


print('Males:', n_Males, 'Females:', n_Females)
print('Below 30:', n_Below30, 'Above 30', n_Above30)
print('Dwell time', n_DT, 'Multi-key selection:', n_MS)
print('No previous experience with gaze:', n_ExperienceNever, 'Tried once before:', n_ExperienceOnce, 'Multiple times', n_ExperienceMultiple)

Males: 16 Females: 13
Below 30: 15 Above 30 14
Dwell time 15 Multi-key selection: 14
No previous experience with gaze: 18 Tried once before: 1 Multiple times 10


In [9]:
n_DT_Below30 = df_ToAnalyze.typing_mechanism[(df_ToAnalyze['typing_mechanism'] == 0) & (df_ToAnalyze['age_bins'] == 0)].count()
n_DT_Above30 = df_ToAnalyze.typing_mechanism[(df_ToAnalyze['typing_mechanism'] == 0) & (df_ToAnalyze['age_bins'] == 1)].count()

n_MS_Below30 = df_ToAnalyze.typing_mechanism[(df_ToAnalyze['typing_mechanism'] == 1) & (df_ToAnalyze['age_bins'] == 0)].count()
n_MS_Above30 = df_ToAnalyze.typing_mechanism[(df_ToAnalyze['typing_mechanism'] == 1) & (df_ToAnalyze['age_bins'] == 1)].count()


n_DT_Males = df_ToAnalyze.typing_mechanism[(df_ToAnalyze['typing_mechanism'] == 0) & (df_ToAnalyze['gender'] == 0)].count()
n_DT_Females = df_ToAnalyze.typing_mechanism[(df_ToAnalyze['typing_mechanism'] == 0) & (df_ToAnalyze['gender'] == 1)].count()

n_MS_Males = df_ToAnalyze.typing_mechanism[(df_ToAnalyze['typing_mechanism'] == 1) & (df_ToAnalyze['gender'] == 0)].count()
n_MS_Females = df_ToAnalyze.typing_mechanism[(df_ToAnalyze['typing_mechanism'] == 1) & (df_ToAnalyze['gender'] == 1)].count()


print(n_DT_Below30, n_DT_Above30, n_MS_Below30, n_MS_Above30)
print(n_DT_Males, n_DT_Females, n_MS_Males, n_MS_Females)

7 8 8 6
7 8 9 5


In [13]:
# Experience (comfort, challenge level, fun) - 
df_ToAnalyze.how_challenging_was_the_task

AttributeError: 'DataFrame' object has no attribute 'how_challenging_was_the_task'

# Analysis of typing speed 

### OLS regression

In [3]:
# OLS regression to check residuals of data for anova testing 
X = df_ToAnalyze[['age_bins', 'typing_mechanism', 'gender']] # independent variable
y_typingSpeed = df_ToAnalyze.typing_speed # dependent variable

model_typingSpeed = sm.OLS(y_typingSpeed, X)
model_fit_typingSpeed = model_typingSpeed.fit()

p_typingSpeed = model_fit_typingSpeed.params

# Plot the residuals of each
residuals_typingSpeed = model_fit_typingSpeed.resid # residuals
fig = sm.qqplot(residuals_typingSpeed)
plt.show()

model_fit_typingSpeed.summary()

0,1,2,3
Dep. Variable:,typing_speed,R-squared:,0.731
Model:,OLS,Adj. R-squared:,0.7
Method:,Least Squares,F-statistic:,23.6
Date:,"Fri, 27 Jul 2018",Prob (F-statistic):,1.37e-07
Time:,10:51:14,Log-Likelihood:,-76.312
No. Observations:,29,AIC:,158.6
Df Residuals:,26,BIC:,162.7
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age_bins,0.5286,1.227,0.431,0.670,-1.994,3.051
typing_mechanism,6.2813,1.064,5.903,0.000,4.094,8.468
gender,2.7654,1.239,2.232,0.034,0.219,5.312

0,1,2,3
Omnibus:,0.106,Durbin-Watson:,2.013
Prob(Omnibus):,0.948,Jarque-Bera (JB):,0.036
Skew:,0.042,Prob(JB):,0.982
Kurtosis:,2.849,Cond. No.,2.2


Comments:
Only 73.5% of variance is explained -- is high correlation value
Prob(F-statistic) --> Null hypothesis is rejected, that the variability is random and alternate hypothesis that the variability
can be explained is accepted

From the graph, we see that the residuals are normally distributed

### ANOVA 

In [4]:
formula_typingSpeed = 'df_ToAnalyze.typing_speed ~ df_ToAnalyze.age_bins + df_ToAnalyze.gender + df_ToAnalyze.typing_mechanism + df_ToAnalyze.age_bins:df_ToAnalyze.gender + df_ToAnalyze.age_bins:df_ToAnalyze.typing_mechanism + df_ToAnalyze.gender:df_ToAnalyze.typing_mechanism'
model_typingSpeed = ols(formula_typingSpeed, df_ToAnalyze).fit()
aov_table_typingSpeed = statsmodels.stats.anova.anova_lm(model_typingSpeed, typ=2)
print(aov_table_typingSpeed)

                                                       sum_sq    df  \
df_ToAnalyze.age_bins                               28.096317   1.0   
df_ToAnalyze.gender                                  0.038918   1.0   
df_ToAnalyze.typing_mechanism                       48.519850   1.0   
df_ToAnalyze.age_bins:df_ToAnalyze.gender            8.279473   1.0   
df_ToAnalyze.age_bins:df_ToAnalyze.typing_mecha...   0.089929   1.0   
df_ToAnalyze.gender:df_ToAnalyze.typing_mechanism    3.831038   1.0   
Residual                                            71.913248  22.0   

                                                            F    PR(>F)  
df_ToAnalyze.age_bins                                8.595342  0.007720  
df_ToAnalyze.gender                                  0.011906  0.914101  
df_ToAnalyze.typing_mechanism                       14.843394  0.000863  
df_ToAnalyze.age_bins:df_ToAnalyze.gender            2.532891  0.125764  
df_ToAnalyze.age_bins:df_ToAnalyze.typing_mecha...   0.027511

### Plots of typing speed wrt independent variables

In [37]:
# Box Plots

figBoxPlot = plt.figure()

# Age
ax = figBoxPlot.add_subplot(131)
df_ToAnalyze.boxplot( column = 'typing_speed', by='age_bins', ax = ax, grid = False)
plt.xticks([1, 2], ['Less than 30', 'Greater than 30'])
plt.yticks(np.arange(0, float(df.typing_speed.max())+2, step = 2))
plt.ylabel('Typing speed [in wpm]')

# Gender
ax = figBoxPlot.add_subplot(132)
df_ToAnalyze.boxplot(column = 'typing_speed', by='gender', ax = ax, grid = False)
plt.xticks([1, 2], ['Male', 'Female'])
plt.yticks(np.arange(0, float(df.typing_speed.max())+2, step = 2))
plt.ylabel('Typing speed [in wpm]')

# Typing mechanism
ax = figBoxPlot.add_subplot(133)
df_ToAnalyze.boxplot(column = 'typing_speed', by='typing_mechanism', ax = ax, grid = False)
plt.xticks([1, 2], ['Dwell time', 'MultiKey selection'])
plt.yticks(np.arange(0, float(df.typing_speed.max())+2, step = 2))
plt.ylabel('Typing speed [in wpm]')

print('Average speed and standard deviation for dwell time selection are:', df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism']==0].mean(), df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism']==0].std())
print('Average speed and standard deviation for multi-key selection are:', df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism']==1].mean(), df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism']==1].std())


Average speed and standard deviation for dwell time selection are: 4.6125712236 2.123543537879204
Average speed and standard deviation for multi-key selection are: 7.495420426000001 1.9917046006795018




<matplotlib.legend.Legend at 0x20d68b8e518>

In [40]:
# Histograms
figHist = plt.figure()
bins = np.arange(0, float(df_ToAnalyze.typing_speed.max()+2), step = 2)

# Age
ax = figHist.add_subplot(141)
df_ToAnalyze.hist( column = 'typing_speed', ax = ax, grid = False)
plt.xticks(bins)
plt.xlabel('Typing speed [in wpm]')
plt.yticks(np.arange(0, 11, step = 2))
plt.ylabel('Frequency')

ax = figHist.add_subplot(142)
df_ToAnalyze.typing_speed[df_ToAnalyze['age_bins'] == 0].hist(ax = ax, alpha=0.5, grid = False, label = 'Below 30 years')
#df_ToAnalyze.hist(df_ToAnalyze.typing_speed[df_ToAnalyze['age_bins'] == 0], bins, ax = ax, grid = False)
plt.xticks(bins)
plt.xlabel('Typing speed [in wpm]')
plt.ylabel('Frequency')

ax = figHist.add_subplot(142)
df_ToAnalyze.typing_speed[df_ToAnalyze['age_bins'] == 1].hist(ax = ax, alpha=0.5, grid = False, label = 'Above 30 years')
plt.xticks(bins)
plt.xlabel('Typing speed [in wpm]')
plt.ylabel('Frequency')
plt.title('Typing speed and Age')
plt.legend()

# Gender

ax = figHist.add_subplot(143)
df_ToAnalyze.typing_speed[df_ToAnalyze['gender'] == 0].hist(ax = ax, alpha=0.5, grid = False, label = 'Male')
#df_ToAnalyze.hist(df_ToAnalyze.typing_speed[df_ToAnalyze['age_bins'] == 0], bins, ax = ax, grid = False)
plt.xticks(bins)
plt.xlabel('Typing speed [in wpm]')
plt.ylabel('Frequency')

ax = figHist.add_subplot(143)
df_ToAnalyze.typing_speed[df_ToAnalyze['gender'] == 1].hist(ax = ax, alpha=0.5, grid = False, label = 'Female')
plt.xticks(bins)
plt.xlabel('Typing speed [in wpm]')
plt.ylabel('Frequency')
plt.title('Typing speed and Gender')
plt.legend()

# Typing Mechanism

ax = figHist.add_subplot(144)
df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism'] == 0].hist(ax = ax, alpha=0.5, grid = False, label = 'Dwell time')
#df_ToAnalyze.hist(df_ToAnalyze.typing_speed[df_ToAnalyze['age_bins'] == 0], bins, ax = ax, grid = False)
plt.xticks(bins)
plt.xlabel('Typing speed [in wpm]')
plt.ylabel('Frequency')

ax = figHist.add_subplot(144)
df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism'] == 1].hist(ax = ax, alpha=0.5, grid = False, label = 'Multi-key selection')
plt.xticks(bins)
plt.xlabel('Typing speed [in wpm]')
plt.ylabel('Frequency')
plt.title('Typing speed and Typing Mechanism')
plt.legend()



<matplotlib.legend.Legend at 0x20d68ae5c88>

With the new calculations of typing speed, the histograms for typing speed are normal when distributed as per age and gender, 
but not using typing mechanism. Therefore, mann whitney test will be performed for typing mechanism (their variances are still
more or less equal) and t-test for the others.

### Typing speed statistical analysis

In [55]:
# Age


# Gender


# Typing mechanism
# the mann whitney test in scipy.stats clearly says that it is reliable only for sample size of 20 in each. Since we do not have
# that, this test will be performed manually, based on - http://psych.unl.edu/psycrs/handcomp/hcmann.PDF
n_DT = df_ToAnalyze.typing_mechanism[df_ToAnalyze['typing_mechanism'] == 0].count()
n_MS = df_ToAnalyze.typing_mechanism[df_ToAnalyze['typing_mechanism'] == 1].count()

# Rank the data and add up
RankedSumDT = sum(df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism'] == 0].rank(axis=0, method='average'))

U_DT = n_DT*n_MS + n_DT*(n_DT + 1)/2 - RankedSumDT
U_MS = n_DT*n_MS - U_DT
U_select = min(U_MS, U_DT)
U_critical = 59
if U_select < U_critical:
    print('reject Ho')

# using the function
U_stat, p_val = scipy.stats.mannwhitneyu(df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism'] == 0], df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism'] == 1])
print(U_stat, p_val)

reject Ho
31.0 0.0006687654485574123


In [56]:
print(n_DT, n_MS)

15 14


In [None]:
# Age
a = df_ToAnalyze.typing_speed[df_ToAnalyze['age_bins']==0]
b = df_ToAnalyze.typing_speed[df_ToAnalyze['age_bins']==1]
(h, p) = scipy.stats.kruskal(a.values, b.values)
print('Kruskal result for age: H-statistic=', h, ', pvalue=', p)

# Gender
a = df_ToAnalyze.typing_speed[df_ToAnalyze['gender']==0]
b = df_ToAnalyze.typing_speed[df_ToAnalyze['gender']==1]
(h, p) = scipy.stats.kruskal(a.values, b.values)
print('Kruskal result for gender: H-statistic=', h, ', pvalue=', p)

# Typing mechanism
a = df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism']==0]
b = df_ToAnalyze.typing_speed[df_ToAnalyze['typing_mechanism']==1]
(h, p) = scipy.stats.kruskal(a.values, b.values)
print('Kruskal result for typing mechanism: H-statistic=', h, ', pvalue=', p)

# Analysis of error rate

### OLS regression

In [None]:
# OLS regression to check residuals of data for anova testing 
X = df_ToAnalyze[['age_bins', 'typing_mechanism', 'gender']] # independent variable
y_errorRate = df_ToAnalyze.error_rate # dependent variable

model_errorRate = sm.OLS(y_errorRate,X)
model_fit_errorRate = model_errorRate.fit()

p_errorRate = model_fit_errorRate.params

# Plot the residuals of each
residuals_errorRate = model_fit_errorRate.resid # residuals
fig = sm.qqplot(residuals_errorRate)
plt.show()

model_fit_errorRate.summary()

Comments: The residual plot is slightly skewed --> anova cannot be performed 
But Prob(F-statistic) is low enough to reject null hypothesis
Still, anova is not performed due to the skewed residual plot

In [None]:
## TODO: Perform Shierer Ray Hare test 

# For now, it is performed in R

## Equivalence testing of error rate for typing mechansims
Here, the null hypothesis is that error rates for both is not equivalent
For a practical equivalence value of 'del', Ho: |mu1 - mu2| > del and Ha: |mu1 - mu2| < del
So, to reject the null hypothesis, both should be rejected:
mu1 - mu2 > del and mu1 - mu2 < -del
or in terms of two-one sided test :
    ((y1m - y2m) + del)/sigma(1/n1 + 1/n2) > z(1-alpha) and ((y1m - y2m) - del)/sigma(1/n1 + 1/n2) < -z(1-alpha)

In [None]:
sd_errorRate_DT = df_ToAnalyze.error_rate[df_ToAnalyze['typing_mechanism']==0].std()
sd_errorRate_MS = df_ToAnalyze.error_rate[df_ToAnalyze['typing_mechanism']==1].std()

n_DT = df_ToAnalyze.typing_mechanism[df_ToAnalyze['typing_mechanism'] == 0].count()
n_MS = df_ToAnalyze.typing_mechanism[df_ToAnalyze['typing_mechanism'] == 1].count()

mean_errorRate_DT = df_ToAnalyze.error_rate[df_ToAnalyze['typing_mechanism']==0].mean()
mean_errorRate_MS = df_ToAnalyze.error_rate[df_ToAnalyze['typing_mechanism']==1].mean()

sd = math.sqrt(((n_DT - 1)*(sd_errorRate_DT)**2 + (n_MS - 1)*(sd_errorRate_MS)**2)/(n_DT + n_MS - 2))

print(sd*0.3)

In [None]:
delError = 10 # As per Lakens 2017 Equivalence testing, del = d*sd, where d is Cohen's d = 0.3. For the given data, sd = ~36

tL = (mean_errorRate_DT - mean_errorRate_MS - (-delError))/(math.sqrt(((sd_errorRate_DT**2)/n_DT) + (sd_errorRate_MS**2)/n_MS))
tU = (mean_errorRate_DT - mean_errorRate_MS - (delError))/(math.sqrt(((sd_errorRate_DT**2)/n_DT) + (sd_errorRate_MS**2)/n_MS))

#df = ((((sd_errorRate_DT**2)/(n_DT)) + ((sd_errorRate_MS**2)/(n_MS)))**2)/(((sd_errorRate_DT/n_DT)**2)/(n_DT-1) + ((sd_errorRate_MS/n_MS)**2)/(n_MS-1))

df = (n_DT + n_MS - 1)

print('tLower:', tL, ',tUpper:', tU,'with', df, ' degrees of freedom')

#For t(28, 0.05) = 1.701
To reject null hypothesis, both of the following should be followed - 
tU < -t(28, 0.05) AND tL > t(28, 0.05)
As both are not, null hypothesis cannot be rejected 
Therefore, both cannot be equivalent

In [None]:
print('Average error rate and standard deviation for dwell time selection are:', df_ToAnalyze.error_rate[df_ToAnalyze['typing_mechanism']==0].mean(), df_ToAnalyze.error_rate[df_ToAnalyze['typing_mechanism']==0].std())
print('Average error rate and standard deviation for multi-key selection are:', df_ToAnalyze.error_rate[df_ToAnalyze['typing_mechanism']==1].mean(), df_ToAnalyze.error_rate[df_ToAnalyze['typing_mechanism']==1].std())

### Plots of error rate wrt independent variables

In [None]:
fig = plt.figure()

# Age
ax = fig.add_subplot(131)
df_ToAnalyze.boxplot(column = 'error_rate', by='age_bins', ax = ax, grid = False)
plt.xticks([1, 2], ['Less than 30', 'Greater than 30'])
#plt.yticks(np.arange(0, float(df.typing_speed.max())+2, step = 2))
plt.ylabel('Typing speed [in wpm]')

# Gender
ax = fig.add_subplot(132)
df_ToAnalyze.boxplot(column = 'error_rate', by='gender', ax = ax, grid = False)
plt.xticks([1, 2], ['Male', 'Female'])
#plt.yticks(np.arange(0, float(df.typing_speed.max())+2, step = 2))
plt.ylabel('Typing speed [in wpm]')

# Typing mechanism
ax = fig.add_subplot(133)
df_ToAnalyze.boxplot(column = 'error_rate', by='typing_mechanism', ax = ax, grid = False)
plt.xticks([1, 2], ['Dwell time', 'MultiKey selection'])
#plt.yticks(np.arange(0, float(df.typing_speed.max())+2, step = 2))
plt.ylabel('Typing speed [in wpm]')


### Error rate statistical analysis

In [None]:
# Age
a = df_ToAnalyze.error_rate[df_ToAnalyze['age_bins']==0]
b = df_ToAnalyze.error_rate[df_ToAnalyze['age_bins']==1]
(h, p) = scipy.stats.kruskal(a.values, b.values)
print('Kruskal result for age: H-statistic=', h, ', pvalue=', p)

# Typing mechanism
a = df_ToAnalyze.error_rate[df_ToAnalyze['gender']==0]
b = df_ToAnalyze.error_rate[df_ToAnalyze['gender']==1]
(h, p) = scipy.stats.kruskal(a.values, b.values)
print('Kruskal result for gender: H-statistic=', h, ', pvalue=', p)

# Typing mechanism
a = df_ToAnalyze.error_rate[df_ToAnalyze['typing_mechanism']==0]
b = df_ToAnalyze.error_rate[df_ToAnalyze['typing_mechanism']==1]
(h, p) = scipy.stats.kruskal(a.values, b.values)
print('Kruskal result for typing mechanism: H-statistic=', h, ', pvalue=', p)

In [None]:
k = np.hstack(df_ToAnalyze.typing_speed)
plt.hist(k)

# Typing Mechanism and other Independent variables

### Pie charts

In [None]:
fig = plt.figure()

nSubj = len(df_ToAnalyze)

# Overall
# Gender
ax = fig.add_subplot(131)
df_ToAnalyze.gender.value_counts(sort = False).plot.pie(ax = ax, labels = ['Male', 'Female'], autopct='%1.1f%%')
ax.set_title('Gender distribution \n Total count = ' + str(nSubj))

# Age
ax = fig.add_subplot(132)
df_ToAnalyze.age_bins.value_counts(sort = False).plot.pie(ax = ax, labels = ['Less than 30', 'Greater than 30'], autopct='%1.1f%%')
ax.set_title('Age distribution \n Total count = ' + str(nSubj))

# Typing mechanism
ax = fig.add_subplot(133)
df_ToAnalyze.typing_mechanism.value_counts(sort = False).plot.pie(ax = ax, labels = ['Dwell time', 'MultiKey selection'], autopct='%1.1f%%')
ax.set_title('Typing mechanism distribution \n Total count = ' + str(nSubj))


In [None]:
fig = plt.figure()

# DWELL TIME
# Gender and Typing mechanism
ax = fig.add_subplot(221)
df_ToAnalyze.gender[df['typing_mechanism']==0].value_counts(sort = False).plot.pie(ax = ax, labels = ['Male', 'Female'], autopct='%1.1f%%')
ax.set_title('Dwell time')

# Age and Typing mechanism
ax = fig.add_subplot(223)
df_ToAnalyze.age_bins[df['typing_mechanism']==0].value_counts(sort = False).plot.pie(ax = ax, labels = ['Less than 30', 'Greater than 30'], autopct='%1.1f%%')

# MULTIKEY SELECTION
# Gender and Typing mechanism
ax = fig.add_subplot(222)
df_ToAnalyze.gender[df['typing_mechanism']==1].value_counts(sort = False).plot.pie(ax = ax, labels = ['Male', 'Female'], autopct='%1.1f%%')
ax.set_title('MultiKey Selection')

# Age and Typing mechanism
ax = fig.add_subplot(224)
df_ToAnalyze.age_bins[df['typing_mechanism']==1].value_counts(sort = False).plot.pie(ax = ax, labels = ['Less than 30', 'Greater than 30'], autopct='%1.1f%%')


The minimum number in the pie charts is 5 (Female, MultiKey Selection). This is the number of subjects that will be choosen 
randomly from every category and the statistical tests will be performed again