This notebook contains code that can be used to reproduce the results reported in the paper:
Reading differences in eye-tracking data as a marker of high functioning autism in adults: a machine-learning study.


In [None]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.utils import shuffle
from sklearn import linear_model
import numpy as np
import random
import scipy.stats as st
import sklearn
from tqdm import tqdm
import json



def LoadTextDifficulties():
    """
    Read the two text-difficulty tables and return them as dictionaries.

    Both CSV files are read without a header row. The first column holds the
    text label and the second column holds the difficulty value; the label
    decoration ("Text " / "T") is removed so the bare text id is the key.

    Returns:
        difficulties1: dict, text id -> difficulty measured by how many users
            answer questions about the text correctly
        difficulties2: dict, text id -> difficulty according to text
            complexity measures
    """

    def table_to_lookup(csv_path, label_decoration):
        # Build {bare text id: difficulty} from a two-column, header-less CSV.
        lookup = {}
        for row in pd.read_csv(csv_path, header=None).values:
            text_key = row[0].replace(label_decoration, "").strip()
            lookup[text_key] = row[1]
        return lookup

    difficulties1 = table_to_lookup("./data/reading/t_difficulty_1.csv", "Text ")
    difficulties2 = table_to_lookup("./data/reading/t_difficulty_2.csv", "T")

    return difficulties1, difficulties2


# Parsed answer tables keyed by CSV path. GetAnswer is called once per data
# row, so the file is read on first use only instead of on every call.
_ANSWER_TABLE_CACHE = {}


def _LoadAnswerTable(path="./data/reading/ParticipantAnswer_N.csv"):
    """Return the participant-answer table, reading the CSV only on first use."""
    if path not in _ANSWER_TABLE_CACHE:
        _ANSWER_TABLE_CACHE[path] = pd.read_csv(path)
    return _ANSWER_TABLE_CACHE[path]


def GetAnswer(user_id, user_name, text_id, question_id):
    """
    Get the answer of a particular question from an user

    The participant's rows are selected by matching
    "<User initials>_<Group>" against `user_name` (whitespace-stripped) and
    " Text ID" against `text_id`.

    Arguments:
        user_id: the user id (not used for the lookup; kept for interface
            compatibility)
        user_name: the name of the user.
        text_id: the text_id of the question
        question_id: the question_id; anything `int()` accepts


    Returns:
        for question_id 0 or 4: the mean of all the user's scores on the text
        otherwise: the score of the question_id-th question (1-based), i.e.
            whether it has been answered correctly (1) or incorrectly (0)

    Raises:
        ZeroDivisionError / IndexError: when no matching score rows exist.
    """

    answers = _LoadAnswerTable()

    # Vectorised row selection (replaces a row-by-row DataFrame.apply).
    is_match = (
        (answers["User initials"] + "_" + answers["Group"] == user_name.strip())
        & (answers[" Text ID"] == text_id)
    )
    scores = answers.loc[is_match, "Score"].values

    q = int(question_id)

    # 0 and 4 do not address a single question: return the average score.
    if q == 0 or q == 4:
        return sum(int(x) for x in scores) / len(scores)

    return scores[q - 1]
   

def GetData(file_name,text_set):
    """
    Get the data given the file_name and text_set.
    Only choose users that we have at least one data point for each of the text in the dataset

    Steps:
        1. read `file_name` and keep rows whose "Text ID" is in `text_set`
        2. add "Difficulty_1" / "Difficulty_2" from LoadTextDifficulties()
        3. unless the file name contains "sentence" (case-insensitive), add an
           "Answer" column via GetAnswer
        4. keep rows with " Time to 1st View (sec)" > 0 and renumber the index
        5. derive "Group" from the " User Name" suffix when the column is
           missing, and add "Group_bin" (0 for ASD, 1 otherwise)
        6. keep users that have rows for len(text_set) texts
        7. if a user name occurs in both groups, suffix every user name with
           "_<Group>" so the names are distinct

    Arguments:
        file_name: string, path of the input CSV
        text_set: list, the text ids to keep

    Returns: a tuple of
        attentions_original: the dataframe containing all the data points of the chosen users
        user_asd_original: list, the list of chosen asd users
        user_control_original: list, the list of chosen control users
    """

    difficulties1,difficulties2 = LoadTextDifficulties()

    attentions = pd.read_csv(file_name)
    # .copy(): new columns are added below, so work on an independent frame
    # rather than on a slice of the frame that was read.
    attentions = attentions[attentions["Text ID"].isin(text_set)].copy()
    attentions["Difficulty_1"] = attentions["Text ID"].apply(lambda x: difficulties1[str(int(x))])
    attentions["Difficulty_2"] = attentions["Text ID"].apply(lambda x: difficulties2[str(int(x))])

    if "sentence" not in file_name.lower():
        attentions["Answer"] = attentions.apply(
            lambda x: GetAnswer(x[" User ID"], x[" User Name"], x["Text ID"], x["Paragraph_Number"]),axis=1)

    # The index is renumbered here and NOT after the user filter further down,
    # so the returned frame keeps these row labels (with gaps).
    attentions_original = attentions[
        attentions[" Time to 1st View (sec)"]>0].reset_index(drop=True)

    if "Group" not in attentions_original.columns:
        attentions_original["Group"] = attentions_original[" User Name"].apply(
            lambda x: "ASD" if x.endswith("ASD") else "Control")

    attentions_original["Group_bin"] = attentions_original["Group"].apply(
        lambda x: 0 if x=="ASD" else 1)

    #only choose users that we have at least one data point for each of the text in the dataset
    # NOTE(review): the count is per " User Name" over distinct
    # (Text ID, User ID, User Name) triples — confirm this is the intended
    # count when one name maps to several user ids.
    text_id_count  = attentions_original[
        ["Text ID", " User ID", " User Name"]].drop_duplicates().groupby(
        " User Name", as_index=False
    )[["Text ID"]].count()

    good_text_id_count_users = text_id_count[text_id_count["Text ID"]==len(text_set)][" User Name"]

    # .copy(): the " User Name" column may be rewritten below.
    attentions_original = attentions_original[
        attentions_original[" User Name"].isin(good_text_id_count_users)].copy()
    user_asd_original = attentions_original[
        attentions_original["Group"]=='ASD'][" User Name"].unique()
    user_control_original = attentions_original[
        attentions_original["Group"]=='Control'][" User Name"].unique()

    if len(set(user_asd_original).intersection(user_control_original))!=0:
        # Same name in both groups: make the names unique per group.
        attentions_original[" User Name"] = (
            attentions_original[" User Name"] + "_" + attentions_original["Group"])

        user_asd_original = attentions_original[
            attentions_original["Group"]=='ASD'][" User Name"].unique()
        user_control_original = attentions_original[
            attentions_original["Group"]=='Control'][" User Name"].unique()

    print("number of asd: {}; control: {}".format(len(user_asd_original),len(user_control_original)))

    return attentions_original,user_asd_original,user_control_original


def _CountCorrectUsers(attentions, row_is_correct, user_control_test, user_asd_test):
    """
    Count the test users whose rows were, by majority, classified correctly.

    Arguments:
        attentions: the dataframe of all data points; its row labels are used
            to pick each user's entries out of `row_is_correct`.
        row_is_correct: boolean Series over the test rows (True where the
            predicted label equals the true label).
        user_control_test: the control users of the test set.
        user_asd_test: the asd users of the test set.

    Returns:
        the number of users whose most frequent per-row outcome is True
    """
    outcomes = []
    for group, users in (("Control", user_control_test), ("ASD", user_asd_test)):
        for user in users:
            user_rows = attentions[
                (attentions[" User Name"]==user) & (attentions["Group"]==group)].index
            # value_counts() orders by frequency, so index[0] is the majority outcome.
            outcomes.append(row_is_correct[user_rows].value_counts().index[0])
    return sum(outcomes)


def RunSingleExperiment(
    attentions,
    user_asd_original,
    user_control_original,
    n_user_test,
    train_length,
    n_folds,
    one_hot_columns,eclf):
    """Run one experiment for a single set of feature.

    Each fold draws fresh test and train users per group, fits `eclf` on the
    train users' rows, predicts every test row, and scores each test user by
    the majority outcome over that user's rows.

    The base feature columns come from the module-level list `features`; the
    one-hot columns are those in `attentions` whose name starts with
    "<column>_" for a column in `one_hot_columns`.

      Arguments:
        attentions: the dataframe of data points for the experiment.
        user_asd_original: list, selected asd users to be included.
        user_control_original: list, selected control users to be included.
        n_user_test: int, the number of users in the test set for each group (half the total number of users used for test).
        train_length: int, the number of users in the train set for each group (half the total number of users used for train).
        n_folds: int, number of folds to run,
        one_hot_columns: list, the list of additional features
        eclf: a voting classifier ensemble

      Returns:
        results_clf: dict mapping each estimator name in the ensemble, plus
            "ensemble", to an array with the user-level accuracy of each fold
    """

    features_one_hots = [
        x for x in attentions.columns if any(x.startswith(y+"_") for y in one_hot_columns)]
    feature_columns = features+features_one_hots

    results_clf = {x[0]: [] for x in eclf.estimators}
    results_clf["ensemble"] = []

    for _ in range(n_folds):
        #sample users for training and testing
        # replace=False: np.random.choice draws WITH replacement by default,
        # which would let one user appear (and be counted) several times in
        # the same test set.
        user_asd_test = np.random.choice(user_asd_original,n_user_test,replace=False)
        held_out_asd = set(user_asd_test)
        user_asd_train = random.sample(
            [x for x in user_asd_original if x not in held_out_asd],train_length)

        user_control_test = np.random.choice(user_control_original,n_user_test,replace=False)
        held_out_control = set(user_control_test)
        user_control_train = random.sample(
            [x for x in user_control_original if x not in held_out_control],train_length)

        #get the data for training and testing
        is_asd = attentions["Group"]=="ASD"
        is_control = attentions["Group"]=="Control"

        attentions_train = attentions[
            (attentions[" User Name"].isin(user_asd_train) & is_asd)
            | (attentions[" User Name"].isin(user_control_train) & is_control)]

        attentions_test = attentions[
            (attentions[" User Name"].isin(user_asd_test) & is_asd)
            | (attentions[" User Name"].isin(user_control_test) & is_control)]

        attentions_train_X = attentions_train[feature_columns]
        attentions_train_Y = attentions_train["Group"]

        attentions_test_X = attentions_test[feature_columns]
        attentions_test_Y = attentions_test["Group"]

        eclf.fit(attentions_train_X,attentions_train_Y)

        # The individual fitted estimators are compared against the position
        # of each label in eclf.classes_ rather than against the label itself.
        class_order = list(eclf.classes_)
        attentions_test_Y_n = attentions_test_Y.apply(lambda x: class_order.index(x))

        for clf in eclf.named_estimators_:
            results = eclf.named_estimators_[clf].predict(attentions_test_X)==attentions_test_Y_n
            results_clf[clf].append(
                _CountCorrectUsers(attentions,results,user_control_test,user_asd_test))

        results = eclf.predict(attentions_test_X)==attentions_test_Y
        results_clf["ensemble"].append(
            _CountCorrectUsers(attentions,results,user_control_test,user_asd_test))

    for m in results_clf:
        # correct users per fold -> fraction of the 2*n_user_test test users
        a = np.array(results_clf[m])/n_user_test/2
        results_clf[m] = a

        print(
            one_hot_columns,
            m,
            np.mean(a),
            st.t.interval(0.95, len(a)-1, loc=np.mean(a), scale=st.sem(a)),
            sep="|"
        )

    return results_clf



In [None]:
def RunExperiments(file_name,text_set,n_user_test,n_folds,features,one_hot_columns_all,eclf):
    """
    Run all combinations of additional feature sets for a particular AOI setting and text set

    Every subset of `one_hot_columns_all` is evaluated once, starting with
    the empty subset (no additional features).

    Arguments:
        file_name: string, input file name.
        text_set: list, the list of text ID in the set.
        n_user_test: int, number of test users for each group, -1: 33% test, 66% train
        n_folds: int, number of folds to run,
        features: accepted for interface compatibility but not used here;
            RunSingleExperiment reads the module-level `features` list.
        one_hot_columns_all: list, the list of all possible additional features
        eclf: a voting classifier ensemble
    Returns:
        grand_results: dict mapping each feature subset (a tuple of column
            names) to the result of RunSingleExperiment for that subset
    """
    # Imported locally: this file has no module-level `import itertools`.
    import itertools

    attentions_original,user_asd_original,user_control_original = GetData(file_name,text_set)

    attentions = pd.get_dummies(attentions_original,columns=one_hot_columns_all)

    # Both groups contribute the same number of users, bounded by the smaller group.
    min_length = min(len(user_asd_original),len(user_control_original))
    print("number of selected users: ", min_length*2)
    n_user_test = round(min_length*0.33) if n_user_test == -1 else n_user_test
    print("number of selected test users: ", n_user_test*2)
    train_length = min_length - n_user_test
    print("number of selected train users: ", train_length*2)

    grand_results = {}

    # Subset size 0 yields the empty tuple, so the baseline also runs when
    # one_hot_columns_all is empty.
    for n_onehot in range(len(one_hot_columns_all)+1):
        for one_hot_columns in itertools.combinations(one_hot_columns_all,n_onehot):
            if one_hot_columns in grand_results:
                # repeated column names produce repeated subsets; run each once
                continue
            grand_results[one_hot_columns] = RunSingleExperiment(
                attentions,
                user_asd_original,
                user_control_original,
                n_user_test,
                train_length,
                n_folds,
                one_hot_columns,
                eclf)

    return grand_results

In [None]:
"""
Run the experiments on all the files.
This takes some time to run, so first test the notebook with n_folds=1 and, if it runs fine, rerun it with n_folds=100.
"""
# Additional (one-hot encoded) feature columns available for each AOI setting.
# Every setting extends the "general" list with its own columns.
one_hot_columns_by_AOI = {}
one_hot_columns_by_AOI["general"] = [" AOI Name", "Text ID"]
one_hot_columns_by_AOI["Title_paragraphs"] = one_hot_columns_by_AOI["general"] +  [
    "Paragraph_Number","Answer","Difficulty_1"]

one_hot_columns_by_AOI["Questions"] = one_hot_columns_by_AOI["general"] +  ["Paragraph_Number","Answer"]
one_hot_columns_by_AOI["Sentences"] = one_hot_columns_by_AOI["general"] +  ["Difficulty_1"]

one_hot_columns_by_AOI["Title_paragraphs_Questions_combined"] = one_hot_columns_by_AOI["general"] +  [
    "Paragraph_Number","Answer","Difficulty_1","Flag"]


# Text IDs belonging to each text set.
set_1 = [1,2,3,4,5,6,7,8,9]
set_2 = [10,11,12,13,14,15,16,17]
set_3 = [18,19,20]

# Base feature columns used in every experiment (read as a module-level name
# by RunSingleExperiment).
features = [' Time to 1st View (sec)', ' Time Viewed (sec)',
        ' Fixations (#)', ' Revisits (#)']

# Hard-voting ensemble; each member is also scored individually.
# XGBClassifier takes its thread count from n_jobs; the former extra
# `nthreads=-1` keyword is not a recognised XGBoost parameter and was dropped.
eclf = sklearn.ensemble.VotingClassifier(estimators=[
    ('randomForest', RandomForestClassifier(n_estimators=100, n_jobs=-1)),
    ('kNeigh', KNeighborsClassifier()),
    ('svc', svm.SVC()),
    ('logistic', linear_model.LogisticRegression(max_iter=10000)),
    ('logisticCV', linear_model.LogisticRegressionCV(max_iter=10000)),
    ("XGBClassifier",XGBClassifier(n_jobs=-1))],
                                         voting='hard')
import os

# Input CSV for each AOI setting; the keys match those of one_hot_columns_by_AOI.
input_files_by_AOI_setting = {
    "Title_paragraphs": "./data/reading/Title_paragraph.csv",
    "Questions": "./data/reading/Questions.csv",
    "Sentences": "./data/reading/Sentence_aggregate.csv",
    "Title_paragraphs_Questions_combined": "./data/reading/Text_question_combined.csv",
}

# Smoke-test with n_folds = 1 first; switch to 100 for the final run.
# The final run could take up to 20 hours.
n_folds = 1

# AOI setting -> text set name -> feature subset -> RunSingleExperiment result
AOI_setting_results = {}

for AOI_setting, file_name in input_files_by_AOI_setting.items():

    print("current AOI setting:", AOI_setting)

    text_set_results = {}

    # text_set is a (name, list of text ids) pair
    for text_set in zip(["set_1","set_2","set_3"],[set_1,set_2,set_3]):

        attentions_original, user_asd_original, user_control_original = GetData(
            file_name, text_set[1])

        attentions = pd.get_dummies(
            attentions_original,
            columns=one_hot_columns_by_AOI[AOI_setting])

        # Both groups contribute the same number of users, bounded by the smaller group.
        min_length = min(len(user_asd_original),len(user_control_original))
        print("number of selected users: ", min_length*2)

        # -1 requests the default split: 33% of the users for testing.
        n_user_test = -1
        if n_user_test == -1:
            n_user_test = round(min_length*0.33)
        print("number of selected test users: ", n_user_test*2)

        train_length = min_length - n_user_test
        print("number of selected train users: ", train_length*2)

        # Every subset of the setting's columns: each column doubles the list
        # by appending itself to all subsets collected so far.
        one_hot_columns_set = [()]
        for column in one_hot_columns_by_AOI[AOI_setting]:
            one_hot_columns_set.extend(
                [x+(column,) for x in one_hot_columns_set])

        by_columns_results = {}

        for one_hot_columns in tqdm(one_hot_columns_set):
            by_columns_results[one_hot_columns] = RunSingleExperiment(
                attentions,
                user_asd_original,
                user_control_original,
                n_user_test,
                train_length,
                n_folds,
                one_hot_columns,
                eclf
            )

        text_set_results[text_set[0]] = by_columns_results

    AOI_setting_results[AOI_setting] = text_set_results


        






In [None]:
"""
Save the results
"""
# Flatten AOI_setting_results into one row per
# (text set, AOI setting, feature subset, classifier).
# The stored results are iterated directly (insertion order), so this cell
# does not have to rebuild the list of feature subsets used by the run cell.
for_df = []
for AOI_setting, results_by_text_set in AOI_setting_results.items():
    for text_set, results_by_columns in results_by_text_set.items():
        for one_hot_columns, results_by_ml in results_by_columns.items():
            for ml, a in results_by_ml.items():
                for_df.append([
                    text_set,
                    AOI_setting,
                    one_hot_columns,
                    ml,
                    # mean accuracy over the folds and its 95% t-interval
                    np.mean(a),
                    st.t.interval(0.95, len(a)-1, loc=np.mean(a), scale=st.sem(a))
                ])

# Naming the columns in the constructor also works when for_df is empty;
# assigning .columns afterwards raises a length mismatch in that case.
df_results = pd.DataFrame(
    for_df,
    columns=["text_set","AOI_Setting","Feature set","ml","mean_acc","ci"])
results_filename = "./results_test.csv"
df_results.to_csv(results_filename, index=False)