In [1]:
import pandas as pd
import os
from tqdm import tqdm
import itertools
from utils.eyeMovement import DataUtils, EyeMovement
from config.settings import *
import warnings
warnings.filterwarnings('ignore')

In [2]:
info = pd.read_excel('../data/data_info.xlsx', index_col=0)

In [3]:
info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             20 non-null     int64 
 1   gender         20 non-null     object
 2   edu            20 non-null     int64 
 3   age            20 non-null     int64 
 4   resources_url  20 non-null     object
 5   moca           20 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 1.1+ KB


In [40]:
prefix = '../data/raw_eye_data/'
file_dirs = os.listdir(prefix)
for id, file in zip(info['id'].values, file_dirs):
    os.rename(prefix + file, prefix + str(id) + '.txt')

In [4]:
info['resources_url'] = info.apply(lambda x: './data/raw_eye_data/' + str(x['id']) + '.txt', axis=1)

In [5]:
info.to_excel('../data/data_info.xlsx')

## preprocess

In [2]:
info = pd.read_excel('../data/data_info.xlsx', index_col=0)
# info

In [49]:
fix_base_cat = ['abs', 'calc4', 'calc5', 'calc6', 'exec', 'mem8', 'mem9', 'mem10', 'recall']
fix_eye_cat = ['_aoi_ratio']
fix_feat_name = ['att'] + [x[0] + x[1] for x in itertools.product(fix_base_cat, fix_eye_cat)]
print(len(fix_feat_name))

10


In [51]:
levels = list(range(3, 12, 1))
for i in tqdm(range(info.shape[0])):
    # rocket
    url = info.iloc[i]["resources_url"]
    id = info.iloc[i]['id']
    # name = info.iloc[i]["name"]
    # t = str(info.iloc[i]["test_end_time"])
    gaze_data = pd.read_csv(url)
    util = DataUtils(gaze_data)
    x, y, time = util.get_lvl_state(util.prepare_data(), 2, 2)
    detector_l2 = EyeMovement(x, y, time, AOIs, BEZIER_POINTS)
    att = detector_l2.measureFollowRate()
    feats = [att]
    # other
    for level in levels:
        x, y, time = util.get_lvl_state(util.prepare_data(), level, 2)
        detector = EyeMovement(x, y, time, AOIs[level], BEZIER_POINTS)
        fix_data = detector.eye_movements_detector(x, y, time)
        _, _, merged = detector.merge_fixation(fix_data)
        feats.append(detector.AOI_fixation_ratio(merged))
    info.loc[info["id"] == id, fix_feat_name] = feats

100%|██████████| 20/20 [00:04<00:00,  4.95it/s]


In [54]:
info.drop(['resources_url'], axis=1).to_csv('../data/train_data.csv')

In [11]:
train_data = pd.read_csv('../data/train_data.csv', index_col=0)
train_data = train_data.drop(['id'], axis=1)
train_data

Unnamed: 0,gender,edu,age,moca,att,abs_aoi_ratio,calc4_aoi_ratio,calc5_aoi_ratio,calc6_aoi_ratio,exec_aoi_ratio,mem8_aoi_ratio,mem9_aoi_ratio,mem10_aoi_ratio,recall_aoi_ratio
0,M,0,74,10,0.700516,0.023488,1.0,0.927747,0.866292,0.817096,0.785373,0.696157,0.950593,0.133944
1,F,0,66,11,0.718593,0.066612,0.107886,0.047976,0.057994,0.320304,0.648489,0.623994,0.902747,0.677299
2,F,0,70,12,0.792929,0.068506,0.141368,0.136911,0.133884,0.411576,0.108127,0.327781,0.358554,0.282859
3,M,0,70,13,0.03263,0.077213,0.079644,0.0,0.0,0.228632,0.138101,0.127408,0.270879,0.105182
4,F,0,69,14,0.359016,0.0,0.842863,0.038973,0.835256,0.232106,0.535628,0.182919,0.577401,0.183762
5,M,0,73,15,0.473515,0.105415,0.181165,0.227017,0.098039,0.209366,0.227841,0.040755,0.326768,0.101266
6,F,1,66,16,0.533557,0.040704,0.156981,0.102299,0.672142,0.566667,0.751434,0.099722,0.049837,0.365417
7,F,1,73,17,0.743687,0.029881,0.111111,0.329261,0.199941,0.127786,0.072624,0.243182,0.193497,0.055449
8,F,1,72,18,0.486622,0.085902,0.421842,0.015788,0.109764,0.157802,0.195732,0.193154,0.29571,0.213156
9,M,1,66,19,0.295775,0.160945,0.168642,0.245254,0.0,0.407618,0.310333,0.147443,0.171828,0.145253


In [12]:
cat_cols = ['gender', 'edu']
label_col = ['moca']
num_cols = [x for x in train_data.columns if x not in cat_cols and x not in label_col]
# num_cols

In [13]:
train_data = pd.get_dummies(train_data, prefix_sep='_', columns=['gender', 'edu'])
cat_cols = ['gender_F', 'gender_M', 'edu_0', 'edu_1', 'edu_2']

In [6]:
# import numpy as np
# np.corrcoef(train_data['att'], train_data['abs_aoi_ratio'])[0][1]

from scipy.stats import pearsonr
pearsonr(train_data['att'], train_data['abs_aoi_ratio'])

(0.1225942254663756, 0.6066191599165137)

In [32]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import GradientBoostingRegressor

X, y = train_data[cat_cols + num_cols], train_data[label_col]
X_train, X_test, y_train, y_test = train_test_split(X, y)

gb_gs = GridSearchCV(
    estimator=GradientBoostingRegressor(), 
    param_grid={
        'learning_rate': [0.01, 0.1, 0.5],
        'n_estimators': [20, 50, 100],
        'min_samples_split': [10, 20, 30],
        'min_samples_leaf': [3, 5, 7],
        'max_depth': [2, 3, 5]
    }, 
    cv=5, scoring='neg_mean_absolute_error', refit=True
)
gb_gs.fit(X_train, y_train)

In [None]:
scaler = StandardScaler()

In [33]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error


model = gb_gs.best_estimator_
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
medae = median_absolute_error(y_test, y_pred)
print('GBDT MSE on testing data:', mse)
print('GBDT MAE on testing data:', mae)
print('GBDT MedianAE on testing data:', medae)

GBDT MSE on testing data: 14.087083637577635
GBDT MAE on testing data: 3.329686760143544
GBDT MedianAE on testing data: 3.5746263915064063


In [8]:
from utils.preprocess import get_train_data

get_train_data('../data/data_info.xlsx')

Unnamed: 0,gender,edu,age,moca,att,abs_aoi_ratio,calc4_aoi_ratio,calc5_aoi_ratio,calc6_aoi_ratio,exec_aoi_ratio,mem8_aoi_ratio,mem9_aoi_ratio,mem10_aoi_ratio,recall_aoi_ratio
0,M,0,74,10,0.700516,0.023488,1.0,0.927747,0.866292,0.817096,0.785373,0.696157,0.950593,0.133944
1,F,0,66,11,0.718593,0.066612,0.107886,0.047976,0.057994,0.320304,0.648489,0.623994,0.902747,0.677299
2,F,0,70,12,0.792929,0.068506,0.141368,0.136911,0.133884,0.411576,0.108127,0.327781,0.358554,0.282859
3,M,0,70,13,0.03263,0.077213,0.079644,0.0,0.0,0.228632,0.138101,0.127408,0.270879,0.105182
4,F,0,69,14,0.359016,0.0,0.842863,0.038973,0.835256,0.232106,0.535628,0.182919,0.577401,0.183762
5,M,0,73,15,0.473515,0.105415,0.181165,0.227017,0.098039,0.209366,0.227841,0.040755,0.326768,0.101266
6,F,1,66,16,0.533557,0.040704,0.156981,0.102299,0.672142,0.566667,0.751434,0.099722,0.049837,0.365417
7,F,1,73,17,0.743687,0.029881,0.111111,0.329261,0.199941,0.127786,0.072624,0.243182,0.193497,0.055449
8,F,1,72,18,0.486622,0.085902,0.421842,0.015788,0.109764,0.157802,0.195732,0.193154,0.29571,0.213156
9,M,1,66,19,0.295775,0.160945,0.168642,0.245254,0.0,0.407618,0.310333,0.147443,0.171828,0.145253


In [39]:
# X_test = X_test.values
y_baseline = X_test[:, 6:].mean(axis=1)*30
pearsonr(y_baseline, y_pred)

(0.2528699504991701, 0.6815009014095731)