# Human Factors Module (HFM)

 Authors - Mustafa Demir, Nancy Cooke, Sarah Ligda, Chris  Lieber, Meet Sanghvi, Ashish Amresh <br>
 Institution - Arizona State University <br>
 Email - <mustafa.demir@asu.edu>, <Nancy.Cooke@asu.edu>, <sligda@asu.edu>, <clieber@asu.edu>, <Meet.Sanghvi@asu.edu>, <amresh@asu.edu>;

This module performs a stepwise (forward-selection) AIC search to find the best regression formula from the candidate independent variables. The resulting formula can then be used in the HPMMv2 module, which uses a CatBoost regressor to predict the loss of separation.

HFM (this module) is a purely statistical model

## Code Requirements

- pandas - https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html <br>
- numpy - https://numpy.org/install/

## How to use this file?
- Edit the `fields` list and the `file_name` in `cfg` to match the CSV data file you are passing
- Edit the mapping dictionary `data_type_for_each_column`, which states the data type of each column in the CSV file
- Edit the list of independent variables

## Preprocessing Note
- The code preprocesses the provided data file
- It removes every row in which any of the loaded columns is NaN

In [6]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm

#### Edit this dictionary to map your column with the data type

In [7]:
# Maps each CSV column name to the dtype string that HFM.preprocess passes to
# Series.astype. Only the columns actually loaded (cfg['fields']) are looked
# up; a loaded column that is missing from this mapping raises KeyError there.
data_type_for_each_column = {
    "Ss": "int",
    "condtn": "str",
    "baseline": "int",
    "highworkload": "int",
    "highworkload offnominal": "int",
    "at_sec": "int",
    "traffic_density": "int",
    "los_freq": "int",
    "los_duration_over5min": "int",
    "query": "str",
    "ready_latency": "float",
    "ready_latency_adj": "float",
    "query_latency": "float",
    "response_index": "int",
    "ready_timed_out": "int",
    "query_timed_out": "int",
    "stimuli": "str",
    "response_text": "str",
    "sa_correct": "float",
    "wl_rating": "int",
    "interbeat_interval": "float",
    "condtn_num": "int",
    "face_conf": "str",
    "rx": "float",
    "ry": "float",
    "rz": "float",
    "eyeblink": "float",
    "positive": "float",
    "neutral": "float",
    "negative": "float",
    "emo_conf": "int",
    "los_severity": "float",
    "CLCD": "int",
    "In_transmission (binary)": "int",
    "transmission started (count)": "int",
    "transmission ended (count)": "int",
    "words per transmission (syn_complexity)": "int",
    "length of transmission(in_sec)": "float",
    "Words_sec": "float",
    "time filled in previous interval (up to 5 seconds)": "float",
    "time since last transmissions": "float",
    "True Pilot or ATC": "str",
    "Interval-Pilot (p) OR ATC (a) (ap for shared intervals)": "str",
    "pilot communication time": "float",
    "air traffic communication time in previous interval": "float",
    "ratio of comms at interval P:A": "str",
}

In [8]:
def step_aic(model, independent_variables, dependent_variables, **kwargs):
    """
    Select a model formula by forward stepwise selection on AIC.

    Starting from the intercept-only model, every remaining candidate term is
    tried in turn; the one giving the lowest AIC is added as long as that AIC
    is strictly lower than the current one. The search stops when no candidate
    improves the AIC or when no candidates are left.

    Note: This adopts only "forward" selection (terms are never removed).

    Args:
        model: formula-based model factory, e.g. one from
            statsmodels.formula.api. It is called as
            ``model(formula=..., **kwargs)`` and the result must provide
            ``.fit()`` returning an object with an ``aic`` attribute.
        independent_variables (str or list): candidate predictor term(s).
            Duplicates are ignored; candidates are evaluated in the given order.
        dependent_variables (str or list): response term(s), joined with
            ' + ' on the left-hand side of the formula.
        kwargs: extra keyword arguments for model (e.g., data, family)

    Returns:
        model: the fitted result for the selected formula
        formula: str rep of the best formula (``'<response> ~ 1'`` when no
            candidate lowers the AIC)
    """
    # Accept a single name or any (possibly nested) sequence of names.
    # dict.fromkeys drops duplicates while keeping the caller's order, so the
    # evaluation/log order is reproducible between runs.
    remaining = list(dict.fromkeys(str(name) for name in np.ravel(independent_variables)))
    response_terms = [str(name) for name in np.ravel(dependent_variables)]
    selected = []

    formula_head = ' + '.join(response_terms) + ' ~ '
    formula = formula_head + '1'
    current_score = model(formula=formula, **kwargs).fit().aic
    print('AIC: {}, formula: {}'.format(round(current_score, 3), formula))

    while remaining:
        scores_with_candidates = []
        for candidate in remaining:
            formula = formula_head + ' + '.join(selected + [candidate])
            aic = model(formula=formula, **kwargs).fit().aic
            print('AIC: {}, formula: {}'.format(round(aic, 3), formula))
            scores_with_candidates.append((aic, candidate))

        # Lowest AIC wins; ties are broken by candidate name.
        best_new_score, best_candidate = min(scores_with_candidates)

        # Stop on "no strict improvement". This also covers an exact tie,
        # which would otherwise repeat the same round forever.
        if not best_new_score < current_score:
            break

        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    # With nothing selected, fall back to the intercept-only model instead of
    # producing the invalid formula '<response> ~ '.
    formula = formula_head + (' + '.join(selected) if selected else '1')
    print('The best formula: {}'.format(formula))
    return model(formula=formula, **kwargs).fit(), formula

In [9]:
class HFM:
    """Loads the configured CSV columns and prepares them for model fitting."""

    def __init__(self, cfg: dict):
        """Read the columns listed in cfg['fields'] from the CSV at cfg['file_name']."""
        csv_path = cfg['file_name']
        wanted_columns = cfg['fields']
        self.data = pd.read_csv(csv_path, usecols=wanted_columns)

    def preprocess(self):
        """Drop rows containing NaN, cast each column to its mapped dtype, and return the frame.

        The target dtype of every column is looked up in the module-level
        ``data_type_for_each_column`` mapping. The cleaned frame is also kept
        on ``self.data``.
        """
        # Rows with any missing value are discarded before casting.
        self.data = self.data.dropna()
        frame = self.data

        # pandas may read a column as a generic object/float type, so each
        # column is cast explicitly to the dtype declared for it.
        for column_name in list(frame.columns):
            target_dtype = data_type_for_each_column[column_name]
            frame[column_name] = frame[column_name].astype(target_dtype)

        return frame

## Code Starts Here

#### Edit the fields name & the file_name of the csv data which you are passing

In [10]:
# Input configuration for HFM:
#   fields    - CSV columns to load (each listed once); every name must also be
#               a key of data_type_for_each_column, because HFM.preprocess
#               looks up its dtype there.
#   file_name - path of the CSV data file.
cfg = {
    "fields": ['traffic_density', 'ready_latency', 'query_latency',
               'query_timed_out', 'sa_correct', 'interbeat_interval', 'rx', 'ry', 'rz', 'eyeblink', 'positive',
               'neutral', 'negative', 'CLCD', 'Words_sec', 'los_severity', 'los_freq'],
    'file_name': "data.csv",
}

In [11]:
# Load the configured columns from the CSV file, then drop rows containing NaN
# and cast every remaining column to its mapped dtype.
hfm = HFM(cfg)
data = hfm.preprocess()

#### Edit these variables

In [12]:
# Candidate predictor terms offered to the stepwise search; each name is used
# as a term on the right-hand side of the model formula.
list_of_independent_variables = ['traffic_density', 'ready_latency', 'query_latency', 'query_timed_out', 'sa_correct',
                                   'interbeat_interval', 'rx', 'ry', 'rz', 'eyeblink', 'positive', 'neutral',
                                   'negative', 'CLCD', 'Words_sec']
# Response term(s) placed on the left-hand side of the model formula.
list_of_dependent_variables = ['los_freq'] # or los_severity

In [13]:
# Run the forward stepwise AIC search with ordinary least squares on the
# preprocessed data; `model` is the fitted result for the selected `formula`.
model, formula = step_aic(model=sm.ols,
                          independent_variables=list_of_independent_variables,
                          dependent_variables=list_of_dependent_variables,
                          data=data)

AIC: 3114.908, formula: los_freq ~ 1
AIC: 3116.754, formula: los_freq ~ query_latency
AIC: 3116.017, formula: los_freq ~ neutral
AIC: 3106.166, formula: los_freq ~ Words_sec
AIC: 3104.985, formula: los_freq ~ query_timed_out
AIC: 3107.269, formula: los_freq ~ rx
AIC: 3116.799, formula: los_freq ~ ry
AIC: 3085.111, formula: los_freq ~ negative
AIC: 2674.504, formula: los_freq ~ traffic_density
AIC: 3116.44, formula: los_freq ~ CLCD
AIC: 3115.657, formula: los_freq ~ eyeblink
AIC: 3113.361, formula: los_freq ~ positive
AIC: 3096.299, formula: los_freq ~ interbeat_interval
AIC: 3116.858, formula: los_freq ~ sa_correct
AIC: 3115.155, formula: los_freq ~ ready_latency
AIC: 3115.158, formula: los_freq ~ rz
AIC: 2672.289, formula: los_freq ~ traffic_density + query_latency
AIC: 2675.628, formula: los_freq ~ traffic_density + neutral
AIC: 2662.508, formula: los_freq ~ traffic_density + Words_sec
AIC: 2672.699, formula: los_freq ~ traffic_density + query_timed_out
AIC: 2673.767, formula: los_fr

AIC: 2486.144, formula: los_freq ~ traffic_density + interbeat_interval + rz + negative + positive + Words_sec + query_latency + sa_correct + neutral + CLCD
AIC: 2487.13, formula: los_freq ~ traffic_density + interbeat_interval + rz + negative + positive + Words_sec + query_latency + sa_correct + neutral + eyeblink
AIC: 2486.42, formula: los_freq ~ traffic_density + interbeat_interval + rz + negative + positive + Words_sec + query_latency + sa_correct + neutral + ready_latency
AIC: 2483.561, formula: los_freq ~ traffic_density + interbeat_interval + rz + negative + positive + Words_sec + query_latency + sa_correct + neutral + rx + query_timed_out
AIC: 2483.07, formula: los_freq ~ traffic_density + interbeat_interval + rz + negative + positive + Words_sec + query_latency + sa_correct + neutral + rx + ry
AIC: 2483.003, formula: los_freq ~ traffic_density + interbeat_interval + rz + negative + positive + Words_sec + query_latency + sa_correct + neutral + rx + CLCD
AIC: 2482.84, formula: l

In [14]:
# Show the selected formula beneath a separator banner.
print("\n\n*****************************************************")
print(f"Best Formula = {formula}\n")



*****************************************************
Best Formula = los_freq ~ traffic_density + interbeat_interval + rz + negative + positive + Words_sec + query_latency + sa_correct + neutral + rx



In [15]:
# Show the full regression summary of the selected model beneath a separator banner.
print("*****************************************************")
print(f"{model.summary()}")

*****************************************************
                            OLS Regression Results                            
Dep. Variable:               los_freq   R-squared:                       0.439
Model:                            OLS   Adj. R-squared:                  0.434
Method:                 Least Squares   F-statistic:                     87.55
Date:                Mon, 26 Jul 2021   Prob (F-statistic):          5.64e-133
Time:                        11:58:10   Log-Likelihood:                -1229.9
No. Observations:                1130   AIC:                             2482.
Df Residuals:                    1119   BIC:                             2537.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------

In [16]:
# Show the AIC of the selected model beneath a separator banner.
print("*****************************************************")
print(f'AIC = {model.aic}')
# `model` is a statsmodels RegressionResults object:
# https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.RegressionResults.html
# Its other attributes can be accessed as described at that link;
# some examples follow.

*****************************************************
AIC = 2481.795130480846


In [17]:
# print(F"*****************************************************")
# print(f'P Values = \n{model.pvalues}')
# print(F"*****************************************************")
# print(f'T Values = \n{model.tvalues}')
# print(F"*****************************************************")
# print(f'R Square = {model.rsquared}')
# print(F"*****************************************************")
# print(f'F-Statistic = {model.fvalue}')
# print(F"*****************************************************")