# Question latency


1. Calculate the response latency for each question, i.e. the time in seconds from the previous question being answered/commented to the current question being answered/commented.
2. Consider only the timing for questions answered by interviewers prior to any supervisor/HQ rejection/review event.
3. Calculate the total time spent per question/roster level, and count the number of times the question was visited (answer set or commented, ignoring consecutive events for the same question/roster level).
4. TODO: move the `fillna` step over to the paradata generation.

In [3]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from utils.import_utils import *
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
# Compose the Hydra configuration `main.yaml` found in ../configuration.
with initialize(version_base='1.1', config_path='../configuration'):
    config = compose(config_name='main.yaml')

# Build the survey manager from that configuration and fetch the paradata,
# questionnaire and microdata frames.
# NOTE(review): reload=True presumably forces a fresh read instead of using
# cached data - confirm against SurveyManager.get_dataframes.
survey_list = SurveyManager(config)
(
    dfs_paradata,
    dfs_questionnaires,
    dfs_microdata,
) = survey_list.get_dataframes(reload=True)



In [23]:
import pandas as pd

# Paradata columns required for the latency calculation.
vars_needed = [
    'interview__id', 'order', 'event', 'responsible', 'role', 'tz_offset',
    'param', 'answer', 'roster_level', 'datetime_utc', 'VariableName',
    'question_seq', 'type', 'QuestionType',
]

# Work on an independent copy so the loaded paradata stays untouched.
df_time = dfs_paradata[vars_needed].copy()

# Replace every missing value with '' so that rows without a variable name or
# roster level compare as equal when consecutive duplicates are detected
# further down (NaN never compares equal to NaN).
df_time = df_time.fillna('')

In [None]:
# Only keep interviewing events that occur before the first Supervisor/HQ
# interaction of each interview.
#
# The previous implementation located the first split event with
# `isin(...).idxmax() - 1`. For an interview WITHOUT any split event the mask
# is all False, idxmax() returns the group's first label, and the resulting
# slice `min_index:min_index-1` is empty - i.e. such interviews were dropped
# entirely. It also relied on an integer index with each interview's rows
# stored contiguously. The vectorized version below has neither limitation.
events_split = ['RejectedBySupervisor', 'OpenedBySupervisor', 'OpenedByHQ', 'RejectedByHQ']

# Flag the Supervisor/HQ events themselves.
is_split_event = df_time['event'].isin(events_split)

# Running number of split events seen so far within each interview, following
# the row order of df_time (assumes rows are in chronological order per
# interview, as the original slicing did - TODO confirm upstream sorting).
n_split_seen = is_split_event.astype(int).groupby(df_time['interview__id']).cumsum()

# A row is an "interviewing" row while no split event has been seen yet; the
# split event itself and everything after it are excluded. Interviews that
# never had a split event are kept in full.
df_time = df_time[n_split_seen == 0]

In [24]:
 # keep only events relevant for calculating response latency

#events_to_drop = ['SupervisorAssigned', 'InterviewerAssigned', 'KeyAssigned', 'VariableDisabled','ReceivedByInterviewer', 'KeyAssigned', 'VariableEnabled', 'VariableSet', 'QuestionDeclaredInvalid', 'QuestionDeclaredValid', 'Completed', 'TranslationSwitched','ReceivedBySupervisor','OpenedBySupervisor','ApproveBySupervisor','ClosedBySupervisor', 'InterviewModeChanged', 'Paused', 'RejectedBySupervisor']

# Whitelist of events used for the latency calculation.
# TODO: check in other example data sets that there are no other relevant events
events_to_keep = [
    'InterviewCreated',
    'AnswerSet',
    'Resumed',
    'AnswerRemoved',
    'CommentSet',
    'Restarted',
]
df_time = df_time.loc[df_time['event'].isin(events_to_keep)]

#df_time = df_time[~df_time['event'].isin(events_to_drop)] # to x-check we have all interviewer events
#df_time['event'].unique()

In [25]:
# Restrict to events with role code 1, i.e. those done by the interviewer.
# Other roles should rarely remain after the filters above; this is a
# safeguard for the case where a supervisor or HQ user answered something
# while the interviewer was answering in web mode.
df_time = df_time.loc[df_time['role'].eq(1)]

In [27]:
# When the same question on the same roster level is answered/commented
# several times in a row, only the last event of that run is kept, so the
# time measured for it spans the whole run.
group_col = ['interview__id', 'VariableName', 'roster_level']
run_keys = df_time[group_col]

# True where the following row refers to a different interview, question or
# roster level. The final row is compared against the all-NaN row produced by
# the shift, which never compares equal, so it is always kept.
next_row_differs = (run_keys != run_keys.shift(-1)).any(axis=1)
df_time = df_time[next_row_differs]


In [28]:
# Seconds elapsed since the previous remaining event of the same interview.
# The first event of every interview has no predecessor and therefore gets NaN.
df_time['time_difference'] = (
    df_time.groupby('interview__id')['datetime_utc']
    .diff()
    .dt.total_seconds()
)


In [30]:
# From here on only AnswerSet and CommentSet rows are of interest; the timing
# of AnswerRemoved is ignored because that event is also system generated.
df_time = df_time.loc[
    df_time['event'].isin(['AnswerSet', 'CommentSet'])
]


In [32]:
# Aggregate per interview, question and roster level:
#   total_duration - sum of the recorded time differences (seconds)
#   n_revisited    - number of answer/comment visits that carry a time
#                    difference ('count' skips NaN values)
df_latency = (
    df_time.groupby(group_col)
    .agg(
        total_duration=pd.NamedAgg(column='time_difference', aggfunc='sum'),
        n_revisited=pd.NamedAgg(column='time_difference', aggfunc='count'),
    )
    .reset_index()
)
