# Generate Features

In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from utils.import_utils import *
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
# Compose the Hydra configuration 'main.yaml' from the '../configuration' directory.
with initialize(config_path='../configuration', version_base='1.1'):
    config = compose(config_name='main.yaml')
# SurveyManager is not imported by name in this notebook; presumably it is
# provided by the star import from utils.import_utils -- TODO confirm.
survey_list = SurveyManager(config)
# Three dataframes are returned: paradata, questionnaires and microdata.
# NOTE(review): reload=True presumably forces re-reading the source data -- verify in SurveyManager.
dfs_paradata, dfs_questionnaires, dfs_microdata = survey_list.get_dataframes(reload=True)

# Microdata based features

In [None]:
#group_columns = [col for col in dfs_microdata.columns if col.endswith("__id")]+['survey_name', 'survey_version']
# Columns that together identify one item; used as grouping / merge key throughout the notebook.
item_level_columns = ['interview__id', 'variable_name', 'roster_level']

# Work on a copy so the feature columns added below never touch dfs_microdata.
feat_item = dfs_microdata[item_level_columns+['value', 'type', 'is_integer', 'n_answers', 'answer_sequence']].copy()

# Assign the filled column back: calling fillna(inplace=True) on a column
# selection is chained assignment, which is deprecated and has no effect
# once pandas copy-on-write is active.
feat_item['value'] = feat_item['value'].fillna('')

# Row selectors reused by the feature cells below.
text_question_mask = (feat_item['type'] == 'TextQuestion')
numeric_question_mask = (feat_item['type'] == 'NumericQuestion') & (feat_item['value'] != '')
# Decimal questions are the answered numeric questions flagged as non-integer.
# '== False' is deliberate: missing is_integer values must not be selected.
decimal_question_mask = numeric_question_mask & (feat_item['is_integer'] == False)

# TODO, should we limit to active questions, interviewer only, etc?


In [None]:
# f__string_length: length of the answer string for TextQuestion items, <NA> for all other items
text_lengths = feat_item.loc[text_question_mask, 'value'].str.len()
feat_item['f__string_length'] = pd.NA
feat_item.loc[text_question_mask, 'f__string_length'] = text_lengths
# nullable integer dtype keeps the lengths integral next to the <NA> entries
feat_item['f__string_length'] = feat_item['f__string_length'].astype('Int64')


In [None]:
# f__numeric_response: the answer as float for answered NumericQuestion items, NaN for all other items
numeric_values = feat_item.loc[numeric_question_mask, 'value'].astype(float)
feat_item['f__numeric_response'] = np.nan
feat_item.loc[numeric_question_mask, 'f__numeric_response'] = numeric_values

In [None]:
# f__first_digit: first digit of the answer for answered NumericQuestion items, <NA> for all other items
feat_item['f__first_digit'] = pd.NA
# Take the first digit character rather than the first character, so a leading
# sign or decimal point (e.g. '-12', '.5') no longer breaks the integer cast.
first_digit = (
    feat_item.loc[numeric_question_mask, 'value']
    .astype(str)
    .str.extract(r'(\d)', expand=False)
)
feat_item.loc[numeric_question_mask, 'f__first_digit'] = pd.to_numeric(first_digit, errors='coerce')
# nullable integer dtype, consistent with f__string_length and f__first_decimal
feat_item['f__first_digit'] = feat_item['f__first_digit'].astype('Int64')

In [None]:
# f__last_digit: last digit of the integer part of the answer for answered NumericQuestion items, <NA> otherwise
feat_item['f__last_digit'] = pd.NA
# Go through float first so decimal answers such as '3.5' can be parsed, and
# use the absolute value so negative answers give their real last digit
# (-13 -> 3) instead of the modulus result (-13 % 10 == 7).
integer_part = feat_item.loc[numeric_question_mask, 'value'].astype(float).abs().astype(int)
feat_item.loc[numeric_question_mask, 'f__last_digit'] = integer_part % 10
# nullable integer dtype, consistent with f__string_length and f__first_decimal
feat_item['f__last_digit'] = feat_item['f__last_digit'].astype('Int64')

In [None]:
# f__first_decimal: first digit after the decimal point for rows selected by decimal_question_mask, <NA> otherwise
feat_item['f__first_decimal'] = pd.NA
values = feat_item.loc[decimal_question_mask, 'value'].astype(float)
# Use the absolute value: flooring a negative number moves away from zero, so
# -1.25 would otherwise yield floor(-12.5) % 10 == 7 instead of 2.
feat_item.loc[decimal_question_mask, 'f__first_decimal'] = np.floor(values.abs() * 10) % 10
feat_item['f__first_decimal'] = feat_item['f__first_decimal'].astype('Int64')

In [None]:
# f__answer_position: relative position (0 = first option, 1 = last option) of the selected answer
def _relative_answer_position(row):
    """Return the selected answer's index in answer_sequence scaled by n_answers - 1, or None."""
    selected = row['value']
    options = row['answer_sequence']
    if (selected in options) and pd.notnull(selected):
        return round(options.index(selected) / (row['n_answers'] - 1), 3)
    return None


feat_item['f__answer_position'] = pd.NA
# only single-select questions with more than two answers
single_question_mask = (feat_item['type'] == 'SingleQuestion') & (feat_item['n_answers'] > 2)
feat_item.loc[single_question_mask, 'f__answer_position'] = (
    feat_item.loc[single_question_mask].apply(_relative_answer_position, axis=1)
)

In [None]:
# f__Latitude, f__Longitude, f__Accuracy: first three comma-separated parts of a GPS answer, as numbers
gps_mask = feat_item['type'] == 'GpsCoordinateQuestion'
# Force exactly five part-columns (latitude, longitude, accuracy, altitude,
# timestamp). Without this, naming the columns raised whenever the split
# produced a different count, e.g. when there is no GPS row at all.
gps_df = feat_item.loc[gps_mask, 'value'].str.split(',', expand=True).reindex(columns=range(5))
gps_df.columns = ['gps__Latitude', 'gps__Longitude', 'gps__Accuracy', 'gps__Altitude', 'gps__Timestamp']
for gps_part in ['Latitude', 'Longitude', 'Accuracy']:
    # unparsable parts become NaN instead of raising
    feat_item.loc[gps_mask, 'f__' + gps_part] = pd.to_numeric(gps_df['gps__' + gps_part], errors='coerce')
# The gps__ columns only ever live in gps_df, so nothing has to be dropped from feat_item.


In [None]:
import numpy as np
# f__answers_selected, number of answers selected in a multi-answer or list question
# f__share_selected, share between answers selected, and available answers (only for unlinked questions)

def count_elements_or_nan(val):
    """Return the number of elements if ``val`` is a list, otherwise NaN."""
    return len(val) if isinstance(val, list) else np.nan

# multi-select and list questions: count the elements of list-valued answers
multi_list_mask = feat_item['type'].isin(['MultyOptionsQuestion', 'TextListQuestion'])
selected_counts = feat_item.loc[multi_list_mask, 'value'].apply(count_elements_or_nan)
feat_item.loc[multi_list_mask, 'f__answers_selected'] = selected_counts
# share of selected answers among the available answers, rounded to 3 decimals
feat_item['f__share_selected'] = (feat_item['f__answers_selected'] / feat_item['n_answers']).round(3)

# Paradata based features

In [None]:
# dfs_paradata modifications, move to import-utils?

# streamline missing (empty, NaN) to '', important to identify duplicates in terms of roster below
dfs_paradata.fillna('', inplace=True)

# interviewing, True prior to Supervisor/HQ interaction, else False
events_split = ['RejectedBySupervisor', 'OpenedBySupervisor', 'OpenedByHQ', 'RejectedByHQ']
is_split_event = dfs_paradata['event'].isin(events_split)
# Running count of split events within each interview, in row order. Grouping
# by the raw id values and assigning the raw result keeps this purely
# positional, so it works when interviews are not stored contiguously or when
# index labels repeat. The previous label-based slice could flag rows of other
# interviews in those cases.
split_events_so_far = (
    is_split_event.astype(int)
    .groupby(dfs_paradata['interview__id'].to_numpy(), sort=False)
    .cumsum()
)
# True for every row before the first split event of its interview
# (the split event itself and everything after it are False).
dfs_paradata['interviewing'] = split_events_so_far.to_numpy() == 0
# TODO: @GABRIELE, please update above, fix to interviewing column

In [None]:
# df_para_active: active events recorded before any rejection/review event, for questions with scope interviewer

active_events = ['InterviewCreated', 'AnswerSet', 'Resumed', 'AnswerRemoved', 'CommentSet', 'Restarted']

# a row is kept only if all four conditions hold
is_active_event = dfs_paradata['event'].isin(active_events)
has_interviewer_scope = dfs_paradata['question_scope'] == 0
has_interviewer_role = dfs_paradata['role'] == 1
active_mask = is_active_event & dfs_paradata['interviewing'] & has_interviewer_scope & has_interviewer_role

vars_needed = ['interview__id', 'order', 'event', 'responsible', 'role', 'tz_offset', 'param', 'answer','roster_level', 'datetime_utc', 'variable_name', 'question_sequence', 'question_scope', 'type', 'question_type',  'survey_name', 'survey_version', 'interviewing', 'yes_no_view']

# reset_index() keeps the previous index as an 'index' column
df_para_active = (
    dfs_paradata.loc[active_mask, vars_needed]
    .copy()
    .sort_values(['interview__id', 'order'])
    .reset_index()
)
# TODO @Gabriele, reset the index after appending in para

# only keep events done by interview (in most cases this should be all, after above filters, just in case supervisor or HQ answered something while interviewer answered on web mode)
#df_active = df_active[df_active['role']==1]

In [None]:
# f__duration_answer: sum of the time intervals ending in an AnswerSet or AnswerRemoved event of the item
# f__duration_comment: sum of the time intervals ending in a CommentSet event of the item
# f__time_changed: sum of the backward time jumps of more than 120 seconds
df_time = df_para_active.copy()

# seconds elapsed since the previous active event of the same interview
seconds_since_previous = df_time.groupby('interview__id')['datetime_utc'].diff().dt.total_seconds()
df_time['time_difference'] = seconds_since_previous
# keep only jumps back in time by more than two minutes, NaN otherwise
df_time['f__time_changed'] = seconds_since_previous.where(seconds_since_previous < -120)
# negative intervals are not usable as durations
df_time.loc[df_time['time_difference'] < 0, 'time_difference'] = pd.NA

# time for answers/comments, NaN on rows of any other event
ends_with_answer = df_time['event'].isin(['AnswerSet', 'AnswerRemoved'])
ends_with_comment = df_time['event'] == 'CommentSet'
df_time['f__duration_answer'] = df_time['time_difference'].where(ends_with_answer)
df_time['f__duration_comment'] = df_time['time_difference'].where(ends_with_comment)

# summarize on item level
time_features = ['f__duration_answer', 'f__duration_comment', 'f__time_changed']
df_time = (
    df_time.groupby(item_level_columns)
    .agg(**{feature: (feature, 'sum') for feature in time_features})
    .reset_index()
)

# drop rows without VariableName
has_variable_name = df_time['variable_name'] != ''
df_time = df_time[has_variable_name]

# merge into feat_item; indicator=True adds a '_merge' column showing the origin of each row
feat_item = feat_item.merge(df_time, how='outer', on=item_level_columns, indicator=True)
# TODO: Gabriele, integrate this cell, sets negative numbers to NA and generates new feature
# TODO: I think we should we use TZ adjusted time instead of utc, no?

# Find rows from df_time that didn't have a match
#unmatched_rows = df_time[~df_time.isin(merged_df)]
#merged_df = merged_df[(merged_df['value']!='') & (merged_df['f__duration_answer'].isna())]

In [None]:
# df_last: one row per item, taken from the item's AnswerSet events via groupby(...).last()
answer_set_events = df_para_active[df_para_active['event'] == 'AnswerSet']
df_last = (
    answer_set_events.groupby(item_level_columns).last()
    .sort_values(['interview__id', 'order'])
    .reset_index()
)

In [None]:
# f__previous_question, f__previous_answer, f__previous_roster: values of the preceding row
# of the same interview in df_last ('' for the first row of an interview)
previous_feature_sources = {
    'f__previous_question': 'variable_name',
    'f__previous_answer': 'answer',
    'f__previous_roster': 'roster_level',
}
for feature_name, source_column in previous_feature_sources.items():
    df_last[feature_name] = df_last.groupby('interview__id')[source_column].shift(fill_value='')


In [None]:
# f__half_hour, start of the half-hour interval of the last time answered (e.g. 10.0 for 10:00-10:29, 10.5 for 10:30-10:59)
# Floor (not round) to the half hour: combining the unrounded hour with the
# rounded minute mapped hh:45-hh:59 to hh.0, because rounding moved those
# timestamps to the next full hour while the hour part stayed unchanged.
last_answered = df_last['datetime_utc']
df_last['f__half_hour'] = last_answered.dt.hour + last_answered.dt.floor('30min').dt.minute / 60

# f__in_working_hours, indication if f__half_hour is within working hours
half_hour_counts = df_last['f__half_hour'].value_counts().sort_index()

threshold = half_hour_counts.median()*0.33  # approach 1: interval < 1/3 of the median count of answers set
working_hours_1 = half_hour_counts[half_hour_counts >= threshold].index.tolist()  # computed for comparison, not used below

cumulative_share = (half_hour_counts.sort_values().cumsum()/half_hour_counts.sum()).sort_index()
working_hours_2 = half_hour_counts[cumulative_share >= 0.05].index.tolist() # approach 2: the least frequent intervals with total of 5% of answers set

df_last['f__in_working_hours'] = df_last['f__half_hour'].isin(working_hours_2)

# f__half_hour_prob_norm: 0 for the most frequent half-hour interval, approaching 1 for rare ones
df_last['half_hour_probability'] = df_last['f__half_hour'].map(df_last['f__half_hour'].value_counts(normalize=True))
max_probability = df_last['half_hour_probability'].max()
df_last['f__half_hour_prob_norm'] =  ((max_probability - df_last['half_hour_probability']) / max_probability)

# to be merged into df_item

# TODO: add timezone offset, think about if we want to do this by day of the week or by calendar day?


In [None]:
# f__sequence_jump: change, relative to the previous row of the interview, in the gap between the
# question's sequence in the questionnaire and the order in which it was answered
answer_order = df_last.groupby('interview__id').cumcount() + 1
df_last['answer_sequence'] = answer_order
df_last['diff'] = df_last['question_sequence'] - df_last['answer_sequence']
df_last['f__sequence_jump'] = df_last.groupby('interview__id')['diff'].diff()

# merge every f__ feature of df_last into feat_item
feature_columns = [name for name in df_last.columns if name.startswith('f__')]
merge_columns = item_level_columns + feature_columns
feat_item = feat_item.merge(df_last[merge_columns], how='outer', on=item_level_columns)


In [None]:
# f__answer_changed: number of AnswerSet events of an item that changed a previously set answer

df_changed_temp = df_para_active[df_para_active['event'] == 'AnswerSet'].copy()
df_changed_temp['f__answer_changed'] = False

# list and multi-select questions (without yes_no_mode):
# changed when an element of the previous answer is missing from the current one
list_mask = (df_changed_temp['type'] == 'TextListQuestion')
multi_mask = (df_changed_temp['yes_no_view'] == False)
df_changed_temp['answer_list'] = pd.NA
df_changed_temp.loc[list_mask, 'answer_list'] = df_changed_temp.loc[list_mask,'answer'].str.split('|')
df_changed_temp.loc[multi_mask, 'answer_list'] = df_changed_temp.loc[multi_mask,'answer'].str.split(', |\\|')
df_changed_temp['prev_answer_list'] = df_changed_temp.groupby(item_level_columns)['answer_list'].shift()
answers_mask = df_changed_temp['prev_answer_list'].notna()
if answers_mask.any():  # a row-wise apply on an empty selection does not return a Series
    df_changed_temp.loc[answers_mask,'f__answer_changed'] = df_changed_temp.loc[answers_mask].apply(
        lambda row: not set(row['prev_answer_list']).issubset(set(row['answer_list'])), axis=1)

# single answer question: changed when the answer differs from the previous one
df_changed_temp['prev_answer'] = df_changed_temp.groupby(item_level_columns)['answer'].shift()
single_answer_mask = (~df_changed_temp['type'].isin(['MultyOptionsQuestion', 'TextListQuestion'])) & \
                     (df_changed_temp['prev_answer'].notna()) & \
                     (df_changed_temp['answer'] != df_changed_temp['prev_answer'])
df_changed_temp.loc[single_answer_mask, 'f__answer_changed'] = True

# yes_no_view questions: answer is '<yes options>|<no options>', options separated by ', '
yesno_mask = (df_changed_temp['yes_no_view'] == True)
df_filtered = df_changed_temp[yesno_mask].copy()
if yesno_mask.any():
    # n=1 plus reindex guarantees exactly two columns, even without a '|'
    df_filtered[['yes_list', 'no_list']] = (
        df_filtered['answer'].str.split('|', n=1, expand=True).reindex(columns=[0, 1])
    )
    for side in ['yes_list', 'no_list']:
        # empty or missing side -> []
        df_filtered[side] = df_filtered[side].str.split(', ').apply(
            lambda x: x if isinstance(x, list) and x != [''] else [])
        # previous answer of the same item; [] for the first answer
        df_filtered['prev_' + side] = df_filtered.groupby(item_level_columns)[side].shift().apply(
            lambda x: x if isinstance(x, list) else [])
    # Changed when a previous 'yes' OR a previous 'no' is gone. Both checks are
    # combined in one assignment; previously the 'no' check overwrote the 'yes' result.
    df_changed_temp.loc[yesno_mask,'f__answer_changed'] = df_filtered.apply(
        lambda row: not (set(row['prev_yes_list']).issubset(set(row['yes_list']))
                         and set(row['prev_no_list']).issubset(set(row['no_list']))),
        axis=1)

# count on item level
df_changed_temp = df_changed_temp.groupby(item_level_columns)['f__answer_changed'].sum().reset_index()

# TODO: merge into df_item

## Other paradata based features

In [None]:
# Split each answer string on ', ' or '|' into a list of parts.
# NOTE(review): df_para_multi is not defined in the visible part of this notebook and
# `splits` is not used afterwards -- confirm whether this cell is a leftover.
splits = df_para_multi['answer'].str.split(', |\\|')

In [None]:
# f__answer_removed: number of AnswerRemoved events (by interviewer, or by system as a result
# of interviewer action) while the 'interviewing' flag is set
is_removed_event = dfs_paradata['event'] == 'AnswerRemoved'
removed_mask = dfs_paradata['interviewing'] & is_removed_event
df_item_removed = dfs_paradata[removed_mask]

# count per item
df_item_removed = (
    df_item_removed.groupby(item_level_columns)
    .agg(f__answer_removed=('order', 'count'))
    .reset_index()
)
# to be merged into df_item

# total per interview (interview__id stays in the index)
df_unit_removed = df_item_removed.groupby('interview__id')[['f__answer_removed']].sum()
# to be merged into df_unit


In [None]:
# f__comments_set: number of CommentSet events, f__comment_length: total length of the comment texts
# (events with role 1 while the 'interviewing' flag is set)
is_comment_event = dfs_paradata['event'] == 'CommentSet'
has_role_one = dfs_paradata['role'] == 1
comment_mask = is_comment_event & has_role_one & dfs_paradata['interviewing']
df_item_comment = dfs_paradata[comment_mask].copy()
df_item_comment['f__comment_length'] = df_item_comment['answer'].str.len()

# per item
item_comment_aggregations = {
    'f__comments_set': ('order', 'count'),
    'f__comment_length': ('f__comment_length', 'sum'),
}
df_item_comment = df_item_comment.groupby(item_level_columns).agg(**item_comment_aggregations).reset_index()
# to be merged into df_item

# per interview
df_unit_comment = (
    df_item_comment.groupby('interview__id')[['f__comments_set', 'f__comment_length']]
    .sum()
    .reset_index()
)
# to be merged into df_unit


In [None]:
# f__pause_count, f__pause_duration, f__pause_list: number, total seconds and list of seconds of the
# pauses of an interview. A pause is a Restarted/Resumed event directly preceded by a Paused event.
df_paused_temp =  dfs_paradata[['interview__id', 'order', 'event', 'datetime_utc', 'interviewing']].copy()
df_paused_temp['prev_event'] = df_paused_temp.groupby('interview__id')['event'].shift(fill_value='')
df_paused_temp['prev_datetime'] = df_paused_temp.groupby('interview__id')['datetime_utc'].shift()
pause_mask = df_paused_temp['event'].isin(['Restarted', 'Resumed']) & \
             df_paused_temp['prev_event'].isin(['Paused']) & \
             (df_paused_temp['interviewing'])
df_paused_temp = df_paused_temp.loc[pause_mask].copy()
df_paused_temp['pause_duration'] = df_paused_temp['datetime_utc'] - df_paused_temp['prev_datetime']
# Round to whole seconds first: the cast to Int64 raises on fractional values.
df_paused_temp['pause_seconds'] = df_paused_temp['pause_duration'].dt.total_seconds().round().astype('Int64')
# Column names use the double-underscore 'f__' prefix, as announced in the header
# of this cell and as expected by the startswith('f__') feature selection.
df_paused_temp = df_paused_temp.groupby('interview__id').agg(
    f__pause_count=('pause_seconds', 'count'),
    f__pause_duration=('pause_seconds', 'sum'),
    f__pause_list=('pause_seconds', lambda x: x.tolist()),
    ).reset_index()

# to be merged into df_unit


In [None]:
# AnswerSet and TranslationSwitched events recorded while the 'interviewing' flag is set
translation_events = ['AnswerSet', 'TranslationSwitched']
trans_mask = dfs_paradata['interviewing'] & dfs_paradata['event'].isin(translation_events)

trans_columns = ['interview__id', 'order', 'event', 'param']
df_trans_temp = dfs_paradata.loc[trans_mask, trans_columns].copy().reset_index()
# seq: 1-based running number of the event within its interview
df_trans_temp['seq'] = df_trans_temp.groupby('interview__id').cumcount() + 1

# Define a function to calculate the relative positions
def relative_translation_positions(group):
    """Return, for each 'TranslationSwitched' row of ``group``, its 'seq' divided by the number of rows."""
    is_switch = group['event'] == 'TranslationSwitched'
    return (group.loc[is_switch, 'seq'] / len(group)).tolist()

# Group by 'interview__id' and apply the function; the result has one list of relative positions per interview.
# Only the two columns the function reads are passed on: applying over the
# grouping column itself is deprecated in recent pandas versions.
df_trans_temp = (
    df_trans_temp.groupby('interview__id')[['event', 'seq']]
    .apply(relative_translation_positions)
    .reset_index()
    .rename(columns={0: 'f__translation_positions'})
)

# to be merged into df_unit

# TODO
1. add timezone to hour of the day
2. add index to paradata import utils
3. redo interviewing column
4. @Gabriele, after the refactor, roster_level seems empty and in column answer
5. remove the answer changed from refactor
6. @Gabriele, I will need interview__errors in long format