# Generate Features

In [125]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from utils.import_utils import *
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
# Compose the project configuration ('main.yaml') from the '../configuration' directory with Hydra.
with initialize(config_path='../configuration', version_base='1.1'):
    config = compose(config_name='main.yaml')
# SurveyManager is not imported by name in this cell; presumably it is provided by the
# `utils.import_utils` star import -- TODO confirm.
survey_list = SurveyManager(config)
# Returns three objects, used below as the paradata, questionnaire and microdata frames.
# NOTE(review): reload=True presumably forces a rebuild instead of reading cached data -- confirm.
dfs_paradata, dfs_questionnaires, dfs_microdata = survey_list.get_dataframes(reload=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  else:


# Microdata-based features

In [126]:
#group_columns = [col for col in dfs_microdata.columns if col.endswith("__id")]+['survey_name', 'survey_version']
# Key columns used for every item-level groupby / merge further down in this notebook.
item_level_columns = ['interview__id', 'VariableName', 'roster_level']

# Work on a copy so the feature columns added below never modify dfs_microdata.
feat_item = dfs_microdata[item_level_columns+['value', 'type', 'IsInteger', 'n_answers', 'answer_sequence']].copy()

# Streamline missing answers to ''. The result is assigned back rather than calling
# fillna(..., inplace=True) on the column selection: the chained in-place form is deprecated
# and has no effect on feat_item once pandas Copy-on-Write is active.
feat_item['value'] = feat_item['value'].fillna('')

# Row selectors reused by the feature cells below.
text_question_mask = (feat_item['type'] == 'TextQuestion')
# Answered numeric questions only ('' marks a missing answer, see above).
numeric_question_mask = (feat_item['type'] == 'NumericQuestion') & (feat_item['value'] != '')
# Answered items explicitly flagged as non-integer. This mask does not check `type`;
# presumably IsInteger is only False for numeric questions -- TODO confirm.
decimal_question_mask = (feat_item['IsInteger'] == False) & (feat_item['value'] != '')

# TODO, should we limit to active questions, interviewer only, etc?


In [127]:
# f__string_length: number of characters in the answer for TextQuestion rows, <NA> for all other rows
text_answers = feat_item.loc[text_question_mask, 'value']
feat_item['f__string_length'] = pd.NA
feat_item.loc[text_question_mask, 'f__string_length'] = text_answers.str.len()
# nullable integer dtype, so the lengths stay integers next to the missing entries
feat_item['f__string_length'] = feat_item['f__string_length'].astype('Int64')


In [128]:
# f__numeric_response: the answer converted to float for answered NumericQuestion rows, NaN elsewhere
numeric_answers = feat_item.loc[numeric_question_mask, 'value']
feat_item['f__numeric_response'] = np.nan
feat_item.loc[numeric_question_mask, 'f__numeric_response'] = numeric_answers.astype(float)

In [129]:
# f__first_digit, first digit of the response if numeric question, empty if not
feat_item['f__first_digit'] = pd.NA
numeric_text = feat_item.loc[numeric_question_mask, 'value'].astype(str)
# Take the first digit character instead of the first character, so a leading sign
# (e.g. '-12') no longer makes the integer conversion fail; answers without any digit stay missing.
first_digit = numeric_text.str.extract(r'(\d)', expand=False)
feat_item.loc[numeric_question_mask, 'f__first_digit'] = pd.to_numeric(first_digit)
# nullable integer dtype, consistent with f__string_length and f__first_decimal
feat_item['f__first_digit'] = feat_item['f__first_digit'].astype('Int64')

In [130]:
# f__last_digit, last digit of the integer part of the response if numeric question, empty if not
feat_item['f__last_digit'] = pd.NA
# pd.to_numeric also parses decimal strings such as '3.5', which astype(int) rejects.
numeric_values = pd.to_numeric(feat_item.loc[numeric_question_mask, 'value'])
# Use the absolute value: Python-style modulo of a negative number is not its last digit
# (-13 % 10 == 7). `// 1` drops the fractional part of the now non-negative value.
feat_item.loc[numeric_question_mask, 'f__last_digit'] = numeric_values.abs() // 1 % 10
# nullable integer dtype, consistent with f__string_length and f__first_decimal
feat_item['f__last_digit'] = feat_item['f__last_digit'].astype('Int64')

In [131]:
# f__first_decimal, first decimal digit if the item is flagged as non-integer and answered, empty if not
feat_item['f__first_decimal'] = pd.NA
values = feat_item.loc[decimal_question_mask, 'value'].astype(float)
# Work on the absolute value: for negative answers floor() rounds away from zero and the
# modulo then yields the wrong digit (floor(-12.3) % 10 == 7, while the first decimal of -1.23 is 2).
feat_item.loc[decimal_question_mask, 'f__first_decimal'] = np.floor(values.abs() * 10) % 10
# nullable integer dtype so the digits stay integers next to the missing entries
feat_item['f__first_decimal']=feat_item['f__first_decimal'].astype('Int64')

In [132]:
# f__answer_position: relative position (0 = first option, 1 = last option) of the selected answer
# within answer_sequence, for SingleQuestion rows with more than two answers

def _relative_answer_position(row):
    """Return the selected answer's position in `answer_sequence`, scaled by (n_answers - 1) and
    rounded to 3 decimals, or None when the value is not among the listed answers."""
    options = row['answer_sequence']
    selected = row['value']
    if (selected in options) and pd.notnull(selected):
        return round(options.index(selected) / (row['n_answers'] - 1), 3)
    return None

feat_item['f__answer_position'] = pd.NA
# only questions with more than two answers
single_question_mask = (feat_item['type']=='SingleQuestion') & (feat_item['n_answers'] > 2 )
single_question_rows = feat_item.loc[single_question_mask]
feat_item.loc[single_question_mask, 'f__answer_position'] = single_question_rows.apply(_relative_answer_position, axis=1)

In [134]:
# f__Latitude, f__Longitude, f__Accuracy: numeric GPS components for GpsCoordinateQuestion rows, NaN elsewhere
gps_mask = feat_item['type'] == 'GpsCoordinateQuestion'
gps_columns = ['gps__Latitude', 'gps__Longitude', 'gps__Accuracy', 'gps__Altitude', 'gps__Timestamp']

# Split the comma-separated answer into its parts. The result is forced to exactly
# len(gps_columns) columns, so naming them cannot fail when the split yields a different
# number of parts (e.g. there are no GPS rows at all, or an answer is incomplete).
gps_df = feat_item.loc[gps_mask, 'value'].str.split(',', expand=True)
gps_df = gps_df.reindex(columns=range(len(gps_columns)))
gps_df.columns = gps_columns

for gps_column, feature_name in [('gps__Latitude', 'f__Latitude'),
                                 ('gps__Longitude', 'f__Longitude'),
                                 ('gps__Accuracy', 'f__Accuracy')]:
    # create the column up front so it exists even if no row is a GPS question
    feat_item[feature_name] = float('nan')
    # parts that cannot be parsed as numbers become NaN
    feat_item.loc[gps_mask, feature_name] = pd.to_numeric(gps_df[gps_column], errors='coerce')

# The gps__* columns only ever exist on gps_df, never on feat_item, so nothing has to be dropped here.


In [135]:
import numpy as np
# f__answers_selected, number of answers selected in a multi-answer or list question
# f__share_selected, share between answers selected, and available answers (only for unlinked questions)

def count_elements_or_nan(val):
    """Return the number of elements if `val` is a list, otherwise NaN."""
    return len(val) if isinstance(val, list) else np.nan

# rows belonging to multi-answer or list questions
multi_list_types = ['MultyOptionsQuestion', 'TextListQuestion']
multi_list_mask = feat_item['type'].isin(multi_list_types)

# f__answers_selected: length of the answer list (NaN when the value is not a list)
multi_list_values = feat_item.loc[multi_list_mask, 'value']
feat_item.loc[multi_list_mask,'f__answers_selected'] = multi_list_values.apply(count_elements_or_nan)

# f__share_selected: selected answers relative to n_answers, rounded to 3 decimals
feat_item['f__share_selected'] = feat_item['f__answers_selected'].div(feat_item['n_answers']).round(3)

# Paradata-based features

In [136]:
# generate df with active events done by interviewer prior to rejection/review

vars_needed = ['interview__id', 'order', 'event', 'responsible', 'role', 'tz_offset', 'param', 'answer','roster_level', 'datetime_utc', 'VariableName', 'question_seq', 'type', 'QuestionType',  'survey_name', 'survey_version']
# Sort events within each interview; reset_index() keeps the previous index as an 'index' column.
df_active = dfs_paradata[vars_needed].copy().sort_values(['interview__id', 'order']).reset_index()
# TODO @Gabriele, reset the index after appending in para
# TODO, remove hidden questions


# streamline missing (empty, NaN) to '', important to identify duplicates in terms of roster below
df_active = df_active.fillna('')

# only keep interviewing events prior to Supervisor/HQ interaction
events_split = ['RejectedBySupervisor', 'OpenedBySupervisor', 'OpenedByHQ', 'RejectedByHQ']
is_split_event = df_active['event'].isin(events_split)
# Running count of Supervisor/HQ events per interview (rows are already ordered by 'order').
# It is 0 for every event before the first such interaction. For an interview that was never
# opened or rejected it is 0 throughout, so that interview is kept in full. The previous
# per-interview loop only flagged rows of interviews containing a split event and therefore
# dropped all never-reviewed interviews.
split_events_so_far = is_split_event.astype(int).groupby(df_active['interview__id']).cumsum()
df_active = df_active[split_events_so_far == 0]

# only keep active events
events_to_keep = ['InterviewCreated', 'AnswerSet', 'Resumed', 'AnswerRemoved', 'CommentSet', 'Restarted']
df_active = df_active[df_active['event'].isin(events_to_keep)]

# only keep events done by interviewer (in most cases this should be all, after above filters, just in case supervisor or HQ answered something while interviewer answered on web mode)
# NOTE(review): role == 1 is taken to identify the interviewer, per the comment above -- confirm the coding.
df_active = df_active[df_active['role']==1]

In [194]:
# f__duration_answer: per item, summed seconds of all intervals between consecutive active events
#                     that end with an AnswerSet or AnswerRemoved event
# f__duration_comment: per item, summed seconds of all intervals that end with a CommentSet event

df_time = df_active.copy()

# seconds elapsed since the previous active event of the same interview
df_time['time_difference'] = (
    df_time.groupby('interview__id')['datetime_utc'].diff().dt.total_seconds()
)

# attribute each interval to the kind of event that closes it, missing otherwise
closes_with_answer = df_time['event'].isin(['AnswerSet', 'AnswerRemoved'])
closes_with_comment = df_time['event']=='CommentSet'
df_time['f__duration_answer'] = df_time['time_difference'].where(closes_with_answer)
df_time['f__duration_comment'] = df_time['time_difference'].where(closes_with_comment)

# one row per item with the summed durations
duration_totals = {
    'f__duration_answer': ('f__duration_answer', 'sum'),
    'f__duration_comment': ('f__duration_comment', 'sum'),
}
df_time = df_time.groupby(item_level_columns).agg(**duration_totals).reset_index()

# events that are not tied to a question carry an empty VariableName; discard them
df_time = df_time[df_time['VariableName']!='']

# outer merge with indicator, to see which items exist on only one of the two sides
feat_item['on_feat_item']=True
merged_df = feat_item.merge(df_time, on=item_level_columns, how='outer', indicator=True)

# Find rows from df_time that didn't have a match
#unmatched_rows = df_time[~df_time.isin(merged_df)]
#merged_df = merged_df[(merged_df['value']!='') & (merged_df['f__duration_answer'].isna())]

# TODO: We have negative durations, let's work out why

In [178]:
# df_last: for every item, the last AnswerSet event, ordered by interview and event order
answer_set_events = df_active[df_active['event']=='AnswerSet']
df_last = (
    answer_set_events
    .groupby(item_level_columns)
    .last()
    .sort_values(['interview__id', 'order'])
    .reset_index()
)

In [184]:
# f__previous_question, f__previous_answer, f__previous_roster: variable name, answer and roster level
# of the preceding row of df_last within the same interview ('' for the first row of an interview)
previous_feature_sources = {
    'f__previous_question': 'VariableName',
    'f__previous_answer': 'answer',
    'f__previous_roster': 'roster_level',
}
for previous_feature, source_column in previous_feature_sources.items():
    df_last[previous_feature] = df_last.groupby('interview__id')[source_column].shift(fill_value='')


In [186]:
# f__half_hour, time of the last answer rounded to the nearest half hour, as decimal hours (e.g. 10.5 = 10:30)
# Hour and minute must both come from the rounded timestamp. Combining the raw hour with the
# rounded minute mapped e.g. 10:50 (rounds to 11:00) to 10 + 0/60 = 10.0 instead of 11.0.
rounded_time = df_last['datetime_utc'].dt.round('30min')
df_last['f__half_hour'] = rounded_time.dt.hour + rounded_time.dt.minute / 60

# f__in_working_hours, indication if f__half_hour is within working hours
# number of answers set per half-hour interval, ordered by time of day
half_hour_counts = df_last['f__half_hour'].value_counts().sort_index()

# approach 1 (computed for comparison, not used below): keep intervals with at least 1/3 of the median count of answers set
threshold = half_hour_counts.median()*0.33
working_hours_1 = half_hour_counts[half_hour_counts >= threshold].index.tolist()

# approach 2 (used below): accumulate the shares starting from the least frequent interval and
# exclude the intervals whose cumulative share stays below 5% of all answers set
cumulative_share = (half_hour_counts.sort_values().cumsum()/half_hour_counts.sum()).sort_index()
working_hours_2 = half_hour_counts[cumulative_share >= 0.05].index.tolist()

df_last['f__in_working_hours'] = df_last['f__half_hour'].isin(working_hours_2)

# TODO: add timezone offset, think about if we want to do this by day of the week or by calendar day?


In [189]:
# sequence, work in progress
# running position (1, 2, ...) of each row of df_last within its interview
df_last['sequential'] = df_last.groupby('interview__id').cumcount() + 1

# offset between the observed position and question_seq
df_last['diff'] = df_last['sequential']  - df_last['question_seq']
# Change of that offset from one row to the next, computed per interview: an ungrouped diff()
# would compare the first row of an interview with the last row of the previous interview.
df_last['seq_jump'] = df_last.groupby('interview__id')['diff'].diff()
