# Load dependencies

In [39]:
import pandas as pd
import json

In [None]:
%load_ext autoreload
%autoreload 2

In [1]:


process_data()

usage: ipykernel_launcher.py [--help] [--hydra-help] [--version]
                             [--cfg {job,hydra,all}] [--resolve]
                             [--package PACKAGE] [--run] [--multirun]
                             [--shell-completion] [--config-path CONFIG_PATH]
                             [--config-name CONFIG_NAME]
                             [--config-dir CONFIG_DIR]
                             [--experimental-rerun EXPERIMENTAL_RERUN]
                             [--info [{all,config,defaults,defaults-tree,plugins,searchpath}]]
                             [overrides ...]
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [40]:
para_path = "01_sample_data/noonu_prefilled/noonu_mdv_census_2_Paradata_All"
micro_path = "01_sample_data/ifad_tunesia/EndlineFINALV5_1_STATA_All"  # not even sure we will need this
json_path = "01_sample_data/noonu_prefilled/noonu_mdv_census_2_Tabular_All/Questionnaire/content"

# Read the qnr structure

In [41]:
with open(f'{json_path}/document.json', 'r') as file:
    qnr_structure = json.load(file)

# create a mapping of VariableName to $type
type_mapping = {}

def process_json_structure(children):
    for child in children:
        if "$type" in child:
            variable_name = child.get("VariableName")

            type_value = child["$type"]
            if "YesNoView" in child:
                if child["YesNoView"]:
                    type_value = "YesNoQuestion"  # for yes/no questions, overwrite with custom question type

            if variable_name:
                type_mapping[variable_name] = type_value

        if "Children" in child:
            process_json_structure(child["Children"])


process_json_structure(qnr_structure["Children"])

# Prepare paradata

In [42]:
df_para = pd.read_csv(f'{para_path}/paradata.tab', delimiter='\t')
df_para[['param', 'answer', 'roster_level']] = df_para['parameters'].str.split('\|\|',
                                                                               expand=True)  # split the parameter column
df_para['datetime_utc'] = pd.to_datetime(df_para['timestamp_utc'])  # generate date-time, TZ not yet considered
df_para['type'] = df_para['param'].map(type_mapping)
# TODO: set to nan, streamline missing and none, etc

In [43]:
df_para['answer_changed'] = False
group_columns = ['interview__id', 'param', 'roster_level']  # group dfs by interview, variable and roster instance

## List questions

In [44]:
df_para_list = df_para[(df_para['type'] == 'TextListQuestion') & (df_para['event'] == 'AnswerSet')].copy()
grouped_list = df_para_list.groupby(group_columns)
for _, group in grouped_list:
    prev_answers = set()  # set an empty set for previous answers
    for index, row in group.iterrows():
        row_answers = set(row['answer'].split('|')) if pd.notnull(row['answer']) else set()
        if prev_answers.difference(row_answers):
            df_para_list.at[
                index, 'answer_changed'] = True  # can be removed, just to verify more easily
            df_para.at[index, 'answer_changed'] = True
        prev_answers = row_answers

Currently counting any change, irrespective if it is deleting a list item, or changing the spelling or updating the list item. There is a lot of the latter, which is not an issue per se, more a sign of better control. Maybe we should redefine how change answer and remove answer are defined. E.g unselecting a multi_select, changing yes/no, removing a list item as remove answer? These actions are worse if they entail a lot of enablements.

In [45]:
df_para_question = df_para[(~df_para['type'].isin(['YesNoQuestion', 'MultyOptionsQuestion', 'TextListQuestion', 'Variable'])) & (df_para['event'] == 'AnswerSet')].copy()
df_para_question = df_para_question[df_para_question.duplicated(subset=group_columns, keep=False)]
if df_para_question.shape[0] > 0:
    grouped_question = df_para_question.groupby(group_columns)
    for _, group in grouped_question:
        prev_answer = None  # set an empty answer for previous answers
        for index, row in group.iterrows():
            row_answer = row['answer']
            if prev_answer is not None and prev_answer != row_answer:
                df_para_question.at[index, 'answer_changed'] = True  # can be removed, just to verify more easily
                df_para.at[index, 'answer_changed'] = True
            prev_answer = row_answer

@Gabriele, above cell takes quite long to execute, can we write this faster?

## YesNo questions

In [46]:
df_para_yesno = df_para[(df_para['type'] == 'YesNoQuestion') & (df_para['event'] == 'AnswerSet')].copy()
if df_para_yesno.shape[0] > 0:
    df_para_yesno[['yes_answers', 'no_answers']] = df_para_yesno['answer'].str.split('|', expand=True)
    grouped_yesno = df_para_yesno.groupby(group_columns)

    for _, group in grouped_yesno:
        prev_yes_answers = set()  # set an empty set for previous yes-answers
        for index, row in group.iterrows():
            yes_answers = set(row['yes_answers'].split(', ')) if pd.notnull(row['yes_answers']) else set()
            no_answers = set(row['no_answers'].split(', ')) if pd.notnull(row['no_answers']) else set()

            if len(prev_yes_answers.intersection(no_answers)) > 0:
                df_para_yesno.at[
                    index, 'answer_changed'] = True  # can be removed, just to verify more easily
                df_para.at[index, 'answer_changed'] = True
            prev_yes_answers = yes_answers

## Normal multi-answer questions

In [47]:
df_para_multi = df_para[(df_para['type'] == 'MultyOptionsQuestion') & (df_para['event'] == 'AnswerSet')].copy()
if df_para_multi.shape[0] > 0:
    grouped_multi = df_para_multi.groupby(group_columns)
    for _, group in grouped_multi:
        prev_answers = set()  # set an empty set for previous answers
        for index, row in group.iterrows():
            row_answers = set(row['answer'].split(', ')) if pd.notnull(row['answer']) else set()
            if prev_answers.difference(row_answers):
                df_para_multi.at[
                    index, 'answer_changed'] = True  # can be removed, just to verify more easily
                df_para.at[index, 'answer_changed'] = True
            prev_answers = row_answers

# Summary

In [48]:
summarize_group = ['interview__id', 'param']
n_answer_changed = df_para[df_para['answer_changed']].groupby(summarize_group).size().reset_index(name='n_answer_changed')
#interview_questions = pd.DataFrame(df_para[df_para['event'] == 'AnswerSet'][summarize_group].drop_duplicates())
#n_answer_changed = interview_questions.merge(n_answer_changed, on=summarize_group, how='left').fillna({'n_answer_changed': 0})
# @Gabriele, so we need the 0 counts?
n_answer_changed

Unnamed: 0,interview__id,param,n_answer_changed
0,000bd8ca8f0e47f49294facfa53c41d8,n10,1
1,000bd8ca8f0e47f49294facfa53c41d8,n15,2
2,000bd8ca8f0e47f49294facfa53c41d8,n16,1
3,000bd8ca8f0e47f49294facfa53c41d8,n16aa,1
4,000bd8ca8f0e47f49294facfa53c41d8,n7,3
...,...,...,...
65961,fffd69e86d1c474da9f035f4bb8a900c,n13g,1
65962,fffd69e86d1c474da9f035f4bb8a900c,n13h,1
65963,fffd69e86d1c474da9f035f4bb8a900c,n14,1
65964,fffd69e86d1c474da9f035f4bb8a900c,n22,1
