In [7]:
# Setup
import pandas as pd
from pandas.testing import assert_series_equal

# Show floats with one decimal place in displayed output.
pd.options.display.float_format = '{:.1f}'.format

# Load the question list and the raw responses, then relabel the response
# columns with the column names that index the question list.

questions = pd.read_csv('./config/questions.csv', index_col=0).squeeze()
col_names = questions.index.to_series()
df = pd.read_csv('./data/source.csv')

# A trailing ".<digits>" on a source header is stripped before the headers are
# compared, position by position, with the configured question texts.
questions_in_source = pd.Series(df.columns).replace(to_replace=r'\.[0-9]+$', value='', regex=True)
assert_series_equal(questions, questions_in_source, check_names=False, check_index=False)

df.columns = col_names
# Response numbers start at 1 instead of 0.
df.index = (df.index + 1).rename('回答番号')

# Responses whose answer in column B is anything other than the consent text.
gave_consent = df['B'] == '了承して、回答する'
drop_df = df[~gave_consent]

# Remove the responses listed in the after-coding drop sheet; afterwards only
# consenting responses may remain.
ac_drop_df = pd.read_csv('./data/after_codings/drops.csv')
df = df.drop(index=ac_drop_df['回答番号'])
assert df['B'].unique().tolist() == ['了承して、回答する']

# Copy of the remaining responses with "<column name>: <question text>" headers.
with_column_names_df = df.copy()
with_column_names_df.columns = col_names + ': ' + questions


In [8]:
# Setup choices_df
# choices_df holds the candidate choices of every question, indexed by column
# name; the "タイムスタンプ" question is left out of the index.
is_timestamp_question = questions == "タイムスタンプ"

choices_df = pd.read_csv('./config/choices.csv')
# The question texts in choices.csv must equal, position by position, the
# configured questions without the timestamp question.
assert_series_equal(choices_df['設問文章'], questions[~is_timestamp_question], check_names=False, check_index=False)

timestamp_col_names = questions[is_timestamp_question].index
choices_df.index = col_names.drop(index=timestamp_col_names).values
# Turn the comma-separated choice text into a list per question.
choices_df['選択肢'] = choices_df['選択肢'].str.split(',')


In [9]:
# Output raw data with modified column names.
# with_column_names_df is the copy of df taken after the drop list was applied,
# with "<column name>: <question text>" headers; the 回答番号 index is written
# as the first CSV column because to_csv keeps the index by default.

with_column_names_df.to_csv('out/raw.csv')


In [10]:
# Export the responses collected in drop_df, labelled with
# "<column name>: <question text>" headers.
labelled_columns = col_names + ': ' + questions
out_drop_df = drop_df.copy()
out_drop_df.columns = labelled_columns
out_drop_df.to_csv('out/drop.csv')

In [11]:
# Explode multiple choices into list, extract free answers


def _split_answer_text(text, choices_set):
    """Split the raw text of one multi-choice answer into its parts.

    The text is split on ', '. Leading parts that are known choices are kept
    as they are; from the first part that is not a known choice onwards the
    remainder is re-joined into a single free-answer text (a respondent can
    type ', ' inside a free answer).

    Parameters:
        text: raw cell value; anything that is not a str yields no choices.
        choices_set: set of the valid choice texts of the column.

    Returns:
        (parts, free_text) where parts is the list of selected choices with
        the free answer (if any) as its last element, and free_text is that
        free answer or None when every part is a known choice.
    """
    if not isinstance(text, str):
        return [], None
    parts = text.split(', ')
    for i, part in enumerate(parts):
        if part not in choices_set:
            free_text = ', '.join(parts[i:])
            return parts[:i] + [free_text], free_text
    return parts, None


# {column name: {free answer text: [response numbers]}} for every column that
# is flagged as accepting free answers.
free_answer_to_indices_dict_by_col = { col: {} for col in choices_df[choices_df['自由記述'] == 1].index }

# single choice cols: every answer text that is not one of the listed choices
# counts as a free answer.
for col in choices_df.loc[(choices_df['複数回答'] != 1) & (choices_df['自由記述'] == 1), :].index:
    # Group the response numbers by answer text (rows without an answer are
    # left out by groupby).
    indices_by_answer = df.index.to_series().groupby(df[col]).apply(list)
    free_answer_to_indices_dict_by_col[col] = indices_by_answer.drop(choices_df.loc[col, '選択肢'], errors='ignore').to_dict()

# multi choice cols, with list
multi_choice_answers_df = df[choices_df.loc[choices_df['複数回答'] == 1, :].index].copy()
for col in multi_choice_answers_df.columns:
    choices_set = frozenset(choices_df.loc[col, '選択肢'])

    def split_choices(row):
        """Return the choice list of a one-cell row and record its free answer."""
        selected, free_text = _split_answer_text(row.values[0], choices_set)
        if free_text is not None:
            # NOTE(review): a multi-choice column without the 自由記述 flag has
            # no entry in this dict, so an unmatched text there raises KeyError.
            free_answer_to_indices_dict_by_col[col].setdefault(free_text, []).append(row.name)
        return selected

    # Applied row-wise on a one-column frame so that row.name carries the
    # response number.
    multi_choice_answers_df[col] = multi_choice_answers_df[col].to_frame().apply(split_choices, axis='columns')

# Per column, the free answers ordered from most to least frequent.
col_free_answer_and_count_tuple_tuple = [
    (col, sorted(indices_by_text.items(), key=lambda t: len(t[1]), reverse=True))
    for col, indices_by_text in free_answer_to_indices_dict_by_col.items()
]

# One record per (column, free answer). Columns without any free answer simply
# contribute no record instead of producing a NaN placeholder row that cannot
# be unpacked.
free_answer_records = [
    (col, text, len(indices), indices)
    for col, groups in col_free_answer_and_count_tuple_tuple
    for text, indices in groups
]

out_free_answers_df = pd.DataFrame(free_answer_records, columns=['列番号', '回答文字列', '出現回数', '回答番号'])
# One output row per response that gave the free answer.
out_free_answers_df = out_free_answers_df.explode('回答番号')
out_free_answers_df.insert(1, '設問文章', choices_df.loc[out_free_answers_df['列番号'], '設問文章'].values)
# Attach the full labelled response to every row for context.
out_free_answers_df = pd.merge(out_free_answers_df, with_column_names_df, how='left', left_on='回答番号', right_index=True)
out_free_answers_df.to_csv('out/free_answers.csv', index=False)

In [12]:
# Find duplicated responses

dup_config_df = pd.read_csv('./config/duplicated.csv')


dup_groups_list = []

# Responses that are identical in every column except A
# (presumably the timestamp column - TODO confirm).
full_dup_index = df[df.duplicated(df.columns.difference(['A']), keep=False)].index
full_dup_groups = pd.DataFrame(full_dup_index, columns=['回答番号'])
full_dup_groups.insert(0, '列番号', 'ALL')
full_dup_groups.insert(1, '回答', 'すべて一致')
dup_groups_list.append(full_dup_groups)

# Fully duplicated responses are reported once above and are excluded from the
# per-column search below.
non_full_dup_df = df.drop(index=full_dup_index)


for col in dup_config_df.loc[:, '質問（列）']:
    # Answers of this column that occur more than once.
    dup_series = non_full_dup_df.loc[non_full_dup_df.duplicated(col, keep=False), col]
    # answer text -> list of response numbers sharing it
    dup_groups = dup_series.groupby(dup_series).apply(lambda x: list(x.index))
    # Repeated answers that are ordinary choices are expected, so drop them.
    # The choice list is looked up by label; integer-as-position access on
    # this label-indexed Series is deprecated in pandas.
    dup_groups = dup_groups.drop(choices_df.loc[col, '選択肢'], errors="ignore").explode().to_frame()
    dup_groups.columns = ['回答番号']
    dup_groups.insert(0, '列番号', col)
    dup_groups.insert(1, '回答', dup_groups.index)
    dup_groups_list.append(dup_groups)

out_dup_df = pd.concat(dup_groups_list, ignore_index=True)
# Attach the full labelled response to every row for context.
out_dup_df = pd.merge(out_dup_df, with_column_names_df, how='left', left_on='回答番号', right_index=True)
out_dup_df.to_csv('out/duplicates.csv', index=False)

In [13]:
# Exclusive choices
#
# For every configured rule "<choice>＋ほかの選択肢" list the responses that
# selected <choice> together with at least one other item, along with the
# proposed corrected answer.

excl_config_df = pd.read_csv('./config/exclusive_choices.csv')

other_answer_keyword = "ほかの選択肢"
other_answer_suffix = '＋' + other_answer_keyword

excl_columns = ['回答番号', '列番号', '排他的選択肢', '回答', '変更後']
excl_entry_list = []

for _, row in excl_config_df.iterrows():
    col = row['質問（列）']
    before = row['変更前']
    if pd.isna(before):
        continue
    if not before.endswith(other_answer_suffix):
        # Raising a plain string is itself a TypeError; raise a real exception
        # with the offending text interpolated.
        raise ValueError(f"Unsupported exclusive before text: {before}")
    before_choice = before.replace(other_answer_suffix, '')
    choice_list = choices_df.loc[col, '選択肢']
    if before_choice not in choice_list:
        raise ValueError(f"\"{before_choice}\" is not a valid choice")

    # Raw answers of the responses that combine the exclusive choice with
    # something else.
    is_conflicting = multi_choice_answers_df[col].apply(lambda arr: before_choice in arr and len(arr) >= 2)
    matched_series = df.loc[is_conflicting, col]

    if row['変更後'] == other_answer_keyword:
        # Keep the other selections and remove the exclusive choice. Joined
        # with ', ' to match the raw answer format, which is also the
        # separator the after-coding step splits this sheet on.
        after_values = [
            ', '.join(x for x in arr if x != before_choice)
            for arr in multi_choice_answers_df.loc[matched_series.index, col]
        ]
    else:
        after_values = [row['変更後']] * len(matched_series)

    excl_entry_df = pd.DataFrame({
        '回答番号': matched_series.index,
        '列番号': [col] * len(matched_series),
        '排他的選択肢': [before_choice] * len(matched_series),
        '回答': matched_series.values,
        '変更後': after_values
    })
    excl_entry_list.append(excl_entry_df)

# pd.concat rejects an empty list, so fall back to a header-only frame when no
# rule produced an entry.
if excl_entry_list:
    out_excl_df = pd.concat(excl_entry_list, ignore_index=True)
else:
    out_excl_df = pd.DataFrame(columns=excl_columns)
out_excl_df.to_csv('out/exclusives.csv', index=False)


In [14]:
# Detect age contradictions
#
# A response is flagged when the value in any of answer_age_columns is greater
# than the value in current_age_column (presumably the respondent's current
# age - TODO confirm against the questionnaire).

current_age_column = 'D'
answer_age_columns = ['AT', 'AX', 'AY', 'CY']

# True where an answer age exceeds the current age of the same response.
has_age_contradictions_df = df[answer_age_columns].gt(df[current_age_column], axis='index')
contradicting_rows = has_age_contradictions_df.any(axis='columns')

# Keep only the flagged responses and blank out the cells that are consistent.
out_age_contra_df = df.loc[contradicting_rows, answer_age_columns].where(has_age_contradictions_df[contradicting_rows])
# Label the reference column from current_age_column rather than a second
# hard-coded 'D', so both stay in sync if the setting changes.
out_age_contra_df.insert(0, current_age_column, df.loc[out_age_contra_df.index, current_age_column])
# TODO: add question text to columns
out_age_contra_df.astype('Int64').to_csv('out/age_contradictions.csv')


In [20]:
# After codings
#
# Collect the manual corrections from the after-coding sheets into one change
# list (列番号, 回答番号, 修正前, 修正後), check every 修正前 against the current
# data, apply the 修正後 values and export the corrected responses.


def _load_age_changes(age_df, column):
    """Extract the corrections of one age column from the age after-coding sheet.

    Parameters:
        age_df: frame with the columns 回答番号, <column> and <column>の変更後.
        column: name of the age column, e.g. 'D'.

    Returns:
        Frame with the columns 列番号, 回答番号, 修正前, 修正後 holding only the
        rows whose <column>の変更後 cell is filled in; 'NA' cells become NaN
        and both value columns are float64.
    """
    changes = age_df[['回答番号', column, column + 'の変更後']].copy()
    changes.columns = ['回答番号', '修正前', '修正後']
    changes.dropna(subset=['修正後'], inplace=True)
    changes[['修正前', '修正後']] = changes[['修正前', '修正後']].replace('NA', None).astype('float64')
    changes.insert(0, '列番号', column)
    return changes


# Only the empty string is read as missing so that a literal 'NA' survives as
# text until _load_age_changes converts it.
ac_age_df = pd.read_csv('./data/after_codings/age_contradictions.csv', keep_default_na=False, na_values=[''])

ac_age_d_df = _load_age_changes(ac_age_df, 'D')
ac_age_at_df = _load_age_changes(ac_age_df, 'AT')

# Exclusive-choice corrections replace the whole answer, so both sides are
# kept as lists of choices.
ac_exclusives_df = pd.read_csv('./data/after_codings/exclusives.csv')[['列番号', '回答番号', '回答', '変更後']]
ac_exclusives_df.columns = ['列番号', '回答番号', '修正前', '修正後']
ac_exclusives_df['修正前'] = ac_exclusives_df['修正前'].str.split(', ')
ac_exclusives_df['修正後'] = ac_exclusives_df['修正後'].str.split(', ')


# transforms.csv identifies the question by its text; map it to the column name.
q_df = questions.reset_index()
q_df.columns = ['列番号', '設問文章']
ac_transforms_df = pd.read_csv('./data/after_codings/transforms.csv').merge(q_df, how='left', on='設問文章')[['列番号', '回答番号', '修正前', '修正後']]

ac_others_df = pd.read_csv('./data/after_codings/others.csv')[['列番号', '回答番号', '回答文字列', '変更後の回答']]
ac_others_df.columns = ['列番号', '回答番号', '修正前', '修正後']

ac_data_df = df.copy()
ac_multi_choice_df = multi_choice_answers_df.copy()

ac_changes = pd.concat([ac_age_d_df, ac_age_at_df, ac_exclusives_df, ac_transforms_df, ac_others_df])
for column, group in ac_changes.groupby('列番号', sort=False):
    if column in ac_multi_choice_df.columns:
        for _, row in group.iterrows():
            # .at reads and writes exactly one cell, so a list is stored as
            # the cell value rather than being spread over several cells.
            before_list = ac_multi_choice_df.at[row['回答番号'], column]
            if isinstance(row['修正前'], list):
                # Whole-answer replacement: the current list must match exactly.
                if before_list != row['修正前']:
                    raise ValueError(f'element {row["修正前"]} does not match for index {row["回答番号"]} of column {column} (actual {before_list})')
                ac_multi_choice_df.at[row['回答番号'], column] = row['修正後']
            else:
                # Single-element replacement, compared ignoring surrounding whitespace.
                if row['修正前'] not in map(str.strip, before_list):
                    raise ValueError(f'element {row["修正前"]} not found for index {row["回答番号"]} of column {column} (actual {before_list})')
                # dict.fromkeys removes duplicates created by the replacement
                # while keeping the original order.
                ac_multi_choice_df.at[row['回答番号'], column] = list(dict.fromkeys([row['修正後'] if x.strip() == row['修正前'] else x for x in before_list]))
    else:
        before_values = group.set_index('回答番号')['修正前']
        after_values = group.set_index('回答番号')['修正後']
        assert_series_equal(ac_data_df.loc[group['回答番号'], column], before_values, check_names=False, check_dtype=False)
        # Update a copy and assign it back: an in-place update on the result
        # of ac_data_df[column] does not reach the frame under Copy-on-Write.
        # NOTE(review): Series.update skips NaN, so a correction whose 修正後 is
        # NaN (e.g. an age 'NA') leaves the old value in place - confirm intent.
        updated_column = ac_data_df[column].copy()
        updated_column.update(after_values)
        ac_data_df[column] = updated_column

# Multi-choice answers go back to their ', '-joined text form for the export.
out_ac_df = ac_data_df.copy()
for column, series in ac_multi_choice_df.items():
    out_ac_df[column] = series.str.join(', ')

out_ac_df.to_csv('out/after_coded.csv')