In [13]:
# Setup
import pandas as pd
from pandas.testing import assert_series_equal

# Load the survey responses and the question config, then relabel the response
# columns with the short column names (A, B, C, ...) taken from the config.

df = pd.read_csv('./data/source.csv')
q_df = pd.read_csv('./config/questions.csv')
col_names = q_df.iloc[:, 0]
questions = q_df.iloc[:, 1]

# The source headers must match the configured question texts once a trailing
# ".<digits>" suffix is stripped (e.g. the ".1" that read_csv appends to a
# repeated header name).
questions_in_source = pd.Series(df.columns).replace(
    to_replace=r'\.[0-9]+$',
    value=r'',
    regex=True,
)
assert_series_equal(questions, questions_in_source, check_names=False)

df.columns = col_names.values

# Rows whose column B holds anything other than the expected answer are set
# aside in drop_df and removed from df.
drop_df = df[df['B'] != '了承して、回答する']
df = df.drop(index=drop_df.index)

# Displayed as the cell output: only the expected answer should remain.
df['B'].unique()


array(['了承して、回答する'], dtype=object)

In [14]:
# Export the rows that were removed from df, labelled with 1-based response
# numbers (row label + 1).
out_drop_df = drop_df.copy()
out_drop_df.index = out_drop_df.index + 1
out_drop_df.to_csv('out/drop.csv', index_label='回答番号')

In [5]:
# Build choices_df: one row per question except the timestamp one, indexed by
# column name, with the candidate choices of each question held as a list.
choices_df = pd.read_csv('./config/choices.csv')

# The timestamp question has no row in the choices config, so it is masked out
# both for the consistency check and for the index labels.
is_timestamp = questions == "タイムスタンプ"
assert_series_equal(
    choices_df['設問文章'],
    questions[~is_timestamp],
    check_names=False,
    check_index=False,
)
choices_df.index = col_names[~is_timestamp].values

# The choices are stored comma-separated in the config; turn them into lists.
choices_df['選択肢'] = choices_df['選択肢'].str.split(',')


In [6]:
# Export the filtered responses with "<column name>: <question text>" headers
# and 1-based response numbers (row label + 1).

raw_headers = col_names + ': ' + questions

out_raw_df = df.copy()
out_raw_df.columns = raw_headers
out_raw_df.index = out_raw_df.index + 1
out_raw_df.to_csv('out/raw.csv', index_label="回答番号")


In [7]:
# Explode multiple choices into list, extract free answers
#
# Produces:
#   multi_choice_answers_df        - the multi-choice columns of df, each cell
#                                    converted to a list of answers
#   free_answers_count_dict_by_col - {column name: {free-answer text: occurrences}}
#   out/free_answers.csv           - the counts above as a long table, sorted
#                                    by occurrences (ascending)

free_answers_count_dict_by_col = { col: {} for col in choices_df[choices_df['自由記述'] == 1].index }

# single choice cols: every answer that is not a predefined choice is a free answer
for col in choices_df.loc[(choices_df['複数回答'] != 1) & (choices_df['自由記述'] == 1), :].index:
    free_answers_count_dict_by_col[col] = df.loc[:, col].value_counts().drop(choices_df.loc[col, '選択肢'], errors='ignore').to_dict()

# multi choice cols, with list
multi_choice_answers_df = df[choices_df.loc[choices_df['複数回答'] == 1, :].index].copy()
for col in multi_choice_answers_df.columns:
    choices_set = frozenset(choices_df.loc[col, '選択肢'])

    def split_choices(text):
        """Split one multi-choice cell into a list of answers.

        The cell is split on ', '. Leading tokens that are predefined choices
        are kept one by one; from the first token that is not a predefined
        choice onward, the rest of the cell is re-joined into a single
        free-answer entry and counted in free_answers_count_dict_by_col[col].

        Non-string cells (e.g. NaN for an unanswered question) yield [].
        """
        if not isinstance(text, str):
            return []
        attrs = text.split(', ')
        ret = []
        for i, attr in enumerate(attrs):
            if attr in choices_set:
                ret.append(attr)
            else:
                # Join texts after first free answer text (users can input exact ', ')
                rem = ', '.join(attrs[i:])
                ret.append(rem)
                dic = free_answers_count_dict_by_col[col]
                dic[rem] = dic.get(rem, 0) + 1
                break
        return ret

    multi_choice_answers_df[col] = multi_choice_answers_df[col].apply(split_choices)

# Flatten the per-column counts into (column, answer text, count) records.
# Building the long table directly keeps the counts as integers; no NaN-padded
# wide intermediate table is involved.
free_answer_records = [
    (col, answer, count)
    for col, counts in free_answers_count_dict_by_col.items()
    for answer, count in counts.items()
]
out_free_answers_count_df = pd.DataFrame(free_answer_records, columns=['列番号', '回答文字列', '出現回数'])
out_free_answers_count_df.insert(1, '設問文章', choices_df.loc[out_free_answers_count_df['列番号'], '設問文章'].values)
# sort_values returns a new frame, so the result has to be assigned back for
# the sort to take effect in the exported CSV.
out_free_answers_count_df = out_free_answers_count_df.sort_values('出現回数', kind='stable').reset_index(drop=True)
out_free_answers_count_df.to_csv('out/free_answers.csv', index=False)

In [8]:
# Find duplicated answers
#
# Two kinds of duplicates are written to out/duplicates.csv:
#   * 'ALL'    - responses identical in every column except A
#   * per col  - for each column listed in config/duplicated.csv, responses
#                (not already reported under 'ALL') that share the same value,
#                ignoring values that are predefined choices

dup_config_df = pd.read_csv('./config/duplicated.csv')


dup_groups_list = []

# Column A is left out of the comparison (presumably the timestamp, which
# differs between otherwise identical submissions - confirm against the config).
full_dup_index = df[df.duplicated(df.columns.difference(['A']), keep=False)].index
full_dup_groups = pd.DataFrame(full_dup_index, columns=['回答番号'])
full_dup_groups.insert(0, '列番号', 'ALL')
full_dup_groups.insert(1, '回答', 'すべて一致')
dup_groups_list.append(full_dup_groups)

non_full_dup_df = df.drop(index=full_dup_index)


for col in dup_config_df.loc[:, '質問（列）']:
    dup_series = non_full_dup_df.loc[non_full_dup_df.duplicated(col, keep=False), col]
    # Map each duplicated answer value to the row labels that gave it.
    dup_groups = dup_series.groupby(dup_series)
    dup_groups = dup_groups.apply(lambda x: x.index)
    # The selection below is indexed by column name, so the first match is
    # taken by position with .iloc[0]; a bare [0] would be looked up as a label.
    col_choices = choices_df.loc[choices_df.index == col, '選択肢'].iloc[0]
    # Answers equal to a predefined choice are dropped from the report.
    dup_groups = dup_groups.drop(col_choices, errors="ignore").explode().to_frame()
    dup_groups.columns = ['回答番号']
    dup_groups.insert(0, '列番号', col)
    dup_groups.insert(1, '回答', dup_groups.index)
    dup_groups_list.append(dup_groups)

out_dup_df = pd.concat(dup_groups_list, ignore_index=True)
# Attach the full response row to every reported duplicate.
out_dup_df = pd.merge(out_dup_df, df, how='left', left_on='回答番号', right_index=True)
# Row labels become 1-based response numbers, as in the other exported CSVs.
out_dup_df['回答番号'] += 1
out_dup_df.to_csv('out/duplicates.csv', index=False)