In [1]:
import pandas as pd
from pandas.testing import assert_series_equal

# Question texts keyed by column id: the first CSV column becomes the index and
# the single remaining column is squeezed into a Series.
questions: pd.Series = pd.read_csv('./config/questions.csv', index_col=0).squeeze()

# Setup choices_df
# choices_df contains multiple candidates of choice indexed by column name
choices_df = pd.read_csv('./config/choices.csv')
col_names = questions.index.to_series()

# Sanity check: the rows of choices.csv must list the same question texts, in
# the same order, as questions.csv with the timestamp question removed.
assert_series_equal(choices_df['設問文章'], questions[questions != "タイムスタンプ"], check_names=False, check_index=False)
# Re-key choices_df by the question column ids (timestamp column excluded).
choices_df.index = col_names.drop(index=questions[questions == "タイムスタンプ"].index).values
# Comma-separated choice strings become lists; missing entries stay NaN here.
choices_df['選択肢'] = choices_df['選択肢'].str.split(',')
# Normalise "no choices" to None. The result is assigned back instead of using
# a chained `inplace=True` call on the column selection, which operates on a
# temporary object under pandas Copy-on-Write and would leave choices_df as is.
choices_df['選択肢'] = choices_df['選択肢'].where(choices_df['選択肢'].notna(), None)

# Load the coded answers, keyed by response number.
# NOTE: list cells come back from feather as np.array, so every
# multiple-answer column is converted back to plain Python lists below.
df = pd.read_feather('out/after_coded.feather').set_index('回答番号')
multi_answer_columns = choices_df.loc[choices_df['複数回答'] == 1].index
for col in multi_answer_columns:
    df[col] = df[col].map(lambda cell: cell.tolist())

# Questions whose choice lists should start with the two labels prioritised below.
label_q_ids = ['Y', 'AN', 'AS']

# One row per (question id, choice); the question id is repeated in the index.
exploded_choices_df: pd.Series = choices_df.loc[label_q_ids, '選択肢'].explode()
# The key maps the two prioritised labels to 1 and every other choice to NaN;
# NaN sorts last by default, so the prioritised labels move to the front.
# A stable sort is required so that all remaining choices keep their original
# relative order (the default quicksort gives no such guarantee).
sorted_choices_df = exploded_choices_df.sort_values(
    key=lambda series: series.map({'アロマンティック': 1, 'アセクシュアル': 1}),
    kind='mergesort',
)
# Collapse back to one list of choices per question id and write it into choices_df.
updated_exploded_choices_df = sorted_choices_df.groupby(level=0).agg(list)
choices_df.update(updated_exploded_choices_df)

# Region label -> prefectures that belong to it.
location_source = {
    '北海道': ['北海道'],
    '東北': ['青森県', '岩手県', '宮城県', '秋田県', '山形県', '福島県'],
    '南関東 (東京都を除く)': ['埼玉県', '千葉県', '神奈川県'],
    '東京都': ['東京都'],
    '北関東・甲信': ['茨城県', '栃木県', '群馬県', '山梨県', '長野県'],
    '北陸': ['新潟県', '富山県', '石川県', '福井県'],
    '東海': ['岐阜県', '静岡県', '愛知県', '三重県'],
    '近畿': ['滋賀県', '京都府', '大阪府', '兵庫県', '奈良県', '和歌山県'],
    '中国': ['鳥取県', '島根県', '岡山県', '広島県', '山口県'],
    '四国': ['徳島県', '香川県', '愛媛県', '高知県'],
    '九州・沖縄': ['福岡県', '佐賀県', '長崎県', '熊本県', '大分県', '宮崎県', '鹿児島県', '沖縄県'],
}

# Inverted lookup: prefecture -> region label.
location_map = {}
for region, prefectures in location_source.items():
    location_map.update(dict.fromkeys(prefectures, region))



# Display labels for the age brackets, youngest first; one label per bin
# produced by cut_ages_bin.
ages_bin_labels = [
    '13歳未満',
    '13~18歳',
    '19~22歳',
    '23~25歳',
    '26~29歳',
    '30~34歳',
    '35~39歳',
    '40~44歳',
    '45歳以上',
]


def cut_ages_bin(ages):
    """Bucket numeric ages into the labelled brackets of ``ages_bin_labels``.

    Bins are left-closed / right-open (``right=False``), so an age equal to a
    lower edge falls into the bracket starting at that edge. Values outside
    ``[-1, 10000)`` fall into no bin.
    """
    bin_edges = [-1, 13, 19, 23, 26, 30, 35, 40, 45, 10000]
    return pd.cut(ages, bins=bin_edges, labels=ages_bin_labels, right=False)


# Apply transformations
# Every question whose text mentions age is bucketed into brackets, and its
# choice list is replaced by the bracket labels.
for column in questions[questions.str.contains(r'年齢|何歳', regex=True)].index.values:
    df[column] = cut_ages_bin(df[column])
    choices_df.at[column, '選択肢'] = ages_bin_labels
# Column 'E' is mapped through location_map (prefecture -> region).
df['E'] = df['E'].replace(location_map)
# Store a plain list (like every other choices cell) rather than a live
# dict_keys view, which tracks later changes to location_source and cannot be
# pickled.
choices_df.at['E', '選択肢'] = list(location_source)


In [7]:
# Generate Markdown
from pytablewriter import MarkdownTableWriter

# Per-question choices that are always moved to the end of the table.
other_groups_map = {
    'CO': ['その他', 'とくになかった'],
    'CQ': ['その他', 'とくにない'],
}

# Write one section per question: heading, respondent count, frequency table.
# The encoding is explicit because the report contains Japanese text and the
# platform default encoding differs between systems.
with open("simple_agg.md", "w", encoding="utf-8") as f:
    column: str
    q_text: str
    for num, (column, q_text) in enumerate(questions.drop(['A', 'B', 'DL']).items(), start=1):
        values = df[column].dropna()
        if choices_df.loc[column, '複数回答'] == 1:
            # Drop empty lists; the mask is built from `values` itself so it
            # shares the index of the series being filtered.
            values = values[values.astype(bool)]

        # n counts respondents, not selected choices.
        n = len(values)
        f.write(f"#### {num}. {q_text.strip()}\n\n(n={n})\n\n")

        if choices_df.loc[column, 'テキスト回答'] == 1:
            f.write('(自由記述のため省略)\n')
            continue

        # Frequency table: explode() counts each choice of a multi-answer
        # response separately, while percentages stay relative to n.
        data = values.explode().value_counts(sort=False).rename('回答数').to_frame()
        data['割合 (%)'] = (data['回答数'] * 100 / n).round(1)

        choices_list = choices_df.at[column, '選択肢']
        if choices_list is None:
            # No declared choices: order rows by the observed values.
            data.sort_index(inplace=True)
        else:
            # Declared choices plus any observed answers that were not declared.
            choices = pd.Index(choices_list).union(data.index, sort=False)

            # Move the "other"-style choices of this question to the end.
            other_groups = other_groups_map.get(column, [])
            index_order = choices.drop(other_groups, errors='ignore').append(pd.Index(other_groups))

            # Choices that nobody selected appear with a count of 0.
            data = data.reindex(index_order, fill_value=0)
        data.index.name = '選択肢'
        data.reset_index(inplace=True)

        writer = MarkdownTableWriter(dataframe=data)
        f.write(writer.dumps())
        f.write('\n\n')
