In [55]:
import pandas as pd
from pandas.testing import assert_series_equal

# Question texts indexed by spreadsheet-style column id.
# squeeze('columns') keeps a Series even if the CSV happens to contain a single row.
questions: pd.Series = pd.read_csv('./config/questions.csv', index_col=0).squeeze('columns')

# Setup choices_df
# choices_df contains multiple candidates of choice indexed by column name
choices_df = pd.read_csv('./config/choices.csv')
col_names = questions.index.to_series()

# choices.csv must list the same questions, in the same order, as questions.csv minus the timestamp column.
assert_series_equal(choices_df['設問文章'], questions[questions != "タイムスタンプ"], check_names=False, check_index=False)
choices_df.index = col_names.drop(index=questions[questions == "タイムスタンプ"].index).values
choices_df['選択肢'] = choices_df['選択肢'].str.split(',')
# Questions without predefined choices are marked with None (later code tests `is None`).
# Assign the result back instead of using a chained `inplace=True`, which updates only a
# temporary object when pandas Copy-on-Write is active and would leave NaN in place.
choices_df['選択肢'] = choices_df['選択肢'].where(choices_df['選択肢'].notna(), None)

# NOTE: feather hands list cells back as np.array(), so the multiple-answer
# columns are converted back to plain Python lists right after loading.
df = pd.read_feather('out/after_coded.feather').set_index('回答番号')
multi_answer_columns = choices_df.loc[choices_df['複数回答'] == 1].index
for col in multi_answer_columns:
    df[col] = df[col].map(lambda cell: cell.tolist())

# Questions whose choice lists should start with the core identity labels.
label_q_ids = ['Y', 'AN', 'AS']
# Labels listed here get sort key 1; every other label maps to NaN and is therefore sorted last.
leading_labels = {'アロマンティック': 1, 'アセクシュアル': 1}

exploded_choices_df: pd.Series = choices_df.loc[label_q_ids, '選択肢'].explode()
sorted_choices_df = exploded_choices_df.sort_values(
    key=lambda labels: labels.map(leading_labels),
)
# Collapse the exploded rows back into one list per question and write them into choices_df.
updated_exploded_choices_df = sorted_choices_df.groupby(level=0).agg(list)
choices_df.update(updated_exploded_choices_df)


# Abbreviated labels used when renaming table columns (see set_columns_label).
# Keys are the full choice labels; values are the short forms shown in column headers.
short_label_map = {
    'アセクシュアル': 'アセク',
    'アロマンティック': 'アロマ',
    'セクシュアル【性的に惹(ひ)かれる】': 'セク',
    'ロマンティック【恋愛的に惹(ひ)かれる】': 'ロマ',
    'デミセクシュアル': 'デミセク',
    'デミロマンティック': 'デミロマ',
    'グレイアセクシュアル/グレイセクシュアル': 'グレイセク',
    'グレイアロマンティック/グレイロマンティック': 'グレイロマ',
    'リスセクシュアル': 'リスセク',
    'リスロマンティック': 'リスロマ',
    'クエスチョニング': 'Q',
}


# Regional blocks and the prefectures that belong to each of them.
# The key order is also the display order of the regions.
location_source = {
    '北海道': ['北海道'],
    '東北': ['青森県', '岩手県', '宮城県', '秋田県', '山形県', '福島県'],
    '南関東 (東京都を除く)': ['埼玉県', '千葉県', '神奈川県'],
    '東京都': ['東京都'],
    '北関東・甲信': ['茨城県', '栃木県', '群馬県', '山梨県', '長野県'],
    '北陸': ['新潟県', '富山県', '石川県', '福井県'],
    '東海': ['岐阜県', '静岡県', '愛知県', '三重県'],
    '近畿': ['滋賀県', '京都府', '大阪府', '兵庫県', '奈良県', '和歌山県'],
    '中国': ['鳥取県', '島根県', '岡山県', '広島県', '山口県'],
    '四国': ['徳島県', '香川県', '愛媛県', '高知県'],
    '九州・沖縄': ['福岡県', '佐賀県', '長崎県', '熊本県', '大分県', '宮崎県', '鹿児島県', '沖縄県'],
}

# Inverted lookup: prefecture -> regional block.
location_map = {
    prefecture: region
    for region, prefectures in location_source.items()
    for prefecture in prefectures
}


# The first age class starts at 13, so any age of 12 or below means the input is not what we expect.
if (df['D'] <= 12).any():
    raise ValueError('Column D has one or more values less than 13')

# Age classes per question column.
# Each entry is (inclusive upper bound, label); cut_ages_bin turns the bounds into
# right-closed bin edges (with -1 as the lowest edge) and uses the labels as categories.
# 10000 acts as an open-ended upper bound for the last class.
ages_bin_map = {
    'D': [
        (18, '13~18歳'),
        (22, '19~22歳'),
        (25, '23~25歳'),
        (29, '26~29歳'),
        (34, '30~34歳'),
        (39, '35~39歳'),
        (44, '40~44歳'),
        (10000, '45歳以上'),
    ],
    'AT': [
        # A value of 0 gets its own class (label: "never felt a difference").
        (0, '0 (違いを感じたことがない)'),
        (12, '12歳以下'),
        (15, '13~15歳'),
        (18, '16~18歳'),
        (22, '19~22歳'),
        (25, '23~25歳'),
        (29, '26~29歳'),
        (34, '30~34歳'),
        (10000, '35歳以上'),
    ],
    'AX': [
        # A value of 0 gets its own class (label: "learned about it in this survey").
        (0, '0 (本調査で知った)'),
        (15, '15歳以下'),
        (18, '16~18歳'),
        (22, '19~22歳'),
        (25, '23~25歳'),
        (29, '26~29歳'),
        (34, '30~34歳'),
        (10000, '35歳以上'),
    ],
    'AY': [
        # A value of 0 gets its own class (label: "does not identify").
        (0, '0 (自認していない)'),
        (15, '15歳以下'),
        (18, '16~18歳'),
        (22, '19~22歳'),
        (25, '23~25歳'),
        (29, '26~29歳'),
        (34, '30~34歳'),
        # NOTE(review): this label uses a wave dash and an extra '歳' ('35歳〜39歳') while every
        # other range label follows the 'NN~NN歳' pattern — confirm whether that is intended.
        (39, '35歳〜39歳'),
        (10000, '40歳以上'),
    ],
    'CY': [
        (15, '15歳以下'),
        (18, '16~18歳'),
        (22, '19~22歳'),
        (25, '23~25歳'),
        (29, '26~29歳'),
        (10000, '30歳以上'),
    ],
}
def cut_ages_bin(ages, column):
    """Bucket raw ages into the labelled age classes configured for ``column``.

    Args:
        ages: numeric ages accepted by ``pd.cut``.
        column: key into ``ages_bin_map`` selecting the class definition.

    Returns:
        The result of ``pd.cut`` using the configured labels as categories.
    """
    classes = ages_bin_map[column]
    upper_bounds = [entry[0] for entry in classes]
    labels = [entry[1] for entry in classes]
    # -1 is the lowest edge so that 0 falls inside the first right-closed bin.
    return pd.cut(ages, [-1] + upper_bounds, labels=labels, right=True)


# Apply transformations
# NOTE: this section overwrites columns of df in place, so it is meant to run once per freshly loaded df.
column: str

# Age questions: replace the raw age with its age class and expose the class labels as the choice list.
for column in questions[questions.str.contains(r'年齢|何歳', regex=True)].index.values:
    df[column] = cut_ages_bin(df[column], column)
    choices_df.at[column, '選択肢'] = [x[1] for x in ages_bin_map[column]]

# Residence: collapse prefectures into regional blocks.
df['E'] = df['E'].replace(location_map)
# Store a real list (like every other row of '選択肢') rather than a live dict_keys view of location_source.
choices_df.at['E', '選択肢'] = list(location_source.keys())

# 1-5 strength scales: attach the "weak"/"strong" anchors to the end points.
for column in questions[questions.str.contains('強さを１～５で表現すると', regex=False)].index.values:
    num_orig_na = df[column].isna().sum()
    df[column] = df[column].map({ 1: '1 (弱い)', 2: '2', 3: '3', 4: '4', 5: '5 (強い)' })
    # map() turns unmapped values into NaN; the NaN count must not grow.
    assert df[column].isna().sum() == num_orig_na


In [12]:
# Calculate n for excluded answers

# b_name ='【必須項目】上記説明を全て読み、理解した上で本調査への協力をご了承いただける場合は、「了承して、回答する」を選択してください。'
# c_name = '【必須項目】自分のことをアロマンティック／アセクシュアル・スペクトラム(Aro/Ace)に当てはまると思いますか。'

# Raw export, relabelled so that rows and columns line up with df.
source_df = pd.read_csv('./data/source.csv', na_values=[''])
# Response numbers are the 1-based row positions of the raw export.
source_df.index = (source_df.index + 1).rename('回答番号')
source_df.columns = df.columns

# Remove the responses dropped during after-coding, except those dropped with the reason '回答拒否のため'.
ac_drop_df = pd.read_csv('./data/after_codings/drops.csv')
ac_drop_df = ac_drop_df.loc[ac_drop_df['理由'] != '回答拒否のため']
source_df = source_df.drop(index=pd.Index(ac_drop_df['回答番号']))

# Rows whose B answer is 'いいえ' get their C answer blanked.
answered_no_to_b = source_df['B'] == 'いいえ'
source_df.loc[answered_no_to_b, 'C'] = pd.NA


In [13]:
# Generate tabulation.json
from typing import TypedDict
import json

# Per-question choices that are moved to the end of the tabulated table (see the reorder step below).
other_groups_map = {
    'CO': ['その他', 'とくになかった'],
    'CQ': ['その他', 'とくにない'],
}

# Declarations for the loop variable used below.
column: str
# NOTE(review): q_text and num are not used anywhere in this cell — candidates for removal.
q_text: str
num = 0

# One entry of out/tabulation.json: the question title, the number of
# respondents counted, and the table rows.
Tabulation = TypedDict('Tabulation', {
    'title': str,
    'n': int,
    'table': list[list[str]],
})

tabulations: list[Tabulation] = []

for column, title in questions.drop(['A']).items():
    # B and C are tabulated from source_df instead of df.
    if column in ('B', 'C'):
        values = source_df[column].dropna()
    else:
        values = df[column].dropna()

    if choices_df.loc[column, '複数回答'] == 1:
        # Drop empty lists: they mean "no answer" and must not be counted in n.
        values = values[df[column].astype(bool)]

    n = len(values)

    tabulation: Tabulation = {'title': title, 'n': n}

    if choices_df.loc[column, 'テキスト回答'] == 1:
        # Free-text answers are not tabulated; only a placeholder row is emitted.
        tabulation['table'] = [['(自由記述)']]
        tabulations.append(tabulation)
        continue

    # Frequency table (one row per chosen option; multi-answer lists are exploded first).
    data = values.explode().value_counts(sort=False).rename('割合').to_frame()

    # reorder table
    choices_list = choices_df.at[column, '選択肢']
    if choices_list is None:
        data.sort_index(inplace=True)
    else:
        # Configured choices first (in configured order), then any value that only appears in the data.
        g_choices = pd.Index(choices_list).union(data.index, sort=False)

        # "Other"-like choices configured for this question go last.
        g_other_groups = other_groups_map.get(column, [])
        index_order = g_choices.drop(g_other_groups, errors='ignore').append(pd.Index(g_other_groups))

        data = data.reindex(index_order, fill_value=0)

    # format value
    data['割合'] = (data['割合'] * 100 / n).round(1).astype(str) + '%'

    data.index.name = '選択肢'
    data.reset_index(inplace=True)

    tabulation['table'] = data.values.tolist()

    tabulations.append(tabulation)

# ensure_ascii=False writes the Japanese text verbatim, so the file encoding is pinned
# to UTF-8 instead of depending on the platform's default locale encoding.
with open('out/tabulation.json', 'w', encoding='utf-8') as f:
    json.dump(tabulations, f, ensure_ascii=False)


In [45]:
# TODO: Unify with plot.ipynb

from matplotlib import pyplot as plt
import seaborn as sns
from seaborn import FacetGrid
from matplotlib.figure import Figure
import numpy as np
from plot import adjust_figure_for_h_grouped, adjust_figure_for_h_stack, adjust_figure_for_v_grouped, hbar, vbar, palette_3


def create_table_single(df: pd.DataFrame, column: str):
    """Build the percentage table for one question column.

    Args:
        df: answers, one column per question.
        column: question column id (index label of ``choices_df`` / ``questions``).

    Returns:
        ``(table, n)`` as produced by ``_create_table_single``.
    """
    answers = df[column].dropna()
    is_multi_answer = choices_df.loc[column, '複数回答'] == 1
    if is_multi_answer:
        # An empty list means nothing was selected, so it is excluded from n.
        answers = answers[df[column].astype(bool)]

    if '強さを１～５で表現すると' in questions.at[column]:
        # TODO: Apply only in figures?
        choices_list = ['1 (弱い)', '2', '3', '4', '5 (強い)']
    else:
        choices_list = choices_df.loc[column, '選択肢']
    return _create_table_single(answers, choices_list)

def _create_table_single(values: pd.Series, choices_list: list[str]):
    n = len(values)
    data = values.explode().value_counts(sort=not choices_list) * 100 / n

    if choices_list:
        choices = pd.Index(choices_list).union(data.index, sort=False)
        other_groups=pd.Index(['その他', 'とくになかった', 'とくにない', 'Aro/Aceを自認していない']).intersection(choices)
        index_order = choices.drop(other_groups).append(other_groups)
        data = data.reindex(index_order, fill_value=0)

    return data, n

# TODO: return N for each group
def create_table_cross(df: pd.DataFrame, column: str, group_by: str):
    """Cross-tabulate ``column`` by ``group_by`` as percentages within each group.

    Args:
        df: answers, one column per question.
        column: question column to tabulate (rows of the result).
        group_by: question column that defines the groups (columns of the result).

    Returns:
        ``(table, n_table)``: the percentage table and the number of respondents per group.
    """
    # Respondents who answered `column`; for multiple-answer questions the cell must be truthy.
    if choices_df.loc[column, '複数回答'] == 1:
        answered = df[column].astype(bool)
    else:
        answered = df[column].notna()
    n_table = df[group_by][answered].value_counts(sort=False)

    exploded_df = df[[column, group_by]].explode(column)
    counts = pd.crosstab(exploded_df[column], exploded_df[group_by])

    # "Other"-like labels that are pushed to the end of both axes.
    trailing_labels = ['その他', 'とくになかった', 'とくにない', 'Aro/Aceを自認していない', 'どれも使っていない']

    # Column (group) order: configured choices first, then unseen values, "other"-like last.
    g_choices = pd.Index(choices_df.loc[group_by, '選択肢']).union(pd.Index(counts.columns), sort=False)
    g_other_groups = pd.Index(trailing_labels).intersection(g_choices)
    col_order = g_choices.drop(g_other_groups, errors='ignore').append(pd.Index(g_other_groups))

    # Row (choice) order, built the same way.
    if '強さを１～５で表現すると' in questions.at[column]:
        # TODO: Apply only in figures?
        choices_list = ['1 (弱い)', '2', '3', '4', '5 (強い)']
    else:
        choices_list = choices_df.loc[column, '選択肢']
    choices = pd.Index(choices_list).union(counts.index.values, sort=False)
    other_groups = pd.Index(trailing_labels).intersection(choices)
    index_order = choices.drop(other_groups).append(other_groups)

    percents = counts.mul(100).div(n_table)
    # TODO: filter can drop nonexistent columns
    percents = percents.filter(col_order).reindex(index_order).fillna(0)  # fillna() for n=0

    # Remove index name to prevent from rendered on the graph
    percents = percents.rename_axis(index=None)

    return percents, n_table

def set_columns_label(table: pd.DataFrame, n_table: pd.Series):
    """Shorten the column labels of ``table`` in place and append ``(n=...)`` to each.

    The previous ``table.columns += n_table.map(...)`` paired counts with columns by
    position, so a count could be attached to the wrong group whenever ``n_table`` was
    ordered differently from the columns. Counts are now looked up by the original
    column label.

    Args:
        table: table whose columns are group labels; modified in place.
        n_table: respondents per group, normally indexed by the same group labels.

    Raises:
        ValueError: if ``n_table`` cannot be matched by label and its length differs
            from the number of columns.
    """
    group_labels = table.columns
    if group_labels.isin(n_table.index).all():
        # Align by label, in column order.
        counts = n_table.reindex(group_labels)
    else:
        # Labels do not match (e.g. differently keyed n_table): keep the historical positional pairing.
        counts = n_table
        if len(counts) != len(group_labels):
            raise ValueError(
                f'n_table has {len(counts)} entries but table has {len(group_labels)} columns'
            )

    table.rename(columns=short_label_map, inplace=True)
    table.columns = [f"{label}\n(n={count})" for label, count in zip(table.columns, counts)]


def render_table(table: pd.Series) -> list[list[str | int |float]]:
    table_df = table.to_frame()
    table_df.columns = ['割合']

    # format value
    table_df['割合'] = table_df['割合'].round(1).astype(str) + '%'

    table_df.index.name = '選択肢'
    table_df.reset_index(inplace=True)

    return [table_df.columns.tolist()] + table_df.values.tolist()

def render_table_cross(table: pd.DataFrame, n_table: pd.Series) -> list[list[str | int |float]]:
    """Format a cross table as rows of strings, with a header row of labelled groups."""
    formatted = table.round(1).astype(str) + '%'
    set_columns_label(formatted, n_table)
    # The header starts with an empty cell above the choice column.
    header = [''] + formatted.columns.tolist()
    body = formatted.reset_index().values.tolist()
    return [header] + body


def render_cross_single_graphs(table: pd.DataFrame, n_table, direction: str, start_g_num: int):
    """Render one bar chart per group (column) of a cross table.

    Args:
        table: cross table with one column per group.
        n_table: respondents per group, indexed by group label.
        direction: ``'h'`` for horizontal bars, ``'v'`` for vertical bars.
        start_g_num: start value of the per-group counter (the counter itself is not used).

    Returns:
        The figures in column order.

    Raises:
        ValueError: if ``direction`` is neither ``'h'`` nor ``'v'``.
    """
    figures: list[Figure] = []

    for _, g_label in enumerate(table.columns.values, start_g_num):
        group_values = table.loc[:, g_label]
        group_n = n_table.at[g_label]

        if direction == 'h':
            plot = hbar
        elif direction == 'v':
            plot = vbar
        else:
            raise ValueError(f'Unknown direction: {direction}')
        figures.append(plot(group_values, group_n, bar_label=False))

    return figures

def render_stacked_hbar(table: pd.Series, n: int, title: str = ''):
    """Draw a single 100%-stacked horizontal bar from a percentage Series and return the figure."""
    data = table.to_frame(name='percent')
    data.index.name = 'choice'

    # All rows share one empty y label so that they stack into a single bar.
    single_bar = [''] * len(data)
    stack_options = dict(multiple='stack', shrink=0.8, discrete=True, alpha=1, aspect=16/9)
    grid: FacetGrid = sns.displot(
        data=data, y=single_bar, hue='choice', weights='percent',
        **stack_options,
    )

    figure: Figure = grid.figure
    adjust_figure_for_h_stack(figure, n, title)
    figure.axes[0].margins(y=0.7)
    return figure

def _get_grouped_plot_data(table: pd.DataFrame, n_table: pd.Series) -> pd.DataFrame:
    """Reshape a cross table into long form with columns ``group_by``, ``choice`` and ``percent``."""
    labelled = table.copy()  # keep the caller's table untouched
    set_columns_label(labelled, n_table)
    labelled = labelled.rename_axis(index='choice', columns='group_by')
    long_form = labelled.unstack(fill_value=0).rename('percent')
    return long_form.reset_index()


def render_stacked_hbar_grouped(table: pd.DataFrame, n_table: pd.Series, title: str = ''):
    """Draw one 100%-stacked horizontal bar per group of a cross table and return the figure."""
    plot_data = _get_grouped_plot_data(table, n_table)
    stack_options = dict(multiple='stack', shrink=0.8, discrete=True, alpha=1, aspect=16/9)
    grid: FacetGrid = sns.displot(
        data=plot_data, y='group_by', hue='choice', weights='percent',
        hue_order=table.index.values,  # keep the table's row order in the stack
        **stack_options,
    )

    figure: Figure = grid.figure
    adjust_figure_for_h_stack(figure, None, title)
    return figure


def render_bar_grouped(table: pd.DataFrame, n_table: pd.Series, direction: str, title: str = ''):
    """Draw a grouped bar chart of a cross table and return the figure.

    Args:
        table: cross table (rows: choices, columns: groups).
        n_table: respondents per group.
        direction: ``'v'`` for vertical bars, ``'h'`` for horizontal bars.
        title: figure title passed to the adjust helper.

    Returns:
        The rendered figure. The function previously fell off the end and returned
        ``None``, so callers doing ``figure = render_bar_grouped(...)`` never got a
        figure to save.

    Raises:
        ValueError: if ``direction`` is neither ``'v'`` nor ``'h'``.
    """
    data = _get_grouped_plot_data(table, n_table)

    # Use the dedicated 3-colour palette when there are at most three choices.
    palette = palette_3 if len(table.index) <= 3 else None

    figure: Figure
    if direction == 'v':
        grid: FacetGrid = sns.catplot(data=data, y='percent', x='group_by', hue='choice', kind='bar', orient='v', width=0.5, aspect=16/9, palette=palette)
        figure = grid.figure
        adjust_figure_for_v_grouped(figure, None, title, bar_label=False)
    elif direction == 'h':
        grid: FacetGrid = sns.catplot(data=data, x='percent', y='group_by', hue='choice', kind='bar', orient='h', height=5, aspect=16/9, palette=palette)
        figure = grid.figure
        adjust_figure_for_h_grouped(figure, None, title, bar_label=False)
    else:
        raise ValueError(f'Unknown direction: {direction}')
    # render_bar_labels_for_comparison(grid.figure)

    return figure


# TODO: extract to plot.py
# Based on https://stackoverflow.com/a/39358752/1474113
def render_mpl_table(data: pd.DataFrame, col_width=3.0, row_height=0.625, header_row_height=0.625, font_size=14,
                     header_color=None, header_text_color=None, row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax: plt.Axes | None = None, **kwargs):
    """Render a DataFrame as a matplotlib table on a new figure sized to fit it.

    Args:
        data: frame to render; its columns become column labels and its index row labels.
        col_width: width applied to every cell except those in column -1.
        row_height: height applied to every cell outside row 0.
        header_row_height: height applied to the cells of row 0.
        font_size: fixed font size of the table text.
        header_color: face colour for header cells, or None to leave it unchanged.
        header_text_color: text colour for header cells, or None to leave it unchanged.
        row_colors: face colours cycled over rows with index > 0.
        edge_color: edge colour of every cell.
        bbox: bounding box passed to ``ax.table``.
        header_columns: cells whose column index is below this value are styled as header.
        ax: must be None; an existing Axes is not supported.
        **kwargs: forwarded to ``ax.table``.

    Returns:
        ``(figure, ax)`` of the newly created plot.

    Raises:
        ValueError: if ``ax`` is given.
    """
    if ax is not None:
        raise ValueError('ax is not supported')
    figure, ax = plt.subplots()
    ax.axis('off')

    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, colWidths=[col_width] * data.shape[1], rowLabels=data.index, **kwargs)
    # Disable automatic font sizing so the requested size is used.
    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)
    # Add a cell at row 0, column -1 (presumably the corner above the row labels — confirm).
    mpl_table.add_cell(0, -1, col_width, header_row_height)

    # First pass: apply sizes and colours to every cell; k is the (row, column) key of the cell.
    for k, cell in mpl_table.get_celld().items():
        cell.set_edgecolor(edge_color)

        if k[0] == 0:
            cell.set_height(header_row_height)
        else:
            cell.set_height(row_height)

        # Cells in column -1 keep the width they were created with.
        if k[1] != -1:
            cell.set_width(col_width)

        # NOTE: k[1] would be -1
        if k[0] == 0 or k[1] < header_columns:
            if header_text_color is not None:
                cell.set_text_props(color=header_text_color)
            if header_color is not None:
                cell.set_facecolor(header_color)
        # Rows after row 0 get alternating background colours (this also overrides header_color there).
        if k[0] > 0:
            cell.set_facecolor(row_colors[k[0]%len(row_colors) ])

    # Second pass: collect the largest height per row and width per column
    # (column index shifted by one so that column -1 lands in slot 0).
    row_heights = [0] * (len(data.index) + 1)
    col_widths = [0] * (len(data.columns) + 1)

    for k, cell in mpl_table.get_celld().items():
        row_heights[k[0]] = max(row_heights[k[0]], cell.get_height())
        col_widths[k[1] + 1] = max(col_widths[k[1] + 1], cell.get_width())

    # Size the figure to the sum of the collected cell dimensions.
    figure.set_size_inches(sum(col_widths), sum(row_heights))

    return figure, ax


def render_table_single_mpl(table: pd.Series, n: int):
    """Render a percentage Series as a one-column matplotlib table and return the figure."""
    col_name = f'割合\n(n={n})'
    table_df = table.to_frame(name=col_name)

    # One decimal place plus a percent sign.
    table_df[col_name] = table_df[col_name].round(1).astype(str) + '%'

    table_df.index.name = '選択肢'

    table_style = dict(
        header_columns=0, col_width=0.5, row_height=0.35, font_size=10,
        edge_color='#eee', header_text_color='#777',
    )
    figure, _ax = render_mpl_table(table_df, **table_style)
    return figure


def render_table_cross_mpl(table: pd.DataFrame, n_table: pd.Series):
    """Render a cross table as a matplotlib table with labelled group columns and return the figure."""
    formatted = table.round(1).astype(str) + '%'
    set_columns_label(formatted, n_table)

    table_style = dict(
        header_columns=0, col_width=1.0, row_height=0.35, font_size=10,
        edge_color='#eee', header_text_color='#777',
    )
    figure, _ax = render_mpl_table(formatted, **table_style)
    return figure


def percent(num: int):
    """Format ``num`` rounded to one decimal place, followed by a percent sign."""
    rounded = round(num, 1)
    return '{}%'.format(rounded)

def wari(num: int):
    """Express a percentage as an approximate number of tenths, e.g. 38.2 -> '約4割'."""
    tenths = round(num / 10)
    return '約' + str(tenths) + '割'

def template_1(table: pd.Series, q_num: int, q_text: str, fig_num: str):
    """Write the report paragraph for a scale question whose third choice is the neutral one.

    Args:
        table: percentages indexed by choice, in scale order.
        q_num: question number shown in the text.
        q_text: question text or graph title.
        fig_num: figure number shown in the text.

    Raises:
        ValueError: if the third choice is neither 'どちらでもない' nor '3'.
    """
    ranked = table.sort_values(ascending=False)
    top_labels = ranked.index.values
    top_shares = ranked.values

    labels = table.index.values
    shares = table.values
    # First two and last two choices in scale order are summed as the two ends of the scale.
    first_two_share = shares[0] + shares[1]
    last_two_share = shares[-1] + shares[-2]

    if labels[2] != 'どちらでもない' and labels[2] != '3':
        raise ValueError(f'center choice is not "どちらでもない" nor "3": {labels[2]} ({", ".join(map(str, labels))})')

    sentences = [
        f"問{q_num}は{q_text}についてたずねました。その結果を示したのが図表{fig_num}です。",
        f"{q_text}について、「{top_labels[0]}」（{percent(top_shares[0])}）と回答した割合がもっとも高く、次に「{top_labels[1]}」（{percent(top_shares[1])}）の割合が高かったです。",
        f"「{labels[0]}」（{percent(shares[0])}）と「{labels[1]}」（{percent(shares[1])}）を足すと{wari(first_two_share)}（{percent(first_two_share)}）、",
        f"「{labels[-1]}」（{percent(shares[-1])}）と「{labels[-2]}」（{percent(shares[-2])}）を足すと{wari(last_two_share)}（{percent(last_two_share)}）でした。",
        f"「{labels[2]}」は{percent(shares[2])}でした。",
    ]
    return ''.join(sentences)

def template_1_cross(data: pd.DataFrame, single_table: pd.Series, q_text: str, group_name: str, fig_num: str):
    """Write the report paragraph for the cross tabulation of a scale question.

    Args:
        data: cross table (rows: choices, columns: groups) in percent.
        single_table: overall percentages per choice for the same question.
        q_text: question text or graph title.
        group_name: display name of the grouping.
        fig_num: figure number shown in the text.
    """
    def ranked_groups(choice):
        # Groups ordered by their share of `choice`, highest first: (group labels, shares).
        row = data.loc[choice, :].transpose().sort_values(ascending=False)
        return row.index.values, row.values

    most_choice = single_table.idxmax()
    top_groups, top_shares = ranked_groups(most_choice)

    # Most chosen answer inside the group that picked `most_choice` the least.
    least_group_most_choice = data.loc[:, top_groups[-1]].idxmax()
    alt_groups, alt_shares = ranked_groups(least_group_most_choice)
    # FIXME: the template contradicts itself; needs checking.
    # The template reads "in [category x], '[answer B, the most chosen answer in category x]' was [B's %]%.
    # The share of '[B]' was next highest in ...", but B being the most chosen answer within category x does
    # not mean that category x has the highest share of B among all categories, so "next highest" does not
    # necessarily apply.
    # if a_c[-1] != b_c[0]:
    #     raise AssertionError(f'assert a_c[-1] == b_c[0]: {a_c[-1]} != {b_c[0]}')

    # Most chosen answer overall once the two answers above are set aside.
    c_choice = single_table.drop([most_choice, least_group_most_choice]).idxmax()
    third_groups, third_shares = ranked_groups(c_choice)

    # FIXME: inconsistent spelling of "最も" / "もっとも" in the text below
    sentences = [
        f"{q_text}について、{group_name}別にみたのが図表{fig_num}です。",
        f"{top_groups[0]}で「{most_choice}」の割合が最も高く（{percent(top_shares[0])}）、次に{top_groups[1]}（{percent(top_shares[1])}）、{top_groups[2]}（{percent(top_shares[2])}）で「{most_choice}」の割合が高い結果でした。",
        f"一方、「{most_choice}」の割合がもっとも低かったのは{top_groups[-1]}（{percent(top_shares[-1])}）でしたが、{top_groups[0]}における「{least_group_most_choice}」は{percent(alt_shares[0])}でした。",
        f"「{least_group_most_choice}」の割合が次に高かったのは、{alt_groups[1]}（{percent(alt_shares[1])}）です。",
        f"そのほか、{third_groups[0]}（{percent(third_shares[0])}）や{third_groups[1]}（{percent(third_shares[1])}）では「{c_choice}」の割合が高かったです。",
        f"以上のように、{q_text}否かはカテゴリーによって回答に差があることがわかりました。",
    ]
    return ''.join(sentences)

def template_2(table: pd.Series, q_num: int, q_text: str, fig_num: str, q_description: str):
    """Write the report paragraph naming the three largest classes and then the remaining ones.

    Args:
        table: percentages indexed by class label.
        q_num: question number shown in the text.
        q_text: question text or graph title.
        fig_num: figure number shown in the text.
        q_description: clause inserted before 'のが図表…です'.
    """
    ranked = table.sort_values(ascending=False)
    top_labels = ranked.index.values
    top_shares = ranked.values
    # Everything except the top three, kept in the table's original order.
    others_table = table.drop(top_labels[0:3])
    others_text = '、'.join([f'{index}が{percent(value)}' for index, value in others_table.items()])

    sentences = [
        f"問{q_num}で、{q_text}をうかがいました（必須項目）。{q_description}のが図表{fig_num}です。",
        f"もっとも多かったのは「{top_labels[0]}」で{percent(top_shares[0])}、次に多かったのが「{top_labels[1]}」で{percent(top_shares[1])}、その次に多かったのが「{top_labels[2]}」の{percent(top_shares[2])}でした。",
        f"その他の年齢階級の分布は{others_text}という結果でした。",
    ]
    return ''.join(sentences)

def _template_cross_result_text(data: pd.DataFrame):
    """Summarise, for every group of a cross table, its (up to) two most chosen answers."""
    per_group = data.transpose()
    last_group = per_group.index[-1]

    group_texts: list[str] = []
    for group, shares in per_group.iterrows():
        ranked = shares.sort_values(ascending=False)
        labels = ranked.index.values
        values = ranked.values
        # The last group is joined with a different particle.
        particle = 'は' if group == last_group else 'で'
        top_two = '、'.join([f"「{label}」（{percent(value)}）" for label, value in zip(labels[:2], values[:2])])
        group_texts.append(f"{group}{particle}" + top_two)

    opening = '、'.join(group_texts[0:2])
    middle = '、'.join(group_texts[2:-1])
    return f"その結果、割合が高かった選択肢は、{opening}でした。{middle}の割合が高いことがわかりました。そして、{group_texts[-1]}という結果でした。"

def template_2_cross(data: pd.DataFrame, q_text: str, group_name: str, fig_nums: list[str]):
    """Write the cross-tabulation paragraph that names one figure per group, then the per-group summary."""
    figure_list = '、'.join([f'図表{fig_num}は{cat_name}' for fig_num, cat_name in zip(fig_nums, data.columns)])
    sentences = [
        f"{group_name}別に{q_text}を集計しました。",
        f"{figure_list}における{q_text}です。",
        f"{_template_cross_result_text(data)}",
    ]
    return ''.join(sentences)

def template_3(table: pd.Series, q_num: int, q_text: str):
    """Write the report paragraph for a single-choice question.

    The most chosen answer is named first, followed by the second and third; the
    remaining answers are listed only when there are at least four choices.

    Args:
        table: percentages indexed by choice.
        q_num: question number shown in the text.
        q_text: question text or graph title.
    """
    sorted_table = table.sort_values(ascending=False)
    s_c = sorted_table.index.values
    s_v = sorted_table.values
    # A leftover bare `sorted_table.iat[1]` expression was removed: it had no effect
    # other than raising IndexError for tables with a single choice.

    return (
        f"問{q_num}は{q_text}をたずねました。"
        f"「{s_c[0]}」が{percent(s_v[0])}でもっとも多く、次いで{'、'.join([f'「{index}」が（{percent(value)}）' for index, value in sorted_table.iloc[1:3].items()])}でした。" +
        (f"その後には{'、'.join([f'「{index}」（{percent(value)}）' for index, value in sorted_table.iloc[3:].items()])}が続きました。" if len(sorted_table) >= 4 else '')
    )

def template_3_cross(data: pd.DataFrame, q_text: str, group_name: str):
    """Write the cross-tabulation paragraph for a single-choice question."""
    intro = f"{q_text}の回答を{group_name}別にすると、以下のような結果でした。"
    summary = f"{_template_cross_result_text(data)}"
    return intro + summary

def template_4(table: pd.Series, q_num: int, q_text: str):
    """Write the report paragraph for a multiple-answer question.

    The three most chosen answers are named first. Of the remaining answers, those at
    10% or more get their own sentence, and the closing sentence lists what is left.
    Previously the closing sentence listed every remaining answer, repeating the ones
    already named as exceeding 10%.

    Args:
        table: percentages indexed by choice.
        q_num: question number shown in the text.
        q_text: question text or graph title.
    """
    sorted_table = table.sort_values(ascending=False)
    s_c = sorted_table.index.values
    s_v = sorted_table.values

    rest_table = sorted_table.iloc[3:]
    over_ten = rest_table[rest_table >= 10]
    remaining = rest_table.drop(over_ten.index)
    if remaining.empty:
        # Everything left is at 10% or more: skip the "over 10%" sentence and list
        # all of it once in the closing sentence.
        over_ten = rest_table.iloc[0:0]
        remaining = rest_table

    def listed(items: pd.Series) -> str:
        # '「choice」（x.x%）' entries joined with the Japanese comma.
        return '、'.join([f'「{index}」（{percent(value)}）' for index, value in items.items()])

    return (
        f"問{q_num}は{q_text}をたずねました（複数回答）。"
        f"一番多かったのは「{s_c[0]}」で{percent(s_v[0])}でした。次に多かったのは「{s_c[1]}」で{percent(s_v[1])}、その次に多かったのが「{s_c[2]}」で{percent(s_v[2])}です。" +
        (f"これら以外で10%を超えたのは、{listed(over_ten)}でした。" if not over_ten.empty else '') +
        # Leave the closing sentence out when there is nothing left to list.
        (f"それ以外の選択肢は、{listed(remaining)}が続きました。" if not remaining.empty else '')
    )

def template_4_cross(data: pd.DataFrame, q_text: str, group_name: str, fig_nums: list[str]):
    """Write the cross-tabulation paragraph for a multiple-answer question (one figure per group)."""
    figure_list = '、'.join([f'図表{fig_num}は{g_choice}' for fig_num, g_choice in zip(fig_nums, data.columns.values)])
    sentences = [
        f"{q_text}を{group_name}別にまとめました。",
        f"{figure_list}の分布です。",
        f"{_template_cross_result_text(data)}",
    ]
    return ''.join(sentences)


In [20]:
%matplotlib inline

# # J / N
# table, n = create_table_single(df, 'J')
# figure = render_table_single_mpl(table, n)
# # figure.subplots_adjust()
# figure.savefig('out/tmp.png', bbox_inches='tight', dpi=144)
# # figure.savefig('out/tmp.png', dpi=144)
# figure

# Ad-hoc preview of the percentage table for a single question (column BT).
table, n = create_table_single(df, column='BT')
display(table)


In [6]:
%matplotlib agg

from dataclasses import dataclass
from dataclasses_json import LetterCase, dataclass_json
import shutil
import os
from IPython.core.display import display


# Report layout: rows without a question column are dropped.
report_config_df = pd.read_csv("./config/report_config.csv").dropna(subset=['質問（列）'])

# Split the rows under the '標本の構成' top-level heading from all other rows.
is_sample_profile = report_config_df['報告書の大見出し'] == '標本の構成'

# TODO
report_config_intro_df = report_config_df[is_sample_profile]

report_config_body_df = report_config_df[~is_sample_profile]


def g_fig_num_str(fig_num: int, g_fig_num: int):
    """Build a sub-figure number such as '12-a' (1 -> a, 2 -> b, ...)."""
    sub_letter = chr(ord('a') + g_fig_num - 1)
    return '{}-{}'.format(fig_num, sub_letter)

def save_figure(figure: Figure, figure_num: str, close: bool = True):
    """Save ``figure`` as ``out/full_tabulation/figure-<figure_num>.png``.

    Args:
        figure: figure to save.
        figure_num: figure number used in the file name.
        close: release the figure from pyplot after saving (default). The report loop
            creates one figure per chart; without closing them they all stay
            registered with pyplot for the lifetime of the kernel. Pass ``False`` to
            keep the figure open.

    Returns:
        The image file name (without directory).
    """
    image_name = f"figure-{figure_num}.png"
    figure.savefig(f"out/full_tabulation/{image_name}", format="png", bbox_inches='tight', dpi=144)
    if close:
        plt.close(figure)
    return image_name

def save_json():
    """Serialise the module-level ``records`` list to out/full_tabulation/full_tabulation.json."""
    serialized = FullTabulationRecord.schema().dumps(records, many=True)
    with open('out/full_tabulation/full_tabulation.json', mode='w') as out_file:
        out_file.write(serialized)


# letter_case=CAMEL presumably makes the JSON keys camelCase (e.g. imageName) — confirm against dataclasses_json.
@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FullTabulationFigure:
    """One figure (image and/or table) attached to a report record."""
    # Caption of the figure.
    title: str
    # File name of the saved image, if one was rendered.
    image_name: str | None = None
    # Table rows to show instead of / next to the image, if any.
    table: list[list[str]] | None = None

# letter_case=CAMEL presumably makes the JSON keys camelCase — confirm against dataclasses_json.
@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FullTabulationRecord:
    """One section of the generated report, serialised by save_json()."""
    # Top-level heading of the report.
    h1: str
    # Second-level heading.
    h2: str
    # Section title.
    title: str
    # Generated body text.
    body: str
    # Optional supplementary text for the section.
    description: str | None = None
    # Figures belonging to this section.
    figures: list[FullTabulationFigure] | None = None


# Generate the full tabulation report: one record (text + figure) per configured question,
# followed by one record per requested cross tabulation, then write the JSON and zip the output.
os.makedirs('out/full_tabulation', 0o755, exist_ok=True)

records: list[FullTabulationRecord] = []

# q_num is the 1-based position of the row in the report config. It keeps counting over skipped rows.
for q_num, (_, row) in enumerate(report_config_body_df.iterrows(), 1):
    column = row['質問（列）']
    print(column)

    if column == 'DL':
        print(f'{column}は手動でスキップ')
        continue
    # NOTE(review): this relies on truthiness; a NaN in 'テキスト回答' would also be treated as
    # a text question — confirm the column only contains 0/1.
    if choices_df.loc[column, 'テキスト回答']:
        print(f'{column}はテキスト回答のためスキップ')
        continue

    filtered_df = source_df if column == 'B' or column == 'C' else df # Use non-filtered table for screening questions

    fig_num = f"{q_num}"
    graph_title = row['グラフタイトル：（例）図表12 年齢の分布']
    # Fall back to the question text when no graph title is configured.
    q_txt = graph_title if pd.notna(graph_title) else questions[column]

    table, n = create_table_single(filtered_df, column)

    # The '説明文' config column selects the text template.
    match row['説明文']:
        case 'リッカート尺度':
            body_text = template_1(table, q_num, q_txt, fig_num)
        case 'アフターコーディング系':
            body_text = template_2(table, q_num, q_txt, fig_num, 'TODO:')
        case '単一選択':
            body_text = template_3(table, q_num, q_txt)
        case '複数選択':
            body_text = template_4(table, q_num, q_txt)
        case _:
            raise ValueError(f'Unknown template type: {row["説明文"]}')

    h1 = row['報告書の大見出し']
    h2 = f"{row['報告書の見出し：（例）年齢【問3】']}【問{q_num}】"
    # NOTE(review): graph_title and 'セクション補足文' are passed through as read from the CSV,
    # so empty cells presumably arrive here as NaN rather than None — confirm the JSON consumer copes.
    record = FullTabulationRecord(h1, h2, graph_title, body_text, row['セクション補足文'])
    records.append(record)

    rendered_table = None
    image_name = None
    figure = None

    # The 'グラフの種類' config column selects the chart for the single tabulation.
    match row['グラフの種類']:
        case '表':
            figure = render_table_single_mpl(table, n)
        case '横100':
            figure = render_stacked_hbar(table, n)
        case '円グラフ':
            # Pie charts are rendered as a stacked horizontal bar as well.
            figure = render_stacked_hbar(table, n)
        case '横棒グラフ':
            figure = hbar(table, n, bar_label=False)
        case '縦棒グラフ':
            figure = vbar(table, n, bar_label=False)
        case _:
            # No graph type configured: the record keeps figures=None and the cross
            # tabulations below are skipped as well.
            if pd.isna(row['グラフの種類']):
                continue
            raise ValueError(f'Unknown graph type: {row["グラフの種類"]}')
    if figure:
        image_name = save_figure(figure, fig_num)
    # display(table or figure)

    figure_title = f"図表{fig_num}：{graph_title}"
    record.figures = [FullTabulationFigure(figure_title, image_name=image_name, table=rendered_table)]

    # Screening questions get no cross tabulation.
    if column == 'B' or column == 'C':
        continue

    so = row['性的指向']
    ro = row['恋愛的指向']

    group_cols = []

    # NOTE(review): truthiness checks; a NaN cell would count as "requested" — confirm how
    # empty cells of these two config columns are encoded.
    if so:
        group_cols.append(('AN', 'Aceアイデンティティ'))
    if ro:
        group_cols.append(('Y', 'Aroアイデンティティ'))
    if not group_cols:
        continue

    # Running sub-figure counter (1 -> 'a', 2 -> 'b', ...), shared by all groupings of this question.
    g_fig_num = 1

    for g_col, g_name in group_cols:
        table_cross, n_table_cross = create_table_cross(filtered_df, column, g_col)

        match row['説明文']:
            case 'リッカート尺度':
                body_text = template_1_cross(table_cross, table, q_txt, g_name, g_fig_num_str(fig_num, g_fig_num))
            case 'アフターコーディング系':
                # One figure number per group column.
                num_of_figs = len(table_cross.columns)
                fig_nums = list(map(lambda x: g_fig_num_str(fig_num, x), range(g_fig_num, g_fig_num + num_of_figs)))
                body_text = template_2_cross(table_cross, q_txt, g_name, fig_nums)
            case '単一選択':
                body_text = template_3_cross(table_cross, q_txt, g_name)
            case '複数選択':
                num_of_figs = len(table_cross.columns)
                fig_nums = list(map(lambda x: g_fig_num_str(fig_num, x), range(g_fig_num, g_fig_num + num_of_figs)))
                body_text = template_4_cross(table_cross, q_txt, g_name, fig_nums)
            case _:
                raise ValueError(f'Unknown cross template type: {row["説明文"]}')

        # display(body_text)

        section_title = f"{g_name}別にみた、{h2}"

        record = FullTabulationRecord(h1, h2, section_title, body_text, figures=[])
        records.append(record)

        cross_graph_type = row['クロス集計グラフ種類（空白の場合は左と同じ）']
        # NOTE(review): the column name says "blank means same as left", but a blank value
        # skips the cross figure here — confirm which behaviour is intended.
        if pd.isna(cross_graph_type) or cross_graph_type == 'いらない':
            continue

        rendered_table: list[list[str | int | float]] | None = None
        figure: Figure | None = None
        figures: list[Figure] = []

        match row['クロス集計グラフ種類（空白の場合は左と同じ）']:
            case '表':
                figure = render_table_cross_mpl(table_cross, n_table_cross)
            case '単独-縦棒グラフ':
                figures = render_cross_single_graphs(table_cross, n_table_cross, 'v', 1)
            case '単独-横棒グラフ':
                figures = render_cross_single_graphs(table_cross, n_table_cross, 'h', 1)
            case '縦棒グラフ':
                figure = render_bar_grouped(table_cross, n_table_cross, 'v')
            case '横100':
                figure = render_stacked_hbar_grouped(table_cross, n_table_cross)
            case _:
                raise ValueError(f'Unknown cross graph type: {row["クロス集計グラフ種類（空白の場合は左と同じ）"]}')

        # rendered_table is never assigned a non-None value in this loop, so this branch is currently not taken.
        if rendered_table is not None:
            current_fig_num = g_fig_num_str(fig_num, g_fig_num)
            record.figures.append(FullTabulationFigure(f"図表{current_fig_num}：{g_name}別にみた{graph_title}の分布", table=rendered_table))
            g_fig_num += 1
            # display(rendered_table)
        # Single combined figure for all groups.
        if figure is not None:
            current_fig_num = g_fig_num_str(fig_num, g_fig_num)
            image_name = save_figure(figure, current_fig_num)
            record.figures.append(FullTabulationFigure(f"図表{current_fig_num}：{g_name}別にみた{graph_title}の分布", image_name=image_name))
            g_fig_num += 1
            # display(figure)
        # One figure per group, in the column order of the cross table.
        for i, figure in enumerate(figures):
            current_group = table_cross.columns.values[i]
            current_fig_num = g_fig_num_str(fig_num, g_fig_num)
            image_name = save_figure(figure, current_fig_num)
            record.figures.append(FullTabulationFigure(f"図表{current_fig_num}：{current_group}における{graph_title}の分布", image_name=image_name))
            g_fig_num += 1
            # display(figure)

save_json()

# Bundle out/full_tabulation into out/full_tabulation.zip (the archive path becomes the cell output).
shutil.make_archive('out/full_tabulation', 'zip', 'out', 'full_tabulation')


  from IPython.core.display import display


B
C
D
E


  fig = plt.figure(figsize=figsize)


F
G
H
I
J
K
L


posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values


M
N
O
P


posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and pos

Q
R
S
T
U
V
W
X
Y
Z
AA
AB
AC
ACはテキスト回答のためスキップ
AD
AE
AF
AG
AH
AI
AJ
AK
AL
AM
AN
AO
AP
AQ
AQはテキスト回答のためスキップ
AR
AS
AT
AU
AV
AW
AWはテキスト回答のためスキップ
AX
AY
AZ
BA
BB
BC
BD
BE
BF
BG
BH
BI
BJ
BK
BL
BM
BN
BO
BP
BQ
BR
BS


posx and posy should be finite values
posx and posy should be finite values


BT
BU
BV


posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values


BW
BX
BY
BZ
CA
CB
CC
CD
CDはテキスト回答のためスキップ
CE
CF
CG
CH
CI
CJ
CK
CL
CM
CN
CO
CP
CPはテキスト回答のためスキップ
CQ
CR
CRはテキスト回答のためスキップ
CS
CT
CTはテキスト回答のためスキップ
CU
CV
CW
CX
CY
CZ
DA
DB
DC
DD
DE
DF
DG
DH
DI
DJ
DJはテキスト回答のためスキップ
DK
DKはテキスト回答のためスキップ
DL
DLは手動でスキップ


'/Users/ypresto/repo/github.com-private/ypresto/asloop-survey/out/full_tabulation.zip'

posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values


In [77]:
%matplotlib agg

# Figure I-1: age distribution in the Aro/Ace survey 2022 vs. the census, and the gap between them

census_age_values: pd.Series = pd.read_csv('./config/reference_data/国勢調査令和2年 - 男女，年齢（各歳），国籍総数か日本人別人口－全国，都道府県，21大都市，特別区，人口50万以上の市 - 年齢.csv',
                    index_col='年齢',
                    encoding='CP932',
                    usecols=['年齢', 'value'],
                    na_values=['']).squeeze()
census_age_values_n = census_age_values.sum()
# Strip the '歳' / '以上' suffixes from the age labels and bucket them with the survey's age classes.
# NOTE(review): cut_ages_bin's first class for 'D' covers everything up to 18, so census rows for
# ages below 13 (if the file contains them) are counted under the '13~18歳' label and in the
# denominator — confirm this is intended for the comparison.
census_age_values.index = cut_ages_bin(census_age_values.index.map(lambda s: s.replace('以上', '').replace('歳', '')).astype(int), 'D')
census_age_values = census_age_values.groupby(level=0).sum() * 100 / census_age_values_n

asloop_age_values, asloop_age_values_n = create_table_single(df, 'D')

age_table = pd.concat([asloop_age_values.rename('Aro/Ace調査2022'),census_age_values.rename('国勢調査2020')], axis=1)
# Keys must match the column names of age_table ('国勢調査2020', not '国勢調査'), as in the
# prefecture figure; the mismatched key only worked because counts were paired by position.
age_table_n = pd.Series({'Aro/Ace調査2022': asloop_age_values_n, '国勢調査2020': census_age_values_n})

figure = render_stacked_hbar_grouped(age_table, age_table_n)
save_figure(figure, 'I-1')

# Figure I-2: distribution of residence in the Aro/Ace survey 2022 vs. the census, and the gap between them

census_prefecture_col = '全国，都道府県，市区町村（2000年市区町村含む）'
census_prefecture_values: pd.Series = pd.read_csv(
    './config/reference_data/国勢調査令和2年 - 男女別人口－全国，都道府県，市区町村（2000年（平成12年）市区町村含む） - 都道府県.csv',
    index_col=census_prefecture_col,
    encoding='CP932',
    usecols=[census_prefecture_col, 'value'],
    na_values=[''],
).squeeze()
census_prefecture_values_n = census_prefecture_values.sum()
# Collapse prefectures into the same regional blocks as the survey, then convert to percent.
census_prefecture_values = census_prefecture_values.rename(index=location_map)
census_prefecture_values = census_prefecture_values.groupby(level=0).sum() * 100 / census_prefecture_values_n

asloop_prefecture_values, asloop_prefecture_values_n = create_table_single(df, 'E')

prefecture_table = pd.concat(
    [
        asloop_prefecture_values.rename('Aro/Ace調査2022'),
        census_prefecture_values.rename('国勢調査2020'),
    ],
    axis=1,
)
prefecture_table_n = pd.Series({
    'Aro/Ace調査2022': asloop_prefecture_values_n,
    '国勢調査2020': census_prefecture_values_n,
})

figure = render_stacked_hbar_grouped(prefecture_table, prefecture_table_n)
save_figure(figure, 'I-2')

# Figure I-3: (Aro/Ace survey 2022) whether sex assigned at birth matches the current self-perception (column M)
# Figure I-4: (Aro/Ace survey 2022) distribution of sex assigned at birth (column N)
for q_column, figure_id in (('M', 'I-3'), ('N', 'I-4')):
    table, n = create_table_single(df, q_column)
    figure = render_stacked_hbar(table, n)
    save_figure(figure, figure_id)


# Figure I-5: distribution of gender in the Aro/Ace survey 2022

# Answers of column O that are reported under the two cisgender labels; any other answer is
# reported under '非シスジェンダー'.
cis_label_map = {
    '女性': 'シスジェンダー女性',
    'どちらかといえば女性': 'シスジェンダー女性',
    '男性': 'シスジェンダー男性',
    'どちらかといえば男性': 'シスジェンダー男性',
}
mapped_o_values = df['O'].dropna().map(cis_label_map).fillna('非シスジェンダー')
gender_choices = ['シスジェンダー女性', 'シスジェンダー男性', '非シスジェンダー']
table, n = _create_table_single(mapped_o_values, gender_choices)
figure = render_stacked_hbar(table, n)

save_figure(figure, 'I-5')

# Figure I-6: distribution of sex in the census

census_sex_values: pd.Series = pd.read_csv(
    './config/reference_data/国勢調査令和2年 - 男女別人口－全国，都道府県，市区町村（2000年（平成12年）市区町村含む） - 男女.csv',
    index_col='男女',
    encoding='CP932',
    usecols=['男女', 'value'],
    na_values=[''],
).squeeze()
census_sex_values_n = census_sex_values.sum()
# Use the same wording as the survey labels.
census_sex_values = census_sex_values.rename(index={ '男': '男性', '女': '女性' })

census_sex_percent = census_sex_values / census_sex_values_n * 100
figure = render_stacked_hbar(census_sex_percent, census_sex_values_n)

save_figure(figure, 'I-6')

# Keep the cell from echoing the last expression.
pass
