In [None]:
import numpy as np
import pandas as pd

In [None]:
import yaml

# Parse the path settings file into a plain Python mapping.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used — confirm the YAML decodes correctly on every machine.
with open('../setting/path.yaml') as settings_stream:
    config = yaml.safe_load(settings_stream)

# Inputs: the two quarterly survey CSVs and the Excel column-label list.
pay_amount_list_path = config['pay_amount_list']
survey_path_2024_q2 = config['survey_data_path_2024_q2']
survey_path_2024_q3 = config['survey_data_path_2024_q3']

# Output: destination of the processed expenditure table.
survey_2024_Q2andQ3_expenditure_amount = config['expenditure_amount_processed']

## 個票データの読み込み

In [None]:
# The 43 survey variables loaded from each CSV: F_a, F_b, then the F_c, F_d
# and F_e families (the bare name followed by its numbered sub-items), then
# F_f and F_g.
columns_to_read = (
    ['F_a', 'F_b']
    + ['F_c'] + [f'F_c{n}' for n in range(1, 9)]
    + ['F_d'] + [f'F_d{n}' for n in range(1, 13)]
    + ['F_e'] + [f'F_e{n}' for n in range(1, 17)]
    + ['F_f', 'F_g']
)

# Load the Q2 file, restricted to the columns above, and report its shape.
df_q2 = pd.read_csv(survey_path_2024_q2, usecols=columns_to_read)
print(df_q2.shape)

# Same for the Q3 file.
df_q3 = pd.read_csv(survey_path_2024_q3, usecols=columns_to_read)
print(df_q3.shape)

In [None]:
# Preview the first rows of the Q2 frame to check the loaded columns.
df_q2.head()

In [None]:
# Both quarterly frames must carry identical column labels, in the same
# order, because they are stacked row-wise in a later cell.
assert df_q2.columns.equals(df_q3.columns), '2つの変数名が一致していません。'

## カラム名の変更

In [None]:
# Read the Excel lookup table; the renaming helper defined below reads its
# '変数名' (variable name) and 'ラベル名' (label) columns.
pay_amount_list = pd.read_excel(pay_amount_list_path)
pay_amount_list.shape

In [None]:
# Preview the first rows of the lookup table.
pay_amount_list.head()

In [None]:
def rename_columns(df, column_list):
    """Rename the columns of ``df`` from variable names to label names.

    Args:
        df: DataFrame whose column labels are to be replaced.
        column_list: Table with a '変数名' column (current variable names)
            and a 'ラベル名' column (the labels to rename them to), paired
            row by row.

    Returns:
        A renamed copy of ``df``; the frame passed in is left untouched.

    Raises:
        AssertionError: If the resulting column labels are not exactly the
            set of labels listed in ``column_list['ラベル名']``.
    """
    variable_names = column_list['変数名']
    label_names = column_list['ラベル名']

    # Row-wise pairing: variable name -> label.
    name_to_label = {variable: label for variable, label in zip(variable_names, label_names)}
    renamed = df.rename(columns=name_to_label)

    # Every column must have been mapped, and every label must be in use.
    assert set(label_names) == set(renamed.columns), "カラム名が正しく変更されていません"

    return renamed

In [None]:
# Apply the same variable-name -> label mapping to both quarterly frames.
df_q2 = rename_columns(df_q2, pay_amount_list)
df_q3 = rename_columns(df_q3, pay_amount_list)

## 主キーの設定

In [None]:
# Stack the Q2 rows above the Q3 rows (row-wise is concat's default axis)
# and renumber the index from 0.
df = pd.concat([df_q2, df_q3], ignore_index=True)
df.shape

In [None]:
def create_primary_key(df):
    """Add a sequential string key column named 'userid' to ``df``.

    Keys are the row positions counted from 1, zero-padded to at least three
    digits ('001', '002', ...). Three is only a minimum width: from the
    1000th row on, the keys become longer ('1000', '1001', ...).

    Args:
        df: DataFrame to receive the key. It is modified in place.

    Returns:
        The same ``df`` object, now carrying the 'userid' column.

    Raises:
        AssertionError: If the number of distinct keys differs from the
            number of rows.
    """
    row_count = len(df)
    df['userid'] = [str(position).zfill(3) for position in range(1, row_count + 1)]

    # One distinct key per row.
    assert len(df) == df['userid'].nunique(), "主キーが正しく設定されていません。"

    return df

In [None]:
# Attach the sequential 'userid' key, then show the frame shape and the
# number of distinct keys.
df = create_primary_key(df)
print(df.shape)
df['userid'].nunique()

In [None]:
# List the column labels after renaming and key creation.
df.columns

## 支出金額の加工

In [None]:
# Count missing values per column before the amounts are cleaned.
df.isna().sum()

In [None]:
# Inspect the column dtypes before the integer cast in the next cell.
df.dtypes

In [None]:
# Every column except the 'userid' key is treated as an expenditure amount.
expenditure_columns = [col for col in df.columns if col != 'userid']

# Map both the -99 code and missing values (NaN) to 0.
# (-99 is presumably a "no answer" marker — confirm against the survey codebook.)
# The NaN fill is required: casting a column that still contains NaN to int raises.
df[expenditure_columns] = df[expenditure_columns].replace(-99, 0).fillna(0)

# Cast the amount columns from float to int.
df[expenditure_columns] = df[expenditure_columns].astype(int)

assert not (df[expenditure_columns] == -99).any().any(), "Error: -99 still exists in the dataframe."

In [None]:
# Re-check the column dtypes after the cast to int.
df.dtypes

In [None]:
# Put the 'userid' key first, followed by the expenditure columns in their
# existing order.
columns_order = ['userid', *expenditure_columns]
df = df[columns_order]
assert df.columns[0] == 'userid', "Error: 'userid' is not the first column."

## データを出力

In [None]:
# Write the processed table to the configured output path, omitting the
# row index.
df.to_excel(survey_2024_Q2andQ3_expenditure_amount,
            index=False)