# データを成形し、BigQueryにアップロードします。

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [3]:
# Columns kept from each raw monthly CSV, in output order.
select_columns = [
    "日付",
    "取引数",
    "売上高(税抜)",
    "現計(取引数)",
    "現計(税抜金額)",
    "信計(取引数)",
    "信計(取引数内訳)",
    "信計(税抜金額)",
    "信計(税抜金額内訳)",
    "客数",
    "客単価(税抜)",
]

# Substrings removed from the raw date text: the seven Japanese weekday
# characters followed by the opening and closing parentheses.
day_of_week_ja = list("月火水木金土日") + ["(", ")"]

In [4]:
def molding_dataset(df, select_columns):
    """Select the wanted columns, rename them to English, drop the last row.

    Args:
        df: Raw DataFrame loaded from one CSV file.
        select_columns: Column labels to keep; each must exist in ``df``.

    Returns:
        A new, independent DataFrame holding the selected columns (renamed
        where a mapping exists below) and every row except the final one.

    Raises:
        KeyError: If any label in ``select_columns`` is missing from ``df``.
    """
    column_names = {
        '日付': 'date',
        '取引数': 'transactions',
        '売上高(税抜)': 'sales',
        '現計(取引数)': 'sales_transactions',
        '現計(税抜金額)': 'cash_sales',
        '信計(取引数)': 'cashless_transactions',
        '信計(取引数内訳)': 'cashless_kind',
        '信計(税抜金額)': 'cashless_sales',
        '信計(税抜金額内訳)': 'cashless_sales_kind',
        '客数': 'customers',
        '客単価(税抜)': 'Customer_unit_price',
    }
    renamed = df[select_columns].rename(columns=column_names)

    # The last line of the file holds the total value, not a daily record.
    # Return a copy rather than a slice so that callers can assign columns
    # on the result without triggering SettingWithCopyWarning.
    return renamed.iloc[:-1].copy()

def reshape_date(df, day_of_week_ja, datetime):
    """Strip weekday markers from ``df['date']`` and prefix the year.

    Args:
        df: DataFrame with a string ``date`` column. It is modified in place.
        day_of_week_ja: Substrings to delete from every date value. Each one
            is matched literally, not as a regular expression.
        datetime: Date/datetime object whose four-digit year (``%Y``) is
            prepended. NOTE: this parameter name shadows the ``datetime``
            class inside the function; it is kept for caller compatibility.

    Returns:
        The same ``df`` object, with ``date`` rewritten as
        ``"<YYYY>/<cleaned text>"``.
    """
    cleaned = df['date']
    for token in day_of_week_ja:
        # Literal replacement: "(" and ")" are not valid regular expressions
        # on their own, so regex=True raises re.error on pandas >= 2.0.
        cleaned = cleaned.str.replace(token, '', regex=False)

    df['date'] = datetime.strftime('%Y') + '/' + cleaned.astype(str)

    return df

def read_csv(select_columns, day_of_week_ja):
    """Load every monthly sales CSV since January 2021 and combine them.

    One Shift_JIS-encoded file per month is read from
    ``../data/sales/bregister_daily_<YYYYMM>.csv``, starting at 2021-01 and
    stopping before the current month. Each file is passed through
    ``molding_dataset`` and ``reshape_date``; the monthly results are then
    concatenated.

    Args:
        select_columns: Column labels forwarded to ``molding_dataset``.
        day_of_week_ja: Substrings forwarded to ``reshape_date``.

    Returns:
        The combined DataFrame with a fresh 0..n-1 index and an additional
        ``store_horiday`` column that is 1 where ``transactions`` equals 0
        and 0 otherwise.

    Raises:
        ValueError: If no month lies between the start month and today, so
            there is nothing to load.
    """
    csvfile_path = "../data/sales/bregister_daily_{}.csv"

    first_month = datetime(2021, 1, 1)
    today_datetime = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)

    # Number of whole months elapsed between the start date and today.
    month_diff = (
        (today_datetime.month - first_month.month)
        + (today_datetime.year - first_month.year) * 12
    )
    if today_datetime.day < first_month.day:
        month_diff -= 1

    if month_diff <= 0:
        # Previously this case fell through to an UnboundLocalError.
        raise ValueError(
            "no monthly CSV files to load between {:%Y-%m} and {:%Y-%m}".format(
                first_month, today_datetime
            )
        )

    monthly_frames = []
    current_month = first_month
    for _ in range(month_diff):
        df = pd.read_csv(
            csvfile_path.format(current_month.strftime('%Y%m')),
            encoding='shift_jis',
        )
        df_m = molding_dataset(df, select_columns)
        monthly_frames.append(reshape_date(df_m, day_of_week_ja, current_month))
        current_month += relativedelta(months=1)

    # Concatenate once at the end instead of re-copying the accumulated
    # frame on every iteration.
    df_csv = pd.concat(monthly_frames, axis=0, ignore_index=True)

    # Days with zero transactions are flagged as regular holidays (1), else 0.
    df_csv['store_horiday'] = (df_csv['transactions'] == 0) * 1

    return df_csv


In [5]:
# Build the combined sales DataFrame from the monthly CSV files.
df_data = read_csv(select_columns, day_of_week_ja)

In [6]:
# Save the combined data as CSV; index=False leaves the row index out of the file.
df_data.to_csv("../data/reshape_dataset/sales_data.csv", index = False)

In [7]:
# Bare expression: shows the combined DataFrame as the notebook cell output.
df_data

Unnamed: 0,date,transactions,sales,sales_transactions,cash_sales,cashless_transactions,cashless_kind,cashless_sales,cashless_sales_kind,customers,Customer_unit_price,store_horiday
0,2021/01/1,70,58340,70,58340,0,,0,,70,833,0
1,2021/01/2,80,73298,80,73298,0,,0,,80,916,0
2,2021/01/3,90,85452,90,85452,0,,0,,90,949,0
3,2021/01/4,77,63312,77,63312,0,,0,,77,822,0
4,2021/01/5,85,72448,85,72448,0,,0,,85,852,0
...,...,...,...,...,...,...,...,...,...,...,...,...
725,2022/12/27,54,49091,34,29078,20,iD(4)/JCB(1)/交通系IC(10)/Master(3)/選択なし(1)/QUICP...,20454,iD(3414)/JCB(1229)/交通系IC(7950)/Master(5153)/選択...,54,909,0
726,2022/12/28,57,58766,38,37156,19,交通系IC(7)/Master(8)/iD(2)/JCB(2),21609,交通系IC(6311)/Master(12421)/iD(1756)/JCB(1121),57,1031,0
727,2022/12/29,55,70233,33,40774,22,iD(3)/Master(9)/交通系IC(8)/JCB(2),29460,iD(1755)/Master(16731)/交通系IC(9427)/JCB(1547),55,1277,0
728,2022/12/30,49,65897,32,38654,17,Master(11)/iD(4)/JCB(1)/交通系IC(1),27244,Master(22611)/iD(3139)/JCB(540)/交通系IC(954),49,1345,0
