### データインポート

In [1]:

import pandas as pd
from logic.factory_manage.sql import load_data_from_sqlite

# CSVファイル読み込み（パスは適宜変更）
path = "/work/app/data/factory_manage/weight_data.db"
df = load_data_from_sqlite()
df["伝票日付"].max()

Timestamp('2025-06-19 00:00:00')

In [2]:
from utils.get_holydays import get_japanese_holidays
hol_max = df["伝票日付"].max()
hol_min = df["伝票日付"].min()
holiday = get_japanese_holidays(hol_min, hol_max)
holiday[:5]

['2024-05-03', '2024-05-04', '2024-05-05', '2024-05-06', '2024-07-15']

In [3]:
from logic.factory_manage.predict_model_v3 import generate_features, train_and_predict
df_feat, df_pivot = generate_features(df,holidays=holiday)

### 旧予測モデル

In [None]:
from logic.factory_manage.predict_model_ver2 import train_and_predict_with_holiday
import pandas as pd
from datetime import timedelta


In [None]:

roop_start_date = pd.to_datetime("2025-03-01")
roop_end_date = pd.to_datetime("2025-05-31")

results = []

for current_start in pd.date_range(start=roop_start_date, end=roop_end_date):
    start_dt = current_start
    end_dt = start_dt  # 同じ日に設定

    filtered_df = df[df['伝票日付'] < start_dt]

    if filtered_df.empty:
        print(f"Warning: No training data available before {start_dt.strftime('%Y-%m-%d')}, skipping.")
        continue

    print(f"date {filtered_df['伝票日付'].min()} to {filtered_df['伝票日付'].max()} with {len(filtered_df)} records")
    print(f"predicting for {start_dt.strftime('%Y-%m-%d')}")

    pred_df = train_and_predict_with_holiday(
        filtered_df,
        start_dt.strftime("%Y-%m-%d"),
        end_dt.strftime("%Y-%m-%d"),
        holiday
    )

    # 予測結果に予測対象日カラムを追加
    pred_df['予測対象日'] = start_dt.strftime('%Y-%m-%d')

    results.append(pred_df)

if results:
    all_results_df = pd.concat(results, ignore_index=True)
    all_results_df.to_csv("predictions_.csv", index=False)
    print("Prediction results saved to predictions_202505.csv")
else:
    print("No prediction results to save.")


### 真予測モデル

In [None]:
from logic.factory_manage.predict_model_ver2 import train_and_predict_fix
import pandas as pd
from datetime import timedelta


In [None]:

roop_start_date = pd.to_datetime("2025-03-01")
roop_end_date = pd.to_datetime("2025-05-31")

results = []

for current_start in pd.date_range(start=roop_start_date, end=roop_end_date):
    start_dt = current_start
    end_dt = start_dt  # 同じ日に設定

    filtered_df = df[df['伝票日付'] < start_dt]

    if filtered_df.empty:
        print(f"Warning: No training data available before {start_dt.strftime('%Y-%m-%d')}, skipping.")
        continue

    print(f"date {filtered_df['伝票日付'].min()} to {filtered_df['伝票日付'].max()} with {len(filtered_df)} records")
    print(f"predicting for {start_dt.strftime('%Y-%m-%d')}")

    pred_df = train_and_predict_fix(
        filtered_df,
        start_dt.strftime("%Y-%m-%d"),
        end_dt.strftime("%Y-%m-%d"),
        holiday
    )

    # 予測結果に予測対象日カラムを追加
    pred_df['予測対象日'] = start_dt.strftime('%Y-%m-%d')

    results.append(pred_df)

if results:
    all_results_df = pd.concat(results, ignore_index=True)
    all_results_df.to_csv("predictions_.csv", index=False)
    print("Prediction results saved to predictions_202505.csv")
else:
    print("No prediction results to save.")


## モデル改良版

In [None]:
from logic.factory_manage.predict_model_v3 import generate_features, train_and_predict
import pandas as pd
from datetime import timedelta


In [None]:

roop_start_date = pd.to_datetime("2025-03-01")
roop_end_date = pd.to_datetime("2025-05-31")

results = []

for current_start in pd.date_range(start=roop_start_date, end=roop_end_date):
    start_dt = current_start
    end_dt = start_dt  # 同じ日に設定

    filtered_df = df[df['伝票日付'] < start_dt]

    if filtered_df.empty:
        print(f"Warning: No training data available before {start_dt.strftime('%Y-%m-%d')}, skipping.")
        continue

    print(f"date {filtered_df['伝票日付'].min()} to {filtered_df['伝票日付'].max()} with {len(filtered_df)} records")
    print(f"predicting for {start_dt.strftime('%Y-%m-%d')}")

    pred_df = train_and_predict(
        df_feat,
        df_pivot,
        # filtered_df,
        start_dt.strftime("%Y-%m-%d"),
        end_dt.strftime("%Y-%m-%d"),
        holidays=holiday,
    )

    # 予測結果に予測対象日カラムを追加
    pred_df['予測対象日'] = start_dt.strftime('%Y-%m-%d')

    results.append(pred_df)

if results:
    all_results_df = pd.concat(results, ignore_index=True)
    all_results_df.to_csv("predictions_.csv", index=False)
    print("Prediction results saved to predictions_202505.csv")
else:
    print("No prediction results to save.")


## モデル改良版/ステージ１/リークなし

In [31]:

from logic.factory_manage.predict_model_v4 import generate_features, run_fully_walkforward_pipeline
import pandas as pd
from datetime import timedelta

df_feat, df_pivot = generate_features(df,holidays=holiday)

# print(f"date {df['伝票日付'].min()} to {df['伝票日付'].max()} with {len(df)} records")
# print(f"predicting for {start_dt.strftime('%Y-%m-%d')}")

pred_df = run_fully_walkforward_pipeline(
    df_feat,
    df_pivot,
)



🕒 Processing 2024-05-13 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-14 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-15 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-16 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-17 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-18 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-19 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-20 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-21 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-22 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-23 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-24 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing 2024-05-25 ...
  📦 Training for 混合廃棄物A
  📦 Training for 混合廃棄物B
🕒 Processing

## モデル改良版/ステージ2も/リークなし

In [4]:
from logic.factory_manage.predict_model_v4_3 import full_walkforward_with_debug

In [5]:

# df_feat, df_pivot = generate_features(df,holidays=holiday)

# print(f"date {df['伝票日付'].min()} to {df['伝票日付'].max()} with {len(df)} records")
# print(f"predicting for {start_dt.strftime('%Y-%m-%d')}")

pred_df = full_walkforward_with_debug(
    df,
    holiday,
)



===== 2024-05-10T00:00:00.000000000 処理中 =====
  ステージ1履歴: 2024-05-08 ～ 2024-05-09
    混合廃棄物A 予測値: nan kg
    混合廃棄物B 予測値: nan kg

===== 2024-05-11T00:00:00.000000000 処理中 =====
  ステージ1履歴: 2024-05-08 ～ 2024-05-10
    混合廃棄物A 予測値: nan kg
    混合廃棄物B 予測値: nan kg

===== 2024-05-12T00:00:00.000000000 処理中 =====
  ステージ1履歴: 2024-05-08 ～ 2024-05-11
    混合廃棄物A 予測値: nan kg
    混合廃棄物B 予測値: nan kg

===== 2024-05-13T00:00:00.000000000 処理中 =====
  ステージ1履歴: 2024-05-08 ～ 2024-05-12
    混合廃棄物A 予測値: nan kg
    混合廃棄物B 予測値: nan kg

===== 2024-05-14T00:00:00.000000000 処理中 =====
  ステージ1履歴: 2024-05-08 ～ 2024-05-13
    混合廃棄物A 予測値: nan kg
    混合廃棄物B 予測値: nan kg

===== 2024-05-15T00:00:00.000000000 処理中 =====
  ステージ1履歴: 2024-05-08 ～ 2024-05-14
    混合廃棄物A 予測値: nan kg
    混合廃棄物B 予測値: nan kg

===== 2024-05-16T00:00:00.000000000 処理中 =====
  ステージ1履歴: 2024-05-08 ～ 2024-05-15
    混合廃棄物A 予測値: nan kg
    混合廃棄物B 予測値: nan kg

===== 2024-05-17T00:00:00.000000000 処理中 =====
  ステージ1履歴: 2024-05-08 ～ 2024-05-16
    混合廃棄物A 予測値: nan kg

## モデル改良版/リークなし/データ強化版

### データ作成・全体


In [15]:
import pandas as pd
from logic.factory_manage.sql import load_data_from_sqlite

# Read the CSV file
path_2020 = "/work/app/data/input/2020顧客.csv"
path_2021 = "/work/app/data/input/2021顧客.csv"
path_2022 = "/work/app/data/input/2022顧客.csv"
path_2023 = "/work/app/data/input/2023_all.csv"
path_2024 = "/work/app/data/input/20240501-20250422.csv"


# CSV読み込み
df_2021 = pd.read_csv(path_2021, encoding="utf-8")
df_2022 = pd.read_csv(path_2022, encoding="utf-8")
df_2023 = pd.read_csv(path_2023, encoding="utf-8")
df_2024 = pd.read_csv(path_2024, encoding="utf-8")

  df_2023 = pd.read_csv(path_2023, encoding="utf-8")


### DFの特徴量３つ

In [22]:
import re
# CSV読み込み
df_2021 = pd.read_csv(path_2021, encoding="utf-8")[["伝票日付", "正味重量", "商品"]]
df_2022 = pd.read_csv(path_2022, encoding="utf-8")[["伝票日付", "正味重量", "商品"]]
df_2023 = pd.read_csv(path_2023, encoding="utf-8")[["伝票日付", "正味重量", "商品"]]
df_2024 = pd.read_csv(path_2024, encoding="utf-8")[["伝票日付", "正味重量", "品名"]]


# カラム名統一
df_2021 = df_2021.rename(columns={"商品": "品名"})
df_2022 = df_2022.rename(columns={"商品": "品名"})
df_2023 = df_2023.rename(columns={"商品": "品名"})


# df_2024は修正
def remove_parenthesis_weekday(date_str):
    # 「(」から「)」までの部分を削除
    return re.sub(r"\([^)]+\)", "", date_str)

df_2024["伝票日付"]= df_2024["伝票日付"].apply(remove_parenthesis_weekday)
df_2024.head()

# 結合
df_all = pd.concat([df_2021, df_2022, df_2023, df_2024], ignore_index=True)

# 日付をdatetime型に変換
df_all["伝票日付"] = pd.to_datetime(df_all["伝票日付"], errors='coerce')
df_all["正味重量"] = pd.to_numeric(df_all["正味重量"], errors='coerce')
df_all = df_all.dropna()
print(df_all["伝票日付"].min(), df_all["伝票日付"].max())

  df_2023 = pd.read_csv(path_2023, encoding="utf-8")[["伝票日付", "正味重量", "商品"]]


2021-01-04 00:00:00 2025-05-26 00:00:00


In [23]:
holiday_all = get_japanese_holidays(df_all["伝票日付"].min(), df_all["伝票日付"].max())

### 正規化

In [None]:
import pandas as pd
import unicodedata
import re

# 揺れ吸収マスタ辞書（今後も随時拡張可）
replace_dict = {
    'ダンボ-ル': 'ダンボール',
    'スチ-ル': 'スチール',
    'トナー他': 'トナー',
    'GC軽鉄・スチ-ル類.': 'GC軽鉄・スチール類',
    'GC軽鉄・スチ-ル類': 'GC軽鉄・スチール類',
    'GC軽鉄・スチ-ル': 'GC軽鉄・スチール',
    'GAH鋼・鉄筋等': 'GA H鋼・鉄筋等',
    'フロン行程表': 'フロン行程管理票',
    'コンクリ-ト': 'コンクリート',
    'アスガラ': 'アスファルトがら',
}

# 半角カナ→全角カナ置換辞書
kana_replace_dict = {
    'ｿﾌｧｰ': 'ソファー',
    'ｶﾞﾗｽ': 'ガラス',
    'ﾋﾞﾝ': 'ビン',
    'ｶﾝ': 'カン',
    'ｽﾁｰﾙ': 'スチール',
}

def normalize_item_name(name):
    # ① 全角→半角正規化
    name = unicodedata.normalize("NFKC", name)
    
    # ② 中黒統一
    name = name.replace('･', '・')
    
    # ③ 半角カナ置換
    for key, val in kana_replace_dict.items():
        name = name.replace(key, val)
    
    # ④ 辞書置換
    for key, val in replace_dict.items():
        name = name.replace(key, val)
    
    # ⑤ 空白除去
    name = re.sub(r'\s+', '', name)
    
    # ⑥ 長音記号統一
    name = name.replace('-', 'ー')
    
    # ⑦ 特定品目専用：GC軽鉄・スチール類の枝番吸収（念のため残す）
    name = re.sub(r'GC軽鉄・スチール類\(.*?\)', 'GC軽鉄・スチール類', name)

    # ⑧ 汎用括弧吸収（全品目対象：混合廃棄物A等も吸収）
    name = re.sub(r'\(.*?\)', '', name)

    # ⑨ 末尾のピリオド・空白吸収
    name = re.sub(r'[\s\.]+$', '', name)

    return name

# 一括適用
df_all["品名"] = df_all["品名"].astype(str).apply(normalize_item_name)
print(list(df_all.columns))

In [25]:

from logic.factory_manage.predict_model_v4 import generate_features, run_fully_walkforward_pipeline
import pandas as pd
from datetime import timedelta

df_feat, df_pivot = generate_features(df_all,holidays=holiday_all)

print(f"date {df_all['伝票日付'].min()} to {df_all['伝票日付'].max()} with {len(df_all)} records")


date 2021-01-04 00:00:00 to 2025-05-26 00:00:00 with 178704 records


### 長期データにて予測

In [None]:
pred_df = run_fully_walkforward_pipeline(
    df_feat,
    df_pivot,
)

### 最適な履歴長さ

In [26]:
from logic.factory_manage.predict_model_v4 import find_optimal_history, train_final_model

In [27]:
df_feat, df_pivot = generate_features(df_all, holiday_all)

In [28]:
# 試したい履歴長リスト（例）
history_windows = [100, 200, 300, 400, 500, 700, 1000]

# 最適履歴長を探索
best_window = find_optimal_history(df_feat, df_pivot, history_windows)



===== 検証: 最新 100日履歴 =====
検証 R²=0.550, MAE=14,800kg

===== 検証: 最新 200日履歴 =====


KeyboardInterrupt: 

### 1日分将来予測

In [29]:
from logic.factory_manage.predict_model_v5 import walkforward_one_day_evaluation


In [30]:

# 試したい履歴長リスト（例）
history_windows = [100, 200, 300, 400, 500, 700, 1000]
for num in history_windows:
    print(f"Testing history window: {num}")
    r2, mae = walkforward_one_day_evaluation(df_feat, df_pivot, num)
    print(f"R2: {r2}, MAE: {mae}")


# r2, mae = walkforward_one_day_full_stage2(df_feat, df_pivot, history_window)


Testing history window: 100

===== 各日の予測結果 =====

===== ウォークフォワード1日先評価 結果 =====
R² = -0.629, MAE = 33,058kg
R2: -0.6293537956490203, MAE: 33057.94474432485
Testing history window: 200

===== 各日の予測結果 =====


KeyboardInterrupt: 

### 特徴量追加/1日予測

In [9]:
from logic.factory_manage.predict_model_v6 import yobidasi, generate_features

In [12]:
df_feat, df_pivot = generate_features(df_all, holidays=holiday_all)
# 試したい履歴長リスト（例）
print(list(df_pivot.columns))


NameError: name 'normalize_item_name' is not defined

In [None]:
history_windows = [100, 200, 300, 400, 500, 700, 1000]
for num in history_windows:
    print(f"Testing history window: {num}")
    r2, mae = yobidasi(df_feat, df_pivot, num)
    print(f"R2: {r2}, MAE: {mae}")


# history_date = 1000

# r2, mae = walkforward_2(df_feat, df_pivot, history_date)

Testing history window: 100

===== 各日の予測結果 =====


KeyError: 'GC軽鉄・スチール類'