## 1 (改訂 v2). ライブラリのインポートと設定 (最大量取得・プログレスバー対応)
データ取得上限を増やし、プログレスバー表示のための tqdm をインポートします。
tqdm が未インストールの場合: !pip install tqdm
`start_date_str` で取得を開始したい最も古い日付を指定します。
`max_total_data` で取得するデータ件数の大まかな上限を設定できます（メモリ保護のため）。


In [None]:
import pybotters
import pandas as pd
import asyncio
import time
from datetime import datetime, timedelta, timezone
import nest_asyncio
from tqdm.notebook import tqdm # Jupyter Notebook 用の tqdm をインポート

# --- Settings ---
# Instrument, kline interval and market category sent to the Bybit kline endpoint.
# The interval is used as minutes elsewhere in this notebook (see the `m` suffix below).
target_symbol = 'BTCUSDT'
interval = '5'
category = 'linear'
# Number of klines requested per API call.
limit = 1000
# Oldest date we want to reach. Deliberately far in the past so the download
# walks back until the exchange has no more history.
start_date_str = '2000-01-01'
# Rough upper bound on the number of rows to download (memory protection).
# None means "no cap": keep fetching for as long as the API returns data.
# Set an integer (e.g. 1000) to stop early.
max_total_data = None
# CSV file name used when the downloaded data is saved.
output_filename_full = target_symbol + '_' + interval + 'm_data_max.csv'
# --- End of settings ---

base_url = 'https://api.bybit.com'
# Midnight UTC of start_date_str expressed as epoch milliseconds.
start_timestamp_ms = int(
    datetime.strptime(start_date_str, '%Y-%m-%d')
    .replace(tzinfo=timezone.utc)
    .timestamp()
    * 1000
)


## 2 (改訂 v2). データ取得・整形関数の定義 (最大量取得・プログレスバー版)
指定した開始日まで遡ってデータを取得するよう修正した関数。
tqdm を組み込み、進捗を表示します。

In [None]:
def _kline_rows_to_dataframe(rows, start_ts_ms):
    """Turn raw kline rows into a de-duplicated, time-sorted OHLCV DataFrame.

    Args:
        rows: List of 7-element rows
            ``[timestamp_ms, open, high, low, close, volume, turnover]``.
        start_ts_ms: Rows older than this epoch-millisecond timestamp are dropped.

    Returns:
        DataFrame indexed by a UTC ``timestamp`` with numeric OHLCV columns,
        sorted ascending.
    """
    numeric_cols = ['open', 'high', 'low', 'close', 'volume', 'turnover']
    df = pd.DataFrame(rows, columns=['timestamp'] + numeric_cols)
    # Cast the timestamp to a number before parsing: combining string input with
    # `unit=` makes pandas emit a deprecation warning.
    df['timestamp'] = pd.to_datetime(pd.to_numeric(df['timestamp']), unit='ms', utc=True)
    df = df.set_index('timestamp')
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
    df = df[~df.index.duplicated(keep='first')]
    df = df.sort_index(ascending=True)
    return df[df.index >= pd.Timestamp(start_ts_ms, unit='ms', tz='UTC')]


async def fetch_bybit_kline_full_tqdm(symbol, interval, category, limit, start_ts_ms, max_data=None):
    """Download Bybit V5 klines backwards in time and return them as a DataFrame.

    Requests start at the current time and page backwards (each request ends
    just before the oldest candle of the previous batch) until one of:
    the start timestamp is reached, a batch shorter than ``limit`` arrives,
    ``max_data`` rows were fetched, the API reports an error / empty result,
    or too many consecutive requests fail. A tqdm bar shows the progress.

    Args:
        symbol: Instrument symbol sent as the ``symbol`` query parameter.
        interval: Kline interval sent as the ``interval`` query parameter.
        category: Product category sent as the ``category`` query parameter.
        limit: Number of rows requested per call.
        start_ts_ms: Oldest wanted timestamp in epoch milliseconds.
        max_data: Optional rough cap on the number of fetched rows. It is
            checked after each batch, so the result may overshoot by < ``limit``.

    Returns:
        DataFrame indexed by UTC timestamp (ascending) with the columns
        open/high/low/close/volume/turnover, or an empty DataFrame when
        nothing could be fetched.
    """
    endpoint = '/v5/market/kline'
    # Stop retrying after this many failures in a row so a persistent outage
    # cannot keep the loop spinning forever.
    max_consecutive_errors = 5

    start_dt = pd.Timestamp(start_ts_ms, unit='ms', tz='UTC')

    all_data_list = []
    current_end_time = int(time.time() * 1000)
    total_fetched = 0
    request_count = 0
    consecutive_errors = 0

    # Rough estimate of the number of requests, only used as the progress-bar total.
    estimated_total_requests = None
    if max_data is not None:
        estimated_total_requests = (max_data // limit) + 1
    else:
        try:
            duration_days = (pd.Timestamp.now(tz='UTC') - start_dt).days
            # candles in the period / rows per request, plus some slack
            estimated_total_requests = (duration_days * 24 * 60 // int(interval) // limit) + 5
            print(f"推定総リクエスト回数 (目安): {estimated_total_requests}")
        except (ValueError, ZeroDivisionError):
            # Non-numeric interval or zero interval/limit: run the bar without a total.
            estimated_total_requests = None

    print(f"データ取得を開始します (開始希望日: {start_dt.strftime('%Y-%m-%d')})...")

    pbar = tqdm(total=estimated_total_requests, desc=f"Fetching {symbol}", unit="req")
    try:
        # The context manager closes the underlying HTTP session on every exit path.
        async with pybotters.Client(apis={}, base_url=base_url) as client:
            while True:
                request_count += 1
                params = {
                    'category': category,
                    'symbol': symbol,
                    'interval': interval,
                    'limit': limit,
                    'end': current_end_time,
                }
                try:
                    resp = await client.get(endpoint, params=params)
                    data = await resp.json()

                    if data['retCode'] == 0 and data['result'] and data['result']['list']:
                        kline_list = data['result']['list']
                        fetched_count = len(kline_list)
                        total_fetched += fetched_count
                        # The last row of a batch is treated as its oldest candle.
                        oldest_timestamp_in_batch = int(kline_list[-1][0])
                        consecutive_errors = 0

                        oldest_day = datetime.fromtimestamp(
                            oldest_timestamp_in_batch / 1000, tz=timezone.utc
                        ).strftime('%Y-%m-%d')
                        pbar.update(1)
                        pbar.set_postfix(fetched=f"{total_fetched/1000:.1f}k", last_dt=oldest_day)

                        all_data_list.extend(kline_list)

                        if oldest_timestamp_in_batch <= start_ts_ms or fetched_count < limit:
                            print("\n目標開始日以前のデータに到達、または取得データがlimit未満になったため終了します。")
                            break
                        if max_data is not None and total_fetched >= max_data:
                            print(f"\n最大取得件数 ({max_data} 件) に到達したため終了します。")
                            break

                        # Next page ends one millisecond before the oldest candle seen so far.
                        current_end_time = oldest_timestamp_in_batch - 1
                        await asyncio.sleep(0.2)  # stay below the rate limit
                    else:
                        print(f"\nReq {request_count}: データ取得エラーまたはデータがありません。Response: {data}")
                        break

                except Exception as e:
                    consecutive_errors += 1
                    print(f"\nReq {request_count}: リクエスト中にエラーが発生しました: {e}")
                    if consecutive_errors >= max_consecutive_errors:
                        print(f"\nエラーが {consecutive_errors} 回連続したため中断します。")
                        break
                    await asyncio.sleep(1)
    finally:
        pbar.close()

    if not all_data_list:
        print("有効なデータを取得できませんでした。")
        return pd.DataFrame()

    print("\nDataFrame変換中...")
    df = _kline_rows_to_dataframe(all_data_list, start_ts_ms)
    print("DataFrame変換完了。")

    print(f"\nデータ取得完了。合計 {len(df)} 件の一意なデータを取得しました。")
    return df


## 3 (改訂 v2). データ取得の実行と結果確認 (最大量取得・プログレスバー版)

In [None]:
# Required when running inside Jupyter (see nest_asyncio): lets asyncio.run()
# be used from the notebook.
nest_asyncio.apply()

if __name__ == '__main__':
    try:
        print("最大量のデータ取得処理を開始します...(プログレスバーが表示されます)")
        df_kline_max = asyncio.run(
            fetch_bybit_kline_full_tqdm(
                symbol=target_symbol,
                interval=interval,
                category=category,
                limit=limit,
                start_ts_ms=start_timestamp_ms,
                max_data=max_total_data,
            )
        )
        print("データ取得処理が完了しました。")
    except Exception as e:
        print(f"データ取得中に予期せぬエラーが発生しました: {e}")
        # Fall back to an empty frame so the reporting below still works.
        df_kline_max = pd.DataFrame()

# --- Report what was downloaded ---
if df_kline_max.empty:
    print("\nデータフレームが空、または取得に失敗しました。")
else:
    print("\n--- 取得データ (最初の5行) ---")
    display(df_kline_max.head())
    print("\n--- 取得データ (最後の5行) ---")
    display(df_kline_max.tail())
    print("\n--- データフレーム情報 ---")
    df_kline_max.info()
    first_ts = df_kline_max.index.min()
    last_ts = df_kline_max.index.max()
    print(f"\n取得期間: {first_ts} ~ {last_ts}")
    print(f"データ件数: {len(df_kline_max)}")

    # Optional: uncomment to persist the downloaded data as CSV.
    # try:
    #     print(f"\nデータを '{output_filename_full}' として保存中...")
    #     df_kline_max.to_csv(output_filename_full)
    #     print("保存完了。")
    # except Exception as e:
    #     print(f"\nデータの保存中にエラーが発生しました: {e}")


最大量のデータ取得処理を開始します...(プログレスバーが表示されます)
推定総リクエスト回数 (目安): 2666
データ取得を開始します (開始希望日: 2000-01-01)...


Fetching BTCUSDT:   0%|          | 0/2666 [00:00<?, ?req/s]


目標開始日以前のデータに到達、または取得データがlimit未満になったため終了します。

DataFrame変換中...


  df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x000002AA69E62510>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x000002AA1AA32040>, 920793.312)])']
connector: <aiohttp.connector.TCPConnector object at 0x000002AA1A699150>


DataFrame変換完了。

データ取得完了。合計 533372 件の一意なデータを取得しました。
データ取得処理が完了しました。

--- 取得データ (最初の5行) ---


Unnamed: 0_level_0,open,high,low,close,volume,turnover
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-25 10:35:00+00:00,6500.0,6500.0,6500.0,6500.0,0.001,6.5
2020-03-25 10:40:00+00:00,6500.0,6500.0,6500.0,6500.0,0.001,6.5
2020-03-25 10:45:00+00:00,6500.0,6500.0,6500.0,6500.0,0.0,0.0
2020-03-25 10:50:00+00:00,6500.0,6588.0,6500.0,6588.0,0.001,6.588
2020-03-25 10:55:00+00:00,6588.0,6591.5,6588.0,6591.5,0.001,6.5915



--- 取得データ (最後の5行) ---


Unnamed: 0_level_0,open,high,low,close,volume,turnover
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-04-20 09:50:00+00:00,84397.1,84470.6,84397.0,84459.2,266.047,22462690.0
2025-04-20 09:55:00+00:00,84459.2,84500.0,84459.1,84485.7,124.104,10483640.0
2025-04-20 10:00:00+00:00,84485.7,84485.7,84406.0,84406.0,98.594,8326069.0
2025-04-20 10:05:00+00:00,84406.0,84412.6,84380.3,84393.4,278.281,23485430.0
2025-04-20 10:10:00+00:00,84393.4,84393.5,84383.1,84383.2,33.614,2836721.0



--- データフレーム情報 ---
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 533372 entries, 2020-03-25 10:35:00+00:00 to 2025-04-20 10:10:00+00:00
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   open      533372 non-null  float64
 1   high      533372 non-null  float64
 2   low       533372 non-null  float64
 3   close     533372 non-null  float64
 4   volume    533372 non-null  float64
 5   turnover  533372 non-null  float64
dtypes: float64(6)
memory usage: 28.5 MB

取得期間: 2020-03-25 10:35:00+00:00 ~ 2025-04-20 10:10:00+00:00
データ件数: 533372


## 特徴量セット A (WclPriceベース) の作成

Weighted Close (WclPrice) を計算し、主要なテクニカル指標の計算に WclPrice を使用します。
tqdm で進捗を表示し、目的変数は時間差チェック済みのものを利用します。

In [None]:
import pandas as pd
import pandas_ta as ta
import numpy as np
from tqdm.notebook import tqdm

# df_kline_max (最大量データ) がロード済みとする
# Build feature set A: technical indicators computed on the Weighted Close
# (WclPrice), lagged returns, time features and a next-bar direction target.
# Requires df_kline_max (the downloaded klines) to be loaded already.
if 'df_kline_max' in locals() and not df_kline_max.empty:
    print("特徴量セット A (WclPriceベース) の作成を開始します...")
    df_features_wcl = df_kline_max.copy()

    # 1. Weighted Close: (high + low + 2 * close) / 4
    print("Calculating WclPrice...")
    df_features_wcl['wclprice'] = (df_features_wcl['high'] + df_features_wcl['low'] + 2 * df_features_wcl['close']) / 4

    # 2. Optional price features normalised by WclPrice (currently disabled):
    # df_features_wcl['open_norm_wcl'] = df_features_wcl['open'] / df_features_wcl['wclprice']
    # df_features_wcl['high_norm_wcl'] = df_features_wcl['high'] / df_features_wcl['wclprice']
    # df_features_wcl['low_norm_wcl'] = df_features_wcl['low'] / df_features_wcl['wclprice']
    # df_features_wcl['close_norm_wcl'] = df_features_wcl['close'] / df_features_wcl['wclprice']

    # 3. Technical indicators. Where an indicator takes a single price series,
    #    the WclPrice series is passed through `close=` so it is used as the input.
    print("テクニカル指標 (WclPriceベース) を計算中...")
    periods_ma = [7, 14, 21, 50, 100, 200]
    print("Calculating MAs (WclPrice)...")
    for length in tqdm(periods_ma, desc="MAs_WCL"):
        df_features_wcl.ta.sma(close=df_features_wcl['wclprice'], length=length, append=True, col_names=(f'SMA_{length}_WCL'))
        df_features_wcl.ta.ema(close=df_features_wcl['wclprice'], length=length, append=True, col_names=(f'EMA_{length}_WCL'))

    print("Calculating MACD (WclPrice)...")
    df_features_wcl.ta.macd(close=df_features_wcl['wclprice'], append=True, col_names=('MACD_WCL', 'MACDh_WCL', 'MACDs_WCL'))

    periods_rsi = [7, 14, 21]
    print("Calculating RSI (WclPrice)...")
    for length in tqdm(periods_rsi, desc="RSI_WCL"):
        df_features_wcl.ta.rsi(close=df_features_wcl['wclprice'], length=length, append=True, col_names=(f'RSI_{length}_WCL'))

    # Stochastics use high/low/close, so they are computed on the regular columns.
    print("Calculating Stochastics...")
    df_features_wcl.ta.stoch(append=True)  # default column names, e.g. STOCHk_14_3_3 / STOCHd_14_3_3

    # StochRSI is left on the regular close-based RSI (not the WclPrice RSI).
    print("Calculating StochRSI (Close base)...")
    df_features_wcl.ta.stochrsi(append=True)  # default column names, e.g. STOCHRSIk_14_14_3_3 / STOCHRSId_14_14_3_3

    # Bollinger Bands on WclPrice.
    print("Calculating Bollinger Bands (WclPrice)...")
    df_features_wcl.ta.bbands(close=df_features_wcl['wclprice'], length=20, std=2, append=True, col_names=('BBL_WCL', 'BBM_WCL', 'BBU_WCL', 'BBB_WCL', 'BBP_WCL'))

    # ATR, ADX, CCI and Williams %R are computed from the regular high/low/close columns.
    print("Calculating ATR...")
    df_features_wcl.ta.atr(length=14, append=True, col_names=('ATR_14'))

    print("Calculating ADX...")
    df_features_wcl.ta.adx(length=14, append=True)  # default column names, e.g. ADX_14 / DMP_14 / DMN_14

    print("Calculating CCI (HLC base)...")
    df_features_wcl.ta.cci(length=14, append=True, col_names=('CCI_14'))

    print("Calculating Williams %R...")
    df_features_wcl.ta.willr(length=14, append=True, col_names=('WILLR_14'))

    # OBV is computed from the regular close and volume columns.
    print("Calculating OBV...")
    df_features_wcl.ta.obv(append=True)

    # --- 4. Lagged returns for both close and WclPrice ---
    print("ラグ特徴量を計算中...")
    periods_return = [1, 2, 3, 5, 10, 20, 50]
    for n in tqdm(periods_return, desc="Returns"):
        df_features_wcl[f'return_{n}'] = df_features_wcl['close'].pct_change(periods=n)
        df_features_wcl[f'return_{n}_wcl'] = df_features_wcl['wclprice'].pct_change(periods=n)

    # --- 5. Time features ---
    print("時間特徴量を計算中...")
    df_features_wcl['hour'] = df_features_wcl.index.hour
    df_features_wcl['dayofweek'] = df_features_wcl.index.dayofweek

    # --- 6. Target: 1 if the next bar closes higher, else 0 ---
    # A row only gets a label when the next bar is exactly 5 minutes later;
    # rows followed by a gap (and the final row) get NaN and are dropped in step 7.
    print("目的変数 (ターゲット) を作成中 (5分間隔チェック付き)...")
    df_features_wcl['timediff'] = df_features_wcl.index.to_series().diff()
    df_features_wcl['next_close'] = df_features_wcl['close'].shift(-1)
    df_features_wcl['next_timediff'] = df_features_wcl['timediff'].shift(-1)
    next_bar_is_contiguous = df_features_wcl['next_timediff'] == pd.Timedelta('5 minutes')
    condition_high = (df_features_wcl['next_close'] > df_features_wcl['close']) & next_bar_is_contiguous
    condition_low = (df_features_wcl['next_close'] <= df_features_wcl['close']) & next_bar_is_contiguous
    df_features_wcl['target'] = np.select([condition_high, condition_low], [1.0, 0.0], default=np.nan)
    df_features_wcl = df_features_wcl.drop(columns=['timediff', 'next_timediff', 'next_close'])

    # --- 7. Drop rows containing NaN (indicator warm-up, gaps, last row) ---
    rows_before_dropna = len(df_features_wcl)
    print(f"\nNaN削除前の行数: {rows_before_dropna}")
    print("NaN削除処理を開始します...")
    # .copy() gives an independent frame, so the assignment below does not
    # trigger pandas' SettingWithCopyWarning.
    df_processed_wcl = df_features_wcl.dropna().copy()
    print("NaN削除処理完了。")
    rows_after_dropna = len(df_processed_wcl)
    # NaN labels are gone now, so the target can be stored as an integer class.
    df_processed_wcl['target'] = df_processed_wcl['target'].astype(int)

    print(f"NaN削除後の行数: {rows_after_dropna}")
    print(f"削除された行数: {rows_before_dropna - rows_after_dropna}")

    print("\n--- 特徴量セット A (WclPrice) 作成完了 ---")
    df_processed_wcl.info(verbose=False, memory_usage='deep')
    display(df_processed_wcl.head())

    # Persist the processed features; the timestamp index is written as the first column.
    df_processed_wcl.to_csv('processed_data_wcl.csv')

else:
    print("df_kline_max が存在しないか空です。データ取得ステップを先に実行してください。")


特徴量セット A (WclPriceベース) の作成を開始します...
Calculating WclPrice...
テクニカル指標 (WclPriceベース) を計算中...
Calculating MAs (WclPrice)...


MAs_WCL:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating MACD (WclPrice)...
Calculating RSI (WclPrice)...


RSI_WCL:   0%|          | 0/3 [00:00<?, ?it/s]

Calculating Stochastics...
Calculating StochRSI (Close base)...
Calculating Bollinger Bands (WclPrice)...
Calculating ATR...
Calculating ADX...
Calculating CCI (HLC base)...
Calculating Williams %R...
Calculating OBV...
ラグ特徴量を計算中...


Returns:   0%|          | 0/7 [00:00<?, ?it/s]

時間特徴量を計算中...
目的変数 (ターゲット) を作成中 (5分間隔チェック付き)...

NaN削除前の行数: 533372
NaN削除処理を開始します...
NaN削除処理完了。
NaN削除後の行数: 533172
削除された行数: 200

--- 特徴量セット A (WclPrice) 作成完了 ---
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 533172 entries, 2020-03-26 03:10:00+00:00 to 2025-04-20 10:05:00+00:00
Columns: 58 entries, open to target
dtypes: float64(55), int32(2), int64(1)
memory usage: 235.9 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed_wcl['target'] = df_processed_wcl['target'].astype(int)


Unnamed: 0_level_0,open,high,low,close,volume,turnover,wclprice,SMA_7_WCL,EMA_7_WCL,SMA_14_WCL,...,return_5_wcl,return_10,return_10_wcl,return_20,return_20_wcl,return_50,return_50_wcl,hour,dayofweek,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-26 03:10:00+00:00,6687.5,6687.5,6667.5,6670.0,35.519,236911.73,6673.75,6685.196429,6682.394124,6679.357143,...,-0.001944,-0.00045,0.0003,-0.005517,-0.004475,-0.001048,-0.000636,3,3,1
2020-03-26 03:15:00+00:00,6670.0,6693.5,6670.0,6693.5,92.544,619443.264,6687.625,6686.285714,6683.701843,6679.446429,...,-0.000766,0.004728,0.003432,-0.001864,-0.002424,0.003598,0.002567,3,3,0
2020-03-26 03:20:00+00:00,6693.5,6693.5,6682.5,6682.5,10.882,72718.965,6685.25,6686.071429,6684.088882,6680.589286,...,-0.001307,0.000749,0.002005,-0.00484,-0.003707,0.004283,0.004112,3,3,1
2020-03-26 03:25:00+00:00,6682.5,6694.0,6682.5,6685.0,11.24,75139.4,6686.625,6685.196429,6684.722912,6681.6875,...,0.000355,0.000749,0.001085,-0.003057,-0.003168,0.003528,0.004054,3,3,1
2020-03-26 03:30:00+00:00,6685.0,6738.0,6685.0,6737.5,13.516,91064.05,6724.5,6689.553571,6694.667184,6685.455357,...,0.005928,0.008608,0.006662,0.0071,0.004575,0.01012,0.008492,3,3,0


## PyCaret による AutoML評価 (特徴量セットA: WclPriceベース) - 修正版 3

setup 関数から numeric_features パラメータを削除し、型推論に任せてみます。

In [None]:
import pandas as pd

# Reload the processed features. The CSV stores the timestamp index as a
# `timestamp` column; restore it as a parsed DatetimeIndex so it is neither
# lost (replaced by a 0..N integer index) nor left as a string column that
# downstream modelling would pick up as a feature.
df_processed_wcl = pd.read_csv('processed_data_wcl.csv', index_col='timestamp', parse_dates=True)
display(df_processed_wcl)


Unnamed: 0,timestamp,open,high,low,close,volume,turnover,wclprice,SMA_7_WCL,EMA_7_WCL,...,return_5_wcl,return_10,return_10_wcl,return_20,return_20_wcl,return_50,return_50_wcl,hour,dayofweek,target
0,2020-03-26 03:10:00+00:00,6687.5,6687.5,6667.5,6670.0,35.519,2.369117e+05,6673.750,6685.196429,6682.394124,...,-0.001944,-0.000450,0.000300,-0.005517,-0.004475,-0.001048,-0.000636,3,3,1
1,2020-03-26 03:15:00+00:00,6670.0,6693.5,6670.0,6693.5,92.544,6.194433e+05,6687.625,6686.285714,6683.701843,...,-0.000766,0.004728,0.003432,-0.001864,-0.002424,0.003598,0.002567,3,3,0
2,2020-03-26 03:20:00+00:00,6693.5,6693.5,6682.5,6682.5,10.882,7.271896e+04,6685.250,6686.071429,6684.088882,...,-0.001307,0.000749,0.002005,-0.004840,-0.003707,0.004283,0.004112,3,3,1
3,2020-03-26 03:25:00+00:00,6682.5,6694.0,6682.5,6685.0,11.240,7.513940e+04,6686.625,6685.196429,6684.722912,...,0.000355,0.000749,0.001085,-0.003057,-0.003168,0.003528,0.004054,3,3,1
4,2020-03-26 03:30:00+00:00,6685.0,6738.0,6685.0,6737.5,13.516,9.106405e+04,6724.500,6689.553571,6694.667184,...,0.005928,0.008608,0.006662,0.007100,0.004575,0.010120,0.008492,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
533167,2025-04-20 09:45:00+00:00,84438.0,84438.0,84395.0,84397.1,241.373,2.037444e+07,84406.800,84488.360714,84478.622062,...,-0.001563,-0.002211,-0.002025,-0.003318,-0.003250,-0.008122,-0.008038,9,6,1
533168,2025-04-20 09:50:00+00:00,84397.1,84470.6,84397.0,84459.2,266.047,2.246269e+07,84446.500,84475.735714,84470.591547,...,-0.000932,-0.001210,-0.001363,-0.003080,-0.003215,-0.007305,-0.007475,9,6,1
533169,2025-04-20 09:55:00+00:00,84459.2,84500.0,84459.1,84485.7,124.104,1.048364e+07,84482.625,84467.689286,84473.599910,...,-0.000157,-0.000996,-0.000984,-0.002205,-0.002418,-0.006867,-0.006877,9,6,0
533170,2025-04-20 10:00:00+00:00,84485.7,84485.7,84406.0,84406.0,98.594,8.326069e+06,84425.925,84453.500000,84461.681183,...,-0.000557,-0.001763,-0.001579,-0.002984,-0.002765,-0.007605,-0.007536,10,6,0


In [None]:
# 必要なライブラリをインポート
import pandas as pd
from sklearn.model_selection import train_test_split
# PyCaret classification モジュールをインポート
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score


# 特徴量セットAのデータフレーム df_processed_wcl が存在すると仮定
# AutoML evaluation of feature set A with PyCaret.
# NOTE(review): setup / compare_models / predict_model are not imported anywhere
# in the visible notebook; presumably `from pycaret.classification import ...`
# was run in the kernel beforehand - confirm and add the import to this cell.
if 'df_processed_wcl' in locals() and not df_processed_wcl.empty:

    # --- 1. Chronological train/test split ---
    print("学習データとテストデータに分割中...")
    # Raw price/volume columns and the label are not used as features. A
    # `timestamp` column (present when the CSV was read without an index) is
    # excluded too: as a string it would be treated as a categorical feature
    # with one distinct value per row.
    exclude_cols = ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'turnover', 'wclprice', 'target']
    features_wcl = [col for col in df_processed_wcl.columns if col not in exclude_cols]
    print(f"定義上の特徴量の数: {len(features_wcl)}")

    X_wcl = df_processed_wcl[features_wcl]
    y_wcl = df_processed_wcl['target']

    test_size = 0.2
    # shuffle=False keeps time order: the last 20% of rows form the test set.
    X_train_wcl, X_test_wcl, y_train_wcl, y_test_wcl = train_test_split(
        X_wcl, y_wcl, test_size=test_size, shuffle=False
    )

    train_data_wcl = pd.concat([X_train_wcl, y_train_wcl], axis=1)
    test_data_wcl = pd.concat([X_test_wcl, y_test_wcl], axis=1)

    print(f"学習データ数: {len(train_data_wcl)}, テストデータ数: {len(test_data_wcl)}")
    print(f"学習データ期間: {train_data_wcl.index.min()} ~ {train_data_wcl.index.max()}")
    print(f"テストデータ期間: {test_data_wcl.index.min()} ~ {test_data_wcl.index.max()}")

    # --- 2. PyCaret setup ---
    # numeric_features is intentionally not passed; PyCaret infers column types.
    print("\nPyCaret セットアップを開始します...")
    session_id = 123
    clf_setup = setup(data=train_data_wcl,
                      target='target',
                      test_data=test_data_wcl,
                      index=train_data_wcl.index,
                      fold_strategy='timeseries',
                      fold=3,
                      data_split_shuffle=False,
                      fold_shuffle=False,
                      session_id=session_id,
                      use_gpu=False,
                      verbose=True,
                      preprocess=True)

    print("PyCaret セットアップ完了。")

    # --- 3. Model comparison (time-series CV on the training data) ---
    print("\nPyCaret モデル比較を開始します (時間がかかります)...")
    best_model_cv = compare_models(sort='AUC')

    print("\n--- モデル比較結果 (学習データでの時系列CV) ---")
    print(best_model_cv)

    # --- 4. Evaluation of the CV winner on the hold-out test data ---
    # Explicit check instead of truthiness: some estimators define __len__,
    # which makes `if model:` unreliable.
    model_found = best_model_cv is not None and not (
        isinstance(best_model_cv, list) and len(best_model_cv) == 0
    )
    if model_found:
        print("\n--- 最良モデル (CV基準) のテストデータでの評価 ---")
        test_predictions = predict_model(best_model_cv)
        print(test_predictions.head())

        final_accuracy = accuracy_score(test_predictions['target'], test_predictions['prediction_label'])

        final_auc = None
        positive_proba = None
        if 'prediction_score' in test_predictions.columns:
            # NOTE(review): per the PyCaret docs, `prediction_score` is the
            # probability of the *predicted* label (unless raw_score=True), so it
            # is converted to the probability of class 1 before computing AUC.
            predicted_score = test_predictions['prediction_score']
            positive_proba = predicted_score.where(
                test_predictions['prediction_label'] == 1, 1 - predicted_score
            )
            final_auc = roc_auc_score(test_predictions['target'], positive_proba)
            print(f"\nテストデータ Accuracy: {final_accuracy:.4f}")
            print(f"テストデータ AUC Score: {final_auc:.4f}")  # headline metric
        else:
            print(f"\nテストデータ Accuracy: {final_accuracy:.4f}")
            print("テストデータ AUC Score: (確率が出力されませんでした)")

        # Collect results across experiments in a notebook-level dict.
        if 'model_results' not in locals():
            model_results = {}
        model_results[f'PyCaret_Best_{type(best_model_cv).__name__}_WCL'] = {
            'model': best_model_cv,
            'auc': final_auc,
            'accuracy': final_accuracy,
            'y_pred_proba': positive_proba.values if positive_proba is not None else None,
            'features': 'Set A (WCL)'
        }
        # Optional: persist the winning model.
        # save_model(best_model_cv, f'pycaret_best_model_wcl_{session_id}')

    else:
        print("モデル比較で有効なモデルが見つかりませんでした。")


else:
    print("df_processed_wcl が存在しないか空です。前のステップを先に実行してください。")


学習データとテストデータに分割中...
定義上の特徴量の数: 51
学習データ数: 426537, テストデータ数: 106635
学習データ期間: 0 ~ 426536
テストデータ期間: 426537 ~ 533171

PyCaret セットアップを開始します...


Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Binary
3,Original data shape,"(533172, 52)"
4,Transformed data shape,"(533172, 52)"
5,Transformed train set shape,"(426537, 52)"
6,Transformed test set shape,"(106635, 52)"
7,Numeric features,50
8,Categorical features,1
9,Preprocess,True


PyCaret セットアップ完了。

PyCaret モデル比較を開始します (時間がかかります)...


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.5053,0.5418,0.0,0.0,0.0,0.0,0.0,0.8333
lr,Logistic Regression,0.5284,0.5402,0.4357,0.5284,0.475,0.0549,0.0561,10.4933
nb,Naive Bayes,0.523,0.5379,0.6555,0.5158,0.5709,0.0493,0.0532,1.5267
svm,SVM - Linear Kernel,0.5053,0.5072,0.0,0.25,0.0001,0.0,0.0015,18.49
rf,Random Forest Classifier,0.5053,0.5041,0.0,0.0,0.0,0.0,0.0,3.9367
dt,Decision Tree Classifier,0.5053,0.5,0.0,0.0,0.0,0.0,0.0,2.03
ada,Ada Boost Classifier,0.5053,0.5,0.0,0.0,0.0,0.0,0.0,2.0733
lda,Linear Discriminant Analysis,0.5053,0.5,0.0,0.0,0.0,0.0,0.0,1.21
dummy,Dummy Classifier,0.5053,0.5,0.0,0.0,0.0,0.0,0.0,0.73
gbc,Gradient Boosting Classifier,0.5053,0.4997,0.0,0.0,0.0,0.0,0.0,78.0567



--- モデル比較結果 (学習データでの時系列CV) ---
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, positive=False, random_state=123, solver='auto',
                tol=0.0001)

--- 最良モデル (CV基準) のテストデータでの評価 ---


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.5001,0.5,0.0,0.0,0.0,0.0,0.0


                        timestamp     SMA_7_WCL     EMA_7_WCL    SMA_14_WCL  \
426537  2024-04-15 03:55:00+00:00  65191.753906  65171.300781  65180.351562   
426538  2024-04-15 04:00:00+00:00  65173.500000  65147.781250  65177.714844   
426539  2024-04-15 04:05:00+00:00  65123.480469  65070.886719  65157.492188   
426540  2024-04-15 04:10:00+00:00  65055.378906  64989.621094  65132.175781   
426541  2024-04-15 04:15:00+00:00  64989.160156  64944.371094  65103.183594   

          EMA_14_WCL    SMA_21_WCL    EMA_21_WCL    SMA_50_WCL    EMA_50_WCL  \
426537  65181.789062  65160.914062  65194.449219  65319.042969  65172.554688   
426538  65167.847656  65155.726562  65183.792969  65308.832031  65168.816406   
426539  65124.160156  65140.695312  65152.554688  65293.656250  65155.929688   
426540  65073.714844  65119.503906  65115.582031  65279.503906  65139.847656   
426541  65038.371094  65103.058594  65087.675781  65266.691406  65126.855469   

         SMA_100_WCL  ...  return_10  return