<a href="https://colab.research.google.com/github/yuukienomoto/mahjong/blob/main/%E5%88%86%E9%A1%9E%E3%82%B3%E3%83%BC%E3%83%89%E3%82%B3%E3%83%94%E3%83%BCipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
数牌8万、字牌2万のアンダーサンプリング

SyntaxError: invalid character '、' (U+3001) (ipython-input-768148582.py, line 1)

In [None]:
!pip install optuna

import os
import glob
import zipfile
import shutil
import numpy as np
import pandas as pd
import lightgbm as lgb
import xml.etree.ElementTree as ET
import optuna
import re
import gc # ガベージコレクション（メモリ解放用）
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# =============================================================================
# 1. 環境設定・データの準備
# =============================================================================
print("\n--- ステップ1: 環境設定とデータの準備 ---")
try:
    drive.mount('/content/drive')
except Exception as e:
    print(f"Google Driveのマウントに失敗しました: {e}")

# ZIPファイルのパス
ZIP_PATH = '/content/drive/MyDrive/dataset_2022-2024_houou_80k.zip'

# 作業用ディレクトリ
LOCAL_WORK_DIR = '/content/temp_xml_data/'      # XML解凍先
BATCH_SAVE_DIR = '/content/temp_batches/'       # ★一時バッチ保存先

# 最終保存先
DATASET_PKL = '/content/drive/MyDrive/mahjong_balanced_dataset_200k.pkl'

# ディレクトリ初期化
if os.path.exists(BATCH_SAVE_DIR):
    shutil.rmtree(BATCH_SAVE_DIR)
os.makedirs(BATCH_SAVE_DIR, exist_ok=True)

# ZIP解凍
if not os.path.exists(LOCAL_WORK_DIR):
    print(f"★ZIPファイル: {ZIP_PATH}")
    if os.path.exists(ZIP_PATH):
        print("ローカル環境に高速解凍中... (数分お待ちください)")
        try:
            with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
                zip_ref.extractall(LOCAL_WORK_DIR)
            print("解凍完了！")
        except Exception as e:
            print(f"解凍エラー: {e}")
            raise
    else:
        raise FileNotFoundError("ZIP file not found")
else:
    print("既に解凍済みデータがあるため、それを使用します。")

# 解析対象フォルダ
LOG_DIR = LOCAL_WORK_DIR
print(f"解析対象フォルダ: {LOG_DIR}")

# =============================================================================
# 2. ヘルパー関数 (ロジック)
# =============================================================================
# (ロジック部分は変更なし)

def _parse_tile_type(tile_type_code):
    if 0 <= tile_type_code <= 8:      return 0, tile_type_code + 1
    elif 9 <= tile_type_code <= 17: return 1, (tile_type_code - 9) + 1
    elif 18 <= tile_type_code <= 26: return 2, (tile_type_code - 18) + 1
    elif 27 <= tile_type_code <= 33: return 3, (tile_type_code - 27) + 1
    else: return -1, -1

def count_dora(tile_code, dora_indicators):
    dora_count = 0
    t_type, t_num = _parse_tile_type(tile_code)
    for ind in dora_indicators:
        ind_type, ind_num = _parse_tile_type(ind)
        if t_type == ind_type:
            if t_type == 3:
                target_num = (ind_num % 4) + 1
                if ind_num >= 5: target_num = 5 + ((ind_num - 5 + 1) % 3)
                if t_num == target_num: dora_count += 1
            else:
                target_num = 1 if ind_num == 9 else ind_num + 1
                if t_num == target_num: dora_count += 1
    return dora_count

def parse_called_tile(m_int):
    extracted = (m_int >> 8) & 0xFF
    return extracted * 4

def check_suji_fair(target_type, target_num, opponent_discards):
    if target_type == 3: return 0
    opp_discards_nums = []
    for d in opponent_discards:
        dt, dn = _parse_tile_type(d)
        if dt == target_type:
            opp_discards_nums.append(dn)

    suji_map = {
        1: [4], 2: [5], 3: [6],
        4: [1, 7], 5: [2, 8], 6: [3, 9],
        7: [4], 8: [5], 9: [6]
    }
    needed = suji_map.get(target_num, [])
    if not needed: return 0
    return 1 if all(n in opp_discards_nums for n in needed) else 0

def get_kabe_fair(target_type, target_num, all_public_tiles):
    if target_type == 3: return 0
    check_nums = []
    if target_num == 1: check_nums = [2]
    elif target_num == 9: check_nums = [8]
    elif target_num == 2: check_nums = [3]
    elif target_num == 8: check_nums = [7]
    elif target_num == 3: check_nums = [4]
    elif target_num == 7: check_nums = [6]
    elif target_num == 4: check_nums = [3, 5]
    elif target_num == 5: check_nums = [4, 6]
    elif target_num == 6: check_nums = [5, 7]

    max_visible = 0
    for n in check_nums:
        check_id = (target_type * 9) + (n - 1)
        count = all_public_tiles.count(check_id)
        if count > max_visible: max_visible = count
    return max_visible

def get_my_yakuman_potential(my_hand_tiles):
    tiles = [_parse_tile_type(t // 4) for t in my_hand_tiles]
    yaochu_ids = set()
    sangen_count = 0
    jihai_count = 0
    routou_count = 0
    ryuuiiso_count = 0

    for t_type, t_num in tiles:
        if t_type == 3 or t_num == 1 or t_num == 9:
            yaochu_ids.add(t_type * 10 + t_num)
        if t_type == 3 and t_num >= 5:
            sangen_count += 1
        if t_type == 3:
            jihai_count += 1
        if t_type < 3 and (t_num == 1 or t_num == 9):
            routou_count += 1
        if (t_type == 2 and t_num in [2,3,4,6,8]) or (t_type == 3 and t_num == 6):
            ryuuiiso_count += 1

    scores = [
        len(yaochu_ids), sangen_count * 2.5, jihai_count, routou_count, ryuuiiso_count
    ]
    return max(scores)

def get_meld_threat_level(opp_idx, game_state, dora_indicators):
    melds = game_state['open_hands'][opp_idx]
    if not melds: return 0

    threat = 0
    colors = {0:0, 1:0, 2:0, 3:0}
    dora_pon_count = 0

    for meld in melds:
        if not meld: continue
        first = meld[0]
        tt, tn = _parse_tile_type(first)

        if tt != -1: colors[tt] += 1
        d_val = count_dora(first // 4, dora_indicators)
        if d_val > 0:
            dora_pon_count += d_val
        if tt == 3 and tn >= 5: threat += 15

    threat += (dora_pon_count * 20)

    for c in range(3):
        if colors[c] + colors[3] >= 3 and colors[c] >= 2:
            threat += 20
            if colors[3] == 0: threat += 30

    sangen = 0
    for meld in melds:
        first = meld[0] // 4
        tt, tn = _parse_tile_type(first)
        if tt == 3 and tn >= 5: sangen += 1
    if sangen >= 2: threat += 80

    return threat

def calculate_features_fair(game_state, player_index, physical_code, is_tsumogiri, oya_player):
    tile_type_code = physical_code // 4
    t_type, t_num = _parse_tile_type(tile_type_code)
    features = []

    # 公開情報
    public_tiles = []
    public_tiles.extend(game_state['hands'][player_index])
    for i in range(4):
        public_tiles.extend(game_state['discards'][i])
        for meld in game_state['open_hands'][i]:
            public_tiles.extend(meld)

    features.append(t_type)
    features.append(t_num)
    features.append(public_tiles.count(tile_type_code))

    dora_val = 0
    dora_indicators = game_state['dora_indicators']
    dora_val += count_dora(tile_type_code, dora_indicators)
    if physical_code in [16, 52, 88]: dora_val += 1
    features.append(dora_val)

    features.append(len(game_state['history']) // 4)
    features.append(1 if is_tsumogiri else 0)

    other_reach_indices = [i for i in range(4) if i != player_index and game_state['reach'][i]]
    features.append(len(other_reach_indices))

    risk_score = 0
    if len(other_reach_indices) > 0:
        for r_idx in other_reach_indices:
            if tile_type_code in game_state['discards'][r_idx]: continue
            this_risk = 0
            if check_suji_fair(t_type, t_num, game_state['discards'][r_idx]) == 0:
                this_risk += 50
                if 4 <= t_num <= 6: this_risk += 20
                kabe = get_kabe_fair(t_type, t_num, public_tiles)
                if kabe < 1: this_risk += 30
            else:
                this_risk += 15
            if this_risk > risk_score: risk_score = this_risk
    features.append(risk_score)

    live_degree = 0
    visible_n = public_tiles.count(tile_type_code)
    if visible_n == 0:
        live_degree = 2
        if t_type == 3: live_degree = 3
    elif visible_n == 1:
        live_degree = 1
    features.append(live_degree)

    features.append(get_kabe_fair(t_type, t_num, public_tiles))

    max_meld_threat = 0
    for i in range(4):
        if i != player_index:
            th = get_meld_threat_level(i, game_state, dora_indicators)
            if th > max_meld_threat: max_meld_threat = th
    features.append(max_meld_threat)

    my_yakuman = get_my_yakuman_potential(game_state['hands'][player_index])
    features.append(my_yakuman)

    return features

feature_names = [
    '1.種類', '2.数字', '3.見え枚', '4.ドラ数',
    '5.巡目', '6.ツモ切', '7.他家立直数',
    '8.対リーチ危険度', '9.生牌度', '10.カベ強度',
    '11.敵副露脅威度', '12.自手牌価値'
]

# =============================================================================
# 4. XML解析 (バッチ保存方式でメモリクラッシュ回避)
# =============================================================================

def get_clean_tag(tag_str):
    if '}' in tag_str:
        return tag_str.split('}', 1)[1]
    return tag_str

print("\n★ XML解析開始...")
xml_files = glob.glob(os.path.join(LOG_DIR, '**/*.xml'), recursive=True)
print(f"発見ファイル数: {len(xml_files)}")

if len(xml_files) == 0:
    print("エラー: XMLファイルが見つかりません。")
else:
    # バッチ処理用の変数
    current_batch_data = []
    BATCH_SIZE_FILES = 2000 # 2000ファイルごとに保存してメモリ解放
    batch_count = 0

    draw_ptn = re.compile(r'^[TUVW]\d+$')
    discard_ptn = re.compile(r'^[DEFG]\d+$')

    count_suuhai_pos = 0
    count_jihai_pos = 0

    TARGET_SUUHAI = 80000
    TARGET_JIHAI = 20000

    print(f"目標: 数牌正例 {TARGET_SUUHAI}件, 字牌正例 {TARGET_JIHAI}件")
    print("★メモリ保護のため、小分けにして保存(Batch processing)を行います。")

    for i_file, file_path in enumerate(xml_files):

        # --- バッチ保存処理 ---
        if i_file > 0 and i_file % BATCH_SIZE_FILES == 0:
            # ここまで溜まったデータを保存
            if current_batch_data:
                batch_df = pd.DataFrame(current_batch_data)
                save_path = os.path.join(BATCH_SAVE_DIR, f'batch_{batch_count}.pkl')
                batch_df.to_pickle(save_path)
                batch_count += 1

                # メモリ解放
                del batch_df, current_batch_data
                current_batch_data = []
                gc.collect() # 強制メモリ解放

                print(f" [Batch] {i_file}ファイル処理完了 -> batch_{batch_count-1}.pkl 保存 (現在: 数牌正{count_suuhai_pos}/字牌正{count_jihai_pos})")

            # 終了判定
            if count_suuhai_pos >= TARGET_SUUHAI and count_jihai_pos >= TARGET_JIHAI:
                print("★目標のデータ数に達しました。解析ループを終了します。")
                break

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                try:
                    tree = ET.parse(f)
                    root = tree.getroot()
                except:
                    continue

            game_state = {
                'hands': [[],[],[],[]], 'discards': [[],[],[],[]], 'open_hands': [[],[],[],[]],
                'reach': [False]*4, 'history': [], 'dora_indicators': []
            }
            last_draw = [-1]*4
            last_discard_event = None
            oya_player = 0

            for tag in root:
                tag_name = get_clean_tag(tag.tag)

                if tag_name == 'INIT':
                    if last_discard_event:
                        current_batch_data.append(last_discard_event)
                        if last_discard_event['y'] == 1:
                            if last_discard_event['X'][0] == 3: count_jihai_pos += 1
                            else: count_suuhai_pos += 1

                    oya_player = int(tag.attrib.get('oya', 0))
                    seed = tag.attrib.get('seed', '0,0,0,0,0,0').split(',')
                    dora_ind = int(seed[5]) // 4 if len(seed) > 5 else 0

                    game_state = {
                        'hands': [[],[],[],[]], 'discards': [[],[],[],[]], 'open_hands': [[],[],[],[]],
                        'reach': [False]*4, 'history': [], 'dora_indicators': [dora_ind]
                    }
                    last_discard_event = None
                    for p in range(4):
                        h = tag.attrib.get(f'hai{p}')
                        if h: game_state['hands'][p] = [int(s)//4 for s in h.split(',')]

                if draw_ptn.match(tag_name):
                    p = {'T':0,'U':1,'V':2,'W':3}[tag_name[0]]
                    phys = int(tag_name[1:])
                    last_draw[p] = phys
                    game_state['hands'][p].append(phys // 4)

                if discard_ptn.match(tag_name):
                    p = {'D':0,'E':1,'F':2,'G':3}[tag_name[0]]
                    phys = int(tag_name[1:])

                    if last_discard_event:
                        current_batch_data.append(last_discard_event)
                        if last_discard_event['y'] == 1:
                            if last_discard_event['X'][0] == 3: count_jihai_pos += 1
                            else: count_suuhai_pos += 1

                    is_tsumogiri = (phys == last_draw[p])
                    feats = calculate_features_fair(game_state, p, phys, is_tsumogiri, oya_player)

                    last_discard_event = {'X': feats, 'y': 0}

                    t_type = phys // 4
                    if t_type in game_state['hands'][p]: game_state['hands'][p].remove(t_type)
                    game_state['discards'][p].append(t_type)
                    game_state['history'].append((p, t_type))

                if tag_name == 'N':
                    who = int(tag.attrib.get('who'))
                    m = int(tag.attrib.get('m'))
                    called_tile_id = parse_called_tile(m)
                    c_type = called_tile_id // 4
                    removed = 0
                    temp = game_state['hands'][who][:]
                    for h in temp:
                        if h == c_type and removed < 2:
                            game_state['hands'][who].remove(h)
                            removed += 1
                    game_state['open_hands'][who].append([called_tile_id]*3)

                    if last_discard_event:
                        current_batch_data.append(last_discard_event)
                        if last_discard_event['y'] == 1:
                            if last_discard_event['X'][0] == 3: count_jihai_pos += 1
                            else: count_suuhai_pos += 1
                        last_discard_event = None

                if tag_name == 'REACH' and tag.attrib.get('step') == '1':
                    who_reach = int(tag.attrib.get('who'))
                    game_state['reach'][who_reach] = True

                if tag_name == 'DORA':
                    hai = int(tag.attrib.get('hai'))
                    game_state['dora_indicators'].append(hai // 4)

                if tag_name == 'AGARI':
                    loser = tag.attrib.get('fromWho')
                    if loser and int(tag.attrib.get('who')) != int(loser):
                        if last_discard_event:
                            last_discard_event['y'] = 1
                            current_batch_data.append(last_discard_event)
                            if last_discard_event['X'][0] == 3: count_jihai_pos += 1
                            else: count_suuhai_pos += 1
                            last_discard_event = None

        except:
            continue

    # ループ終了後の残りを保存
    if current_batch_data:
        batch_df = pd.DataFrame(current_batch_data)
        save_path = os.path.join(BATCH_SAVE_DIR, f'batch_{batch_count}.pkl')
        batch_df.to_pickle(save_path)
        print(f" [Batch] 最終バッチ保存完了 -> batch_{batch_count}.pkl")
        del batch_df, current_batch_data
        gc.collect()

# =============================================================================
# 5. ★アンダーサンプリング (バッチファイルを統合しながら抽出)
# =============================================================================
print("\n--- ステップ2: 保存されたバッチファイルからサンプリング統合 ---")

batch_files = glob.glob(os.path.join(BATCH_SAVE_DIR, '*.pkl'))
if len(batch_files) == 0:
    print("エラー: バッチファイルが生成されていません。")
else:
    # データを溜めるリスト
    all_suuhai_pos = []
    all_suuhai_neg = []
    all_jihai_pos = []
    all_jihai_neg = []

    # 目標数
    TARGET_SUUHAI_EACH = 80000
    TARGET_JIHAI_EACH = 20000

    print(f"全{len(batch_files)}個のバッチファイルを処理中...")

    for b_file in batch_files:
        try:
            df_b = pd.read_pickle(b_file)
            if len(df_b) == 0: continue

            # 特徴量展開
            X_temp = pd.DataFrame(df_b['X'].tolist(), columns=feature_names)
            df_full = pd.concat([df_b['y'], X_temp], axis=1)
            df_full['is_jihai'] = df_full['1.種類'] == 3

            # カテゴリ分けして一時リストに追加
            s_p = df_full[(df_full['is_jihai'] == False) & (df_full['y'] == 1)]
            s_n = df_full[(df_full['is_jihai'] == False) & (df_full['y'] == 0)]
            j_p = df_full[(df_full['is_jihai'] == True) & (df_full['y'] == 1)]
            j_n = df_full[(df_full['is_jihai'] == True) & (df_full['y'] == 0)]

            # 必要な分だけ保持（メモリ節約のため、負例はランダムに少しだけ取る）
            # 正例は全て確保
            all_suuhai_pos.append(s_p)
            all_jihai_pos.append(j_p)

            # 負例はバッチごとに少しサンプリングして確保（全部持つと死ぬので）
            # バッチ数で割った数より少し多めに確保しておく
            sample_rate = 0.2 # 負例は20%くらい確保しておけば十分
            if len(s_n) > 0: all_suuhai_neg.append(s_n.sample(frac=sample_rate))
            if len(j_n) > 0: all_jihai_neg.append(j_n.sample(frac=sample_rate))

            del df_b, df_full, s_p, s_n, j_p, j_n, X_temp
            gc.collect()

        except Exception as e:
            print(f"Skip file {b_file}: {e}")

    # 全結合
    print("データを結合中...")
    df_suuhai_pos = pd.concat(all_suuhai_pos)
    df_suuhai_neg = pd.concat(all_suuhai_neg)
    df_jihai_pos = pd.concat(all_jihai_pos)
    df_jihai_neg = pd.concat(all_jihai_neg)

    print(f"集計結果(正例): 数牌 {len(df_suuhai_pos)}, 字牌 {len(df_jihai_pos)}")

    # 最終サンプリング
    n_sp = min(len(df_suuhai_pos), TARGET_SUUHAI_EACH)
    n_sn = TARGET_SUUHAI_EACH

    n_jp = min(len(df_jihai_pos), TARGET_JIHAI_EACH)
    n_jn = TARGET_JIHAI_EACH

    # 負例が足りない場合
    if len(df_suuhai_neg) < n_sn: n_sn = len(df_suuhai_neg)
    if len(df_jihai_neg) < n_jn: n_jn = len(df_jihai_neg)

    final_df = pd.concat([
        df_suuhai_pos.sample(n=n_sp, random_state=42),
        df_suuhai_neg.sample(n=n_sn, random_state=42),
        df_jihai_pos.sample(n=n_jp, random_state=42),
        df_jihai_neg.sample(n=n_jn, random_state=42)
    ]).sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"\n★最終データセット作成完了: {len(final_df)}件")
    print(final_df['y'].value_counts())

    # 保存
    final_df.to_pickle(DATASET_PKL)
    print(f"データセット保存完了: {DATASET_PKL}")

    # =============================================================================
    # 6. Optuna & 学習
    # =============================================================================
    X = final_df[feature_names]
    y = final_df['y']
    cat_features = ['1.種類']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("\n--- ステップ3: Optunaによるハイパーパラメータ探索 ---")

    def objective(trial):
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'n_estimators': 300,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
            'max_depth': trial.suggest_int('max_depth', 5, 15),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        }

        model = lgb.LGBMClassifier(**param)
        model.fit(X_train, y_train, categorical_feature=cat_features)
        preds = model.predict(X_test)
        return accuracy_score(y_test, preds)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=15)

    print(f"★ Best Params: {study.best_params}")

    # --- 最終学習 ---
    print("\n--- 最終学習 ---")
    best_params = study.best_params
    final_model = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        n_estimators=5000,
        learning_rate=best_params.get('learning_rate', 0.05),
        num_leaves=best_params.get('num_leaves', 31),
        max_depth=best_params.get('max_depth', -1),
        min_child_samples=best_params.get('min_child_samples', 20),
        subsample=best_params.get('subsample', 0.8),
        colsample_bytree=best_params.get('colsample_bytree', 0.8)
    )

    final_model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(100)],
        categorical_feature=cat_features
    )

    # =============================================================================
    # 7. 結果評価
    # =============================================================================
    print("\n--- 評価結果 ---")
    preds = final_model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, preds):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, preds))
    print("\nClassification Report:")
    print(classification_report(y_test, preds))

    print("\n★ 特徴量重要度")
    imp = pd.DataFrame({'Feature': feature_names, 'Importance': final_model.feature_importances_}).sort_values('Importance', ascending=False)
    print(imp.to_string(index=False))


--- ステップ1: 環境設定とデータの準備 ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
既に解凍済みデータがあるため、それを使用します。
解析対象フォルダ: /content/temp_xml_data/

★ XML解析開始...
発見ファイル数: 80000
目標: 数牌正例 80000件, 字牌正例 20000件
★メモリ保護のため、小分けにして保存(Batch processing)を行います。
 [Batch] 2000ファイル処理完了 -> batch_0.pkl 保存 (現在: 数牌正9811/字牌正788)
 [Batch] 4000ファイル処理完了 -> batch_1.pkl 保存 (現在: 数牌正19473/字牌正1613)
 [Batch] 6000ファイル処理完了 -> batch_2.pkl 保存 (現在: 数牌正29190/字牌正2418)
 [Batch] 8000ファイル処理完了 -> batch_3.pkl 保存 (現在: 数牌正38978/字牌正3166)
 [Batch] 10000ファイル処理完了 -> batch_4.pkl 保存 (現在: 数牌正48756/字牌正4003)
 [Batch] 12000ファイル処理完了 -> batch_5.pkl 保存 (現在: 数牌正58519/字牌正4811)
 [Batch] 14000ファイル処理完了 -> batch_6.pkl 保存 (現在: 数牌正68267/字牌正5634)
 [Batch] 16000ファイル処理完了 -> batch_7.pkl 保存 (現在: 数牌正78001/字牌正6429)
 [Batch] 18000ファイル処理完了 -> batch_8.pkl 保存 (現在: 数牌正87887/字牌正7220)
 [Batch] 20000ファイル処理完了 -> batch_9.pkl 保存 (現在: 数牌正97671/字牌正8020)
 [Batch] 22000ファイル処理完了 -> batch_10.pkl 保存 (現在: 数牌

[I 2026-01-15 04:20:35,999] A new study created in memory with name: no-name-d4463478-f155-4e99-b2b1-200098d88cfb



★最終データセット作成完了: 200000件
y
0    100000
1    100000
Name: count, dtype: int64
データセット保存完了: /content/drive/MyDrive/mahjong_balanced_dataset_200k.pkl

--- ステップ3: Optunaによるハイパーパラメータ探索 ---


[I 2026-01-15 04:20:42,997] Trial 0 finished with value: 0.7866 and parameters: {'learning_rate': 0.058619631653518495, 'num_leaves': 21, 'max_depth': 5, 'min_child_samples': 42, 'subsample': 0.8237052602311032, 'colsample_bytree': 0.9587347855859911}. Best is trial 0 with value: 0.7866.
[I 2026-01-15 04:20:51,818] Trial 1 finished with value: 0.778725 and parameters: {'learning_rate': 0.2775440966176624, 'num_leaves': 145, 'max_depth': 11, 'min_child_samples': 78, 'subsample': 0.7004963572751877, 'colsample_bytree': 0.6905591354046439}. Best is trial 0 with value: 0.7866.
[I 2026-01-15 04:20:59,846] Trial 2 finished with value: 0.783725 and parameters: {'learning_rate': 0.20454861024915846, 'num_leaves': 89, 'max_depth': 15, 'min_child_samples': 82, 'subsample': 0.9502193067160039, 'colsample_bytree': 0.6633071128678614}. Best is trial 0 with value: 0.7866.
[I 2026-01-15 04:21:08,543] Trial 3 finished with value: 0.787075 and parameters: {'learning_rate': 0.030223383778425407, 'num_le

★ Best Params: {'learning_rate': 0.030223383778425407, 'num_leaves': 68, 'max_depth': 9, 'min_child_samples': 64, 'subsample': 0.5500331359017955, 'colsample_bytree': 0.9045797077481199}

--- 最終学習 ---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[260]	valid_0's auc: 0.870648	valid_0's binary_logloss: 0.443141

--- 評価結果 ---
Accuracy: 0.7871

Confusion Matrix:
[[15081  4813]
 [ 3704 16402]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.76      0.78     19894
           1       0.77      0.82      0.79     20106

    accuracy                           0.79     40000
   macro avg       0.79      0.79      0.79     40000
weighted avg       0.79      0.79      0.79     40000


★ 特徴量重要度
  Feature  Importance
     5.巡目        3223
     2.数字        3101
    3.見え枚        2068
 12.自手牌価値        1885
  10.カベ強度        1782
8.対リーチ危険度        1364
     1.種類        1280
  7.他家立直数         9

数牌20万、字牌2万のアンダーサンプリング

In [None]:
!pip install optuna

import os
import glob
import zipfile
import shutil
import numpy as np
import pandas as pd
import lightgbm as lgb
import xml.etree.ElementTree as ET
import optuna
import re
import gc
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# =============================================================================
# 1. 環境設定・データの準備
# =============================================================================
print("\n--- ステップ1: 環境設定とデータの準備 ---")
try:
    drive.mount('/content/drive')
except Exception as e:
    print(f"Google Driveのマウントに失敗しました: {e}")

# ZIPファイルのパス
ZIP_PATH = '/content/drive/MyDrive/dataset_2022-2024_houou_80k.zip'

# 作業用ディレクトリ
LOCAL_WORK_DIR = '/content/temp_xml_data/'
BATCH_SAVE_DIR = '/content/temp_batches/'

# ★保存名変更: 440k (数牌20万ペア + 字牌2万ペア)
DATASET_PKL = '/content/drive/MyDrive/mahjong_balanced_dataset_440k.pkl'

# バッチフォルダの初期化
if os.path.exists(BATCH_SAVE_DIR):
    shutil.rmtree(BATCH_SAVE_DIR)
os.makedirs(BATCH_SAVE_DIR, exist_ok=True)

# ZIP解凍
if not os.path.exists(LOCAL_WORK_DIR):
    print(f"★ZIPファイル: {ZIP_PATH}")
    if os.path.exists(ZIP_PATH):
        print("ローカル環境に高速解凍中... (数分お待ちください)")
        try:
            with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
                zip_ref.extractall(LOCAL_WORK_DIR)
            print("解凍完了！")
        except Exception as e:
            print(f"解凍エラー: {e}")
            raise
    else:
        raise FileNotFoundError("ZIP file not found")
else:
    print("既に解凍済みデータがあるため、それを使用します。")

LOG_DIR = LOCAL_WORK_DIR

# =============================================================================
# 2. ヘルパー関数 (ロジック)
# =============================================================================
def _parse_tile_type(tile_type_code):
    if 0 <= tile_type_code <= 8:      return 0, tile_type_code + 1
    elif 9 <= tile_type_code <= 17: return 1, (tile_type_code - 9) + 1
    elif 18 <= tile_type_code <= 26: return 2, (tile_type_code - 18) + 1
    elif 27 <= tile_type_code <= 33: return 3, (tile_type_code - 27) + 1
    else: return -1, -1

def count_dora(tile_code, dora_indicators):
    dora_count = 0
    t_type, t_num = _parse_tile_type(tile_code)
    for ind in dora_indicators:
        ind_type, ind_num = _parse_tile_type(ind)
        if t_type == ind_type:
            if t_type == 3:
                target_num = (ind_num % 4) + 1
                if ind_num >= 5: target_num = 5 + ((ind_num - 5 + 1) % 3)
                if t_num == target_num: dora_count += 1
            else:
                target_num = 1 if ind_num == 9 else ind_num + 1
                if t_num == target_num: dora_count += 1
    return dora_count

def parse_called_tile(m_int):
    extracted = (m_int >> 8) & 0xFF
    return extracted * 4

def check_suji_fair(target_type, target_num, opponent_discards):
    if target_type == 3: return 0
    opp_discards_nums = []
    for d in opponent_discards:
        dt, dn = _parse_tile_type(d)
        if dt == target_type:
            opp_discards_nums.append(dn)

    suji_map = {
        1: [4], 2: [5], 3: [6],
        4: [1, 7], 5: [2, 8], 6: [3, 9],
        7: [4], 8: [5], 9: [6]
    }
    needed = suji_map.get(target_num, [])
    if not needed: return 0
    return 1 if all(n in opp_discards_nums for n in needed) else 0

def get_kabe_fair(target_type, target_num, all_public_tiles):
    if target_type == 3: return 0
    check_nums = []
    if target_num == 1: check_nums = [2]
    elif target_num == 9: check_nums = [8]
    elif target_num == 2: check_nums = [3]
    elif target_num == 8: check_nums = [7]
    elif target_num == 3: check_nums = [4]
    elif target_num == 7: check_nums = [6]
    elif target_num == 4: check_nums = [3, 5]
    elif target_num == 5: check_nums = [4, 6]
    elif target_num == 6: check_nums = [5, 7]

    max_visible = 0
    for n in check_nums:
        check_id = (target_type * 9) + (n - 1)
        count = all_public_tiles.count(check_id)
        if count > max_visible: max_visible = count
    return max_visible

def get_my_yakuman_potential(my_hand_tiles):
    tiles = [_parse_tile_type(t // 4) for t in my_hand_tiles]
    yaochu_ids = set()
    sangen_count = 0
    jihai_count = 0
    routou_count = 0
    ryuuiiso_count = 0

    for t_type, t_num in tiles:
        if t_type == 3 or t_num == 1 or t_num == 9:
            yaochu_ids.add(t_type * 10 + t_num)
        if t_type == 3 and t_num >= 5:
            sangen_count += 1
        if t_type == 3:
            jihai_count += 1
        if t_type < 3 and (t_num == 1 or t_num == 9):
            routou_count += 1
        if (t_type == 2 and t_num in [2,3,4,6,8]) or (t_type == 3 and t_num == 6):
            ryuuiiso_count += 1

    scores = [
        len(yaochu_ids), sangen_count * 2.5, jihai_count, routou_count, ryuuiiso_count
    ]
    return max(scores)

def get_meld_threat_level(opp_idx, game_state, dora_indicators):
    melds = game_state['open_hands'][opp_idx]
    if not melds: return 0

    threat = 0
    colors = {0:0, 1:0, 2:0, 3:0}
    dora_pon_count = 0

    for meld in melds:
        if not meld: continue
        first = meld[0]
        tt, tn = _parse_tile_type(first)

        if tt != -1: colors[tt] += 1
        d_val = count_dora(first // 4, dora_indicators)
        if d_val > 0:
            dora_pon_count += d_val
        if tt == 3 and tn >= 5: threat += 15

    threat += (dora_pon_count * 20)

    for c in range(3):
        if colors[c] + colors[3] >= 3 and colors[c] >= 2:
            threat += 20
            if colors[3] == 0: threat += 30

    sangen = 0
    for meld in melds:
        first = meld[0] // 4
        tt, tn = _parse_tile_type(first)
        if tt == 3 and tn >= 5: sangen += 1
    if sangen >= 2: threat += 80

    return threat

def calculate_features_fair(game_state, player_index, physical_code, is_tsumogiri, oya_player):
    tile_type_code = physical_code // 4
    t_type, t_num = _parse_tile_type(tile_type_code)
    features = []

    public_tiles = []
    public_tiles.extend(game_state['hands'][player_index])
    for i in range(4):
        public_tiles.extend(game_state['discards'][i])
        for meld in game_state['open_hands'][i]:
            public_tiles.extend(meld)

    features.append(t_type)
    features.append(t_num)
    features.append(public_tiles.count(tile_type_code))

    dora_val = 0
    dora_indicators = game_state['dora_indicators']
    dora_val += count_dora(tile_type_code, dora_indicators)
    if physical_code in [16, 52, 88]: dora_val += 1
    features.append(dora_val)

    features.append(len(game_state['history']) // 4)
    features.append(1 if is_tsumogiri else 0)

    other_reach_indices = [i for i in range(4) if i != player_index and game_state['reach'][i]]
    features.append(len(other_reach_indices))

    risk_score = 0
    if len(other_reach_indices) > 0:
        for r_idx in other_reach_indices:
            if tile_type_code in game_state['discards'][r_idx]: continue
            this_risk = 0
            if check_suji_fair(t_type, t_num, game_state['discards'][r_idx]) == 0:
                this_risk += 50
                if 4 <= t_num <= 6: this_risk += 20
                kabe = get_kabe_fair(t_type, t_num, public_tiles)
                if kabe < 1: this_risk += 30
            else:
                this_risk += 15
            if this_risk > risk_score: risk_score = this_risk
    features.append(risk_score)

    live_degree = 0
    visible_n = public_tiles.count(tile_type_code)
    if visible_n == 0:
        live_degree = 2
        if t_type == 3: live_degree = 3
    elif visible_n == 1:
        live_degree = 1
    features.append(live_degree)

    features.append(get_kabe_fair(t_type, t_num, public_tiles))

    max_meld_threat = 0
    for i in range(4):
        if i != player_index:
            th = get_meld_threat_level(i, game_state, dora_indicators)
            if th > max_meld_threat: max_meld_threat = th
    features.append(max_meld_threat)

    my_yakuman = get_my_yakuman_potential(game_state['hands'][player_index])
    features.append(my_yakuman)

    return features

feature_names = [
    '1.種類', '2.数字', '3.見え枚', '4.ドラ数',
    '5.巡目', '6.ツモ切', '7.他家立直数',
    '8.対リーチ危険度', '9.生牌度', '10.カベ強度',
    '11.敵副露脅威度', '12.自手牌価値'
]

# =============================================================================
# 4. XML解析 (バッチ保存)
# =============================================================================
def get_clean_tag(tag_str):
    if '}' in tag_str:
        return tag_str.split('}', 1)[1]
    return tag_str

print("\n★ XML解析開始...")
xml_files = glob.glob(os.path.join(LOG_DIR, '**/*.xml'), recursive=True)
print(f"発見ファイル数: {len(xml_files)}")

if len(xml_files) == 0:
    print("エラー: XMLファイルが見つかりません。")
else:
    current_batch_data = []
    BATCH_SIZE_FILES = 2000
    batch_count = 0

    draw_ptn = re.compile(r'^[TUVW]\d+$')
    discard_ptn = re.compile(r'^[DEFG]\d+$')

    count_suuhai_pos = 0
    count_jihai_pos = 0

    # ★目標値を変更
    TARGET_SUUHAI = 200000
    TARGET_JIHAI = 20000

    print(f"目標: 数牌正例 {TARGET_SUUHAI}件, 字牌正例 {TARGET_JIHAI}件")

    for i_file, file_path in enumerate(xml_files):

        # バッチ保存
        if i_file > 0 and i_file % BATCH_SIZE_FILES == 0:
            if current_batch_data:
                batch_df = pd.DataFrame(current_batch_data)
                save_path = os.path.join(BATCH_SAVE_DIR, f'batch_{batch_count}.pkl')
                batch_df.to_pickle(save_path)
                batch_count += 1
                del batch_df, current_batch_data
                current_batch_data = []
                gc.collect()
                print(f" [Batch] {i_file}完了 -> batch_{batch_count-1}.pkl (正例: 数牌{count_suuhai_pos}/字牌{count_jihai_pos})")

            if count_suuhai_pos >= TARGET_SUUHAI and count_jihai_pos >= TARGET_JIHAI:
                print("★目標達成。解析終了。")
                break

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                try:
                    tree = ET.parse(f)
                    root = tree.getroot()
                except:
                    continue

            game_state = {
                'hands': [[],[],[],[]], 'discards': [[],[],[],[]], 'open_hands': [[],[],[],[]],
                'reach': [False]*4, 'history': [], 'dora_indicators': []
            }
            last_draw = [-1]*4
            last_discard_event = None
            oya_player = 0

            for tag in root:
                tag_name = get_clean_tag(tag.tag)

                if tag_name == 'INIT':
                    if last_discard_event:
                        current_batch_data.append(last_discard_event)
                        if last_discard_event['y'] == 1:
                            if last_discard_event['X'][0] == 3: count_jihai_pos += 1
                            else: count_suuhai_pos += 1

                    oya_player = int(tag.attrib.get('oya', 0))
                    seed = tag.attrib.get('seed', '0,0,0,0,0,0').split(',')
                    dora_ind = int(seed[5]) // 4 if len(seed) > 5 else 0

                    game_state = {
                        'hands': [[],[],[],[]], 'discards': [[],[],[],[]], 'open_hands': [[],[],[],[]],
                        'reach': [False]*4, 'history': [], 'dora_indicators': [dora_ind]
                    }
                    last_discard_event = None
                    for p in range(4):
                        h = tag.attrib.get(f'hai{p}')
                        if h: game_state['hands'][p] = [int(s)//4 for s in h.split(',')]

                if draw_ptn.match(tag_name):
                    p = {'T':0,'U':1,'V':2,'W':3}[tag_name[0]]
                    phys = int(tag_name[1:])
                    last_draw[p] = phys
                    game_state['hands'][p].append(phys // 4)

                if discard_ptn.match(tag_name):
                    p = {'D':0,'E':1,'F':2,'G':3}[tag_name[0]]
                    phys = int(tag_name[1:])

                    if last_discard_event:
                        current_batch_data.append(last_discard_event)
                        if last_discard_event['y'] == 1:
                            if last_discard_event['X'][0] == 3: count_jihai_pos += 1
                            else: count_suuhai_pos += 1

                    is_tsumogiri = (phys == last_draw[p])
                    feats = calculate_features_fair(game_state, p, phys, is_tsumogiri, oya_player)
                    last_discard_event = {'X': feats, 'y': 0}

                    t_type = phys // 4
                    if t_type in game_state['hands'][p]: game_state['hands'][p].remove(t_type)
                    game_state['discards'][p].append(t_type)
                    game_state['history'].append((p, t_type))

                if tag_name == 'N':
                    who = int(tag.attrib.get('who'))
                    m = int(tag.attrib.get('m'))
                    called_tile_id = parse_called_tile(m)
                    c_type = called_tile_id // 4
                    removed = 0
                    temp = game_state['hands'][who][:]
                    for h in temp:
                        if h == c_type and removed < 2:
                            game_state['hands'][who].remove(h)
                            removed += 1
                    game_state['open_hands'][who].append([called_tile_id]*3)

                    if last_discard_event:
                        current_batch_data.append(last_discard_event)
                        if last_discard_event['y'] == 1:
                            if last_discard_event['X'][0] == 3: count_jihai_pos += 1
                            else: count_suuhai_pos += 1
                        last_discard_event = None

                if tag_name == 'REACH' and tag.attrib.get('step') == '1':
                    who_reach = int(tag.attrib.get('who'))
                    game_state['reach'][who_reach] = True

                if tag_name == 'DORA':
                    hai = int(tag.attrib.get('hai'))
                    game_state['dora_indicators'].append(hai // 4)

                if tag_name == 'AGARI':
                    loser = tag.attrib.get('fromWho')
                    if loser and int(tag.attrib.get('who')) != int(loser):
                        if last_discard_event:
                            last_discard_event['y'] = 1
                            current_batch_data.append(last_discard_event)
                            if last_discard_event['X'][0] == 3: count_jihai_pos += 1
                            else: count_suuhai_pos += 1
                            last_discard_event = None
        except:
            continue

    if current_batch_data:
        batch_df = pd.DataFrame(current_batch_data)
        save_path = os.path.join(BATCH_SAVE_DIR, f'batch_{batch_count}.pkl')
        batch_df.to_pickle(save_path)
        print(f" [Batch] 最終保存完了")
        del batch_df, current_batch_data
        gc.collect()

# =============================================================================
# 5. ★アンダーサンプリング (数牌20万ペア+字牌2万ペア)
# =============================================================================
print("\n--- ステップ2: バッチ統合とサンプリング ---")
batch_files = glob.glob(os.path.join(BATCH_SAVE_DIR, '*.pkl'))

if len(batch_files) == 0:
    print("エラー: バッチファイルがありません。")
else:
    all_suuhai_pos = []
    all_suuhai_neg = []
    all_jihai_pos = []
    all_jihai_neg = []

    # ★サンプリング目標
    TARGET_SUUHAI_EACH = 200000
    TARGET_JIHAI_EACH = 20000

    print(f"バッチ処理中...")
    for b_file in batch_files:
        try:
            df_b = pd.read_pickle(b_file)
            if len(df_b) == 0: continue

            X_temp = pd.DataFrame(df_b['X'].tolist(), columns=feature_names)
            df_full = pd.concat([df_b['y'], X_temp], axis=1)
            df_full['is_jihai'] = df_full['1.種類'] == 3

            all_suuhai_pos.append(df_full[(df_full['is_jihai'] == False) & (df_full['y'] == 1)])
            all_jihai_pos.append(df_full[(df_full['is_jihai'] == True) & (df_full['y'] == 1)])

            # 負例は多すぎるので間引く(20%抽出)
            s_n = df_full[(df_full['is_jihai'] == False) & (df_full['y'] == 0)]
            j_n = df_full[(df_full['is_jihai'] == True) & (df_full['y'] == 0)]
            if len(s_n) > 0: all_suuhai_neg.append(s_n.sample(frac=0.2))
            if len(j_n) > 0: all_jihai_neg.append(j_n.sample(frac=0.2))

            del df_b, df_full, X_temp
            gc.collect()
        except:
            pass

    print("統合中...")
    df_suuhai_pos = pd.concat(all_suuhai_pos)
    df_suuhai_neg = pd.concat(all_suuhai_neg)
    df_jihai_pos = pd.concat(all_jihai_pos)
    df_jihai_neg = pd.concat(all_jihai_neg)

    print(f"集計(正例): 数牌{len(df_suuhai_pos)}, 字牌{len(df_jihai_pos)}")

    n_sp = min(len(df_suuhai_pos), TARGET_SUUHAI_EACH)
    n_sn = TARGET_SUUHAI_EACH
    if len(df_suuhai_neg) < n_sn: n_sn = len(df_suuhai_neg)

    n_jp = min(len(df_jihai_pos), TARGET_JIHAI_EACH)
    n_jn = TARGET_JIHAI_EACH
    if len(df_jihai_neg) < n_jn: n_jn = len(df_jihai_neg)

    final_df = pd.concat([
        df_suuhai_pos.sample(n=n_sp, random_state=42),
        df_suuhai_neg.sample(n=n_sn, random_state=42),
        df_jihai_pos.sample(n=n_jp, random_state=42),
        df_jihai_neg.sample(n=n_jn, random_state=42)
    ]).sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"\n★最終データセット: {len(final_df)}件")
    print(final_df['y'].value_counts())
    final_df.to_pickle(DATASET_PKL)

    # =============================================================================
    # 6. Optuna & 学習
    # =============================================================================
    X = final_df[feature_names]
    y = final_df['y']
    cat_features = ['1.種類']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("\n--- ステップ3: Optunaによる探索 ---")
    def objective(trial):
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'n_estimators': 300,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
            'max_depth': trial.suggest_int('max_depth', 5, 15),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        }
        model = lgb.LGBMClassifier(**param)
        model.fit(X_train, y_train, categorical_feature=cat_features)
        preds = model.predict(X_test)
        return accuracy_score(y_test, preds)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=15)

    print(f"★ Best Params: {study.best_params}")

    print("\n--- 最終学習 ---")
    best_params = study.best_params
    final_model = lgb.LGBMClassifier(
        objective='binary', metric='binary_logloss', n_estimators=5000,
        learning_rate=best_params.get('learning_rate', 0.05),
        num_leaves=best_params.get('num_leaves', 31),
        max_depth=best_params.get('max_depth', -1),
        min_child_samples=best_params.get('min_child_samples', 20),
        subsample=best_params.get('subsample', 0.8),
        colsample_bytree=best_params.get('colsample_bytree', 0.8)
    )
    final_model.fit(
        X_train, y_train, eval_set=[(X_test, y_test)],
        eval_metric='auc', callbacks=[lgb.early_stopping(100)],
        categorical_feature=cat_features
    )

    preds = final_model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, preds):.4f}")
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, preds))
    print("\nClassification Report:\n", classification_report(y_test, preds))

    imp = pd.DataFrame({'Feature': feature_names, 'Importance': final_model.feature_importances_}).sort_values('Importance', ascending=False)
    print(imp.to_string(index=False))


--- ステップ1: 環境設定とデータの準備 ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
既に解凍済みデータがあるため、それを使用します。

★ XML解析開始...
発見ファイル数: 80000
目標: 数牌正例 200000件, 字牌正例 20000件
 [Batch] 2000完了 -> batch_0.pkl (正例: 数牌9811/字牌788)
 [Batch] 4000完了 -> batch_1.pkl (正例: 数牌19473/字牌1613)
 [Batch] 6000完了 -> batch_2.pkl (正例: 数牌29190/字牌2418)
 [Batch] 8000完了 -> batch_3.pkl (正例: 数牌38978/字牌3166)
 [Batch] 10000完了 -> batch_4.pkl (正例: 数牌48756/字牌4003)
 [Batch] 12000完了 -> batch_5.pkl (正例: 数牌58519/字牌4811)
 [Batch] 14000完了 -> batch_6.pkl (正例: 数牌68267/字牌5634)
 [Batch] 16000完了 -> batch_7.pkl (正例: 数牌78001/字牌6429)
 [Batch] 18000完了 -> batch_8.pkl (正例: 数牌87887/字牌7220)
 [Batch] 20000完了 -> batch_9.pkl (正例: 数牌97671/字牌8020)
 [Batch] 22000完了 -> batch_10.pkl (正例: 数牌107306/字牌8852)
 [Batch] 24000完了 -> batch_11.pkl (正例: 数牌117182/字牌9654)
 [Batch] 26000完了 -> batch_12.pkl (正例: 数牌126987/字牌10423)
 [Batch] 28000完了 -> batch_13.pkl (正例: 数牌136618/字牌11204)
 [Batch] 3000

[I 2026-01-15 04:54:47,555] A new study created in memory with name: no-name-c1534b43-86d3-4395-954b-584533801982



★最終データセット: 440000件
y
1    220000
0    220000
Name: count, dtype: int64

--- ステップ3: Optunaによる探索 ---


[I 2026-01-15 04:55:04,887] Trial 0 finished with value: 0.7729772727272727 and parameters: {'learning_rate': 0.03134259367768324, 'num_leaves': 39, 'max_depth': 11, 'min_child_samples': 48, 'subsample': 0.8857286475539741, 'colsample_bytree': 0.8008015882233956}. Best is trial 0 with value: 0.7729772727272727.
[I 2026-01-15 04:55:18,496] Trial 1 finished with value: 0.7711136363636364 and parameters: {'learning_rate': 0.22063639265120677, 'num_leaves': 74, 'max_depth': 9, 'min_child_samples': 16, 'subsample': 0.6660047261235769, 'colsample_bytree': 0.9905036787184486}. Best is trial 0 with value: 0.7729772727272727.
[I 2026-01-15 04:55:34,839] Trial 2 finished with value: 0.7705909090909091 and parameters: {'learning_rate': 0.1648949668878558, 'num_leaves': 149, 'max_depth': 12, 'min_child_samples': 30, 'subsample': 0.8933175423918653, 'colsample_bytree': 0.8143864709711878}. Best is trial 0 with value: 0.7729772727272727.
[I 2026-01-15 04:55:49,152] Trial 3 finished with value: 0.770

★ Best Params: {'learning_rate': 0.0728829076732116, 'num_leaves': 47, 'max_depth': 7, 'min_child_samples': 50, 'subsample': 0.90660281854414, 'colsample_bytree': 0.870893248906875}

--- 最終学習 ---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[251]	valid_0's auc: 0.854611	valid_0's binary_logloss: 0.467906
Accuracy: 0.7729

Confusion Matrix:
 [[32702 11513]
 [ 8471 35314]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.74      0.77     44215
           1       0.75      0.81      0.78     43785

    accuracy                           0.77     88000
   macro avg       0.77      0.77      0.77     88000
weighted avg       0.77      0.77      0.77     88000

  Feature  Importance
     5.巡目        1906
     2.数字        1805
    3.見え枚        1413
 12.自手牌価値        1275
  10.カベ強度        1144
8.対リーチ危険度        1044
     1.種類         851
    6.ツモ切         666
  7.他家立直数         652
  