<a href="https://colab.research.google.com/github/yuya0426/Kaggle_Titanic/blob/main/Titanic/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

APIキーの取得

In [None]:
import os
import pandas as pd

# 1. Replace this with your own credential string.
# NOTE(review): no credential assignment is present in this cell; presumably the
# Kaggle CLI calls below rely on credentials configured elsewhere — confirm.


# 2. Download the competition archive and unzip it (-o overwrites existing files)
!kaggle competitions download -c titanic
!unzip -o titanic.zip

# 3. Load the extracted CSV files into DataFrames
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# # 4. Show the contents of the data (disabled)
# print("【データの中身（先頭5行）】")
# display(train_df.head())

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 128MB/s]
Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


年齢の欠損を中央値で補完する

In [None]:
# Columns used as model inputs for this first baseline.
features = ['Pclass', 'Age']

# Impute missing ages in both frames with the median age of the training data.
age_median = train_df['Age'].median()
for frame in (train_df, test_df):
    frame['Age'] = frame['Age'].fillna(age_median)

# Training feature matrix (X) and target (y), plus the matching test features.
X_train, y_train = train_df[features], train_df['Survived']
X_test = test_df[features]

# Sanity check: count the missing Age values left in the training features.
print("【処理後のAgeの欠損値の数】")
print("Trainデータ:", X_train['Age'].isna().sum())

# Preview the data exactly as it will be handed to the model.
print("\n【AIに学習させるデータ（X_train）の先頭5行】")
display(X_train.head())

【処理後のAgeの欠損値の数】
Trainデータ: 0

【AIに学習させるデータ（X_train）の先頭5行】


Unnamed: 0,Pclass,Age
0,3,22.0
1,1,38.0
2,3,26.0
3,1,35.0
4,3,35.0


KFoldとLightGBMを実行

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

# 1. 5-fold cross-validation. shuffle=True shuffles the rows before splitting
# and random_state pins the split so every run uses the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Baseline LightGBM settings; they are the same for every fold.
params = {
    'objective': 'binary',     # binary classification: 0 (died) / 1 (survived)
    'metric': 'binary_error',  # evaluation metric: error rate
    'verbosity': -1            # silence LightGBM's own messages
}

# Validation accuracy of each fold.
accuracies = []

# 2. Train on 4/5 of the data and validate on the remaining 1/5, five times.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    fold_no = fold + 1
    print(f"--- Fold {fold_no} ---")

    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Wrap both partitions in LightGBM's own dataset format.
    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # 3. Fit the model with a fixed budget of 100 boosting rounds.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=100,
        valid_sets=[lgb_train, lgb_eval],
    )

    # 4. LightGBM returns survival probabilities (0.0-1.0); values of 0.5 or
    # more are labelled as survived (1), the rest as died (0).
    y_pred_prob = model.predict(X_val)
    y_pred = (y_pred_prob >= 0.5).astype(int)

    # 5. Compare predictions with the true labels of the validation fold.
    acc = accuracy_score(y_val, y_pred)
    print(f"Fold {fold_no} の正解率: {acc:.4f}\n")
    accuracies.append(acc)

# 6. Report the mean accuracy over the five folds.
print("==================================")
print(f"★ 5分割交差検証の平均正解率: {np.mean(accuracies):.4f}")

--- Fold 1 ---
Fold 1 の正解率: 0.7207

--- Fold 2 ---
Fold 2 の正解率: 0.6685

--- Fold 3 ---
Fold 3 の正解率: 0.6573

--- Fold 4 ---
Fold 4 の正解率: 0.6685

--- Fold 5 ---
Fold 5 の正解率: 0.6517

★ 5分割交差検証の平均正解率: 0.6733


特徴量に「性別」を追加 <br>
「性別」の属性を数値に変換

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

# 1. Encode Sex as a number (male -> 0, female -> 1).
# The lookup falls back to the value itself, so re-running this cell on an
# already-encoded column leaves it unchanged. A plain .map(dict) would turn
# every value that is not a key of the dict (i.e. the 0/1 codes) into NaN.
sex_codes = {'male': 0, 'female': 1}
train_df['Sex'] = train_df['Sex'].map(lambda value: sex_codes.get(value, value))
test_df['Sex'] = test_df['Sex'].map(lambda value: sex_codes.get(value, value))

# 2. Add 'Sex' to the feature list.
features = ['Pclass', 'Age', 'Sex']

# Rebuild the model inputs with the new feature list; the target is unchanged.
X_train = train_df[features]
y_train = train_df['Survived']
# Keep the test matrix in sync with the feature list used for training.
X_test = test_df[features]

# 3. Same 5-fold split as the previous experiment.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

# LightGBM settings, identical for every fold.
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'verbosity': -1
}

print("新しい特徴量 'Sex' を追加して学習開始...\n")

# 4. Train / validate loop (same procedure as the previous experiment).
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):

    # Split into the training and validation partitions of this fold.
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Wrap both partitions in LightGBM's dataset format.
    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # Train for a fixed 100 boosting rounds.
    model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval], num_boost_round=100)

    # Threshold the predicted probabilities at 0.5 and score the fold.
    y_pred_prob = model.predict(X_val)
    y_pred = np.where(y_pred_prob >= 0.5, 1, 0)
    acc = accuracy_score(y_val, y_pred)

    print(f"Fold {fold + 1} の正解率: {acc:.4f}")
    accuracies.append(acc)

# 5. Report the mean accuracy over the five folds.
print("==================================")
print(f"★ 5分割交差検証の平均正解率: {np.mean(accuracies):.4f}")

新しい特徴量 'Sex' を追加して学習開始...

Fold 1 の正解率: 0.8380
Fold 2 の正解率: 0.7865
Fold 3 の正解率: 0.8483
Fold 4 の正解率: 0.7697
Fold 5 の正解率: 0.8090
★ 5分割交差検証の平均正解率: 0.8103


Optunaの導入

In [None]:
# 1. Optunaのインストールとインポート
!pip install optuna
import optuna
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

# 2. 目的関数（Optunaに「何をどう調整して、何を良くしたいか」を教えるルールブック）
def objective(trial):
    """Score one Optuna trial by its mean 5-fold validation accuracy.

    Samples learning_rate, max_depth and num_leaves from ``trial``, trains a
    LightGBM binary classifier on each fold of the notebook-level ``X_train``
    / ``y_train``, and returns the mean accuracy over the validation folds.
    """
    # Hyperparameters Optuna is allowed to tune, sampled in a fixed order.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 9)
    num_leaves = trial.suggest_int('num_leaves', 10, 50)

    params = {
        'objective': 'binary',
        'metric': 'binary_error',
        'verbosity': -1,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'num_leaves': num_leaves,
    }

    # Measure this configuration with the same pinned 5-fold split every time.
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fit_rows, holdout_rows in splitter.split(X_train):
        holdout_X = X_train.iloc[holdout_rows]
        holdout_y = y_train.iloc[holdout_rows]

        fit_set = lgb.Dataset(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
        holdout_set = lgb.Dataset(holdout_X, holdout_y, reference=fit_set)

        # Train LightGBM with the parameters chosen for this trial.
        booster = lgb.train(
            params,
            fit_set,
            num_boost_round=100,
            valid_sets=[fit_set, holdout_set],
        )

        # Probabilities of 0.5 or more count as "survived".
        predicted = (booster.predict(holdout_X) >= 0.5).astype(int)
        fold_scores.append(accuracy_score(holdout_y, predicted))

    # The mean accuracy is the value Optuna optimises.
    return np.mean(fold_scores)

print("Optunaによる最強のセッティング探索を開始します...\n")

# 3. Maximise the cross-validated accuracy over 20 trials.
# The sampler is seeded so the sequence of suggested parameters, and therefore
# the reported best parameters, are the same on every run; the default sampler
# is unseeded.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# 4. Report the best score and the parameters that produced it.
print("==================================")
print("★ 最適化完了！")
print(f"一番良かった正解率: {study.best_value:.4f}")
print("その時の最強パラメータ:", study.best_params)

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.7.0


[I 2026-02-17 12:17:56,740] A new study created in memory with name: no-name-a05e21e1-6279-4a07-9188-e4adde7cf9fd
[I 2026-02-17 12:17:56,882] Trial 0 finished with value: 0.7957378695624883 and parameters: {'learning_rate': 0.062070531382035644, 'max_depth': 4, 'num_leaves': 42}. Best is trial 0 with value: 0.7957378695624883.


Optunaによる最強のセッティング探索を開始します...



[I 2026-02-17 12:17:57,076] Trial 1 finished with value: 0.8002134203753688 and parameters: {'learning_rate': 0.06698497906796695, 'max_depth': 8, 'num_leaves': 13}. Best is trial 1 with value: 0.8002134203753688.
[I 2026-02-17 12:17:57,251] Trial 2 finished with value: 0.8125478626577113 and parameters: {'learning_rate': 0.09711414033423715, 'max_depth': 6, 'num_leaves': 21}. Best is trial 2 with value: 0.8125478626577113.
[I 2026-02-17 12:17:57,381] Trial 3 finished with value: 0.7990961019396146 and parameters: {'learning_rate': 0.08929006815691565, 'max_depth': 4, 'num_leaves': 10}. Best is trial 2 with value: 0.8125478626577113.
[I 2026-02-17 12:17:57,552] Trial 4 finished with value: 0.7991023790094783 and parameters: {'learning_rate': 0.055068969015145276, 'max_depth': 6, 'num_leaves': 46}. Best is trial 2 with value: 0.8125478626577113.
[I 2026-02-17 12:17:57,698] Trial 5 finished with value: 0.7878789780930262 and parameters: {'learning_rate': 0.024632648910347776, 'max_depth'

★ 最適化完了！
一番良かった正解率: 0.8126
その時の最強パラメータ: {'learning_rate': 0.09531458064696166, 'max_depth': 8, 'num_leaves': 16}


提出用ファイルの作成

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb

# Rebuild the test feature matrix so it carries the same three features
# ('Sex' included) that the training matrix uses.
features = ['Pclass', 'Age', 'Sex']
X_test = test_df[features]

# 1. Start from the best parameters found by Optuna and append the fixed
# LightGBM settings after them.
best_params = {
    **study.best_params,
    'objective': 'binary',
    'metric': 'binary_error',
    'verbosity': -1,
}

print("【最終モデルの学習を開始します】")
print("使用するパラメータ:", best_params)

# 2. Fit the final model on the whole training set.
lgb_train_final = lgb.Dataset(X_train, y_train)
final_model = lgb.train(best_params, lgb_train_final, num_boost_round=100)

# 3. Predict survival probabilities for the test set and threshold at 0.5.
y_pred_prob_test = final_model.predict(X_test)
y_pred_test = (y_pred_prob_test >= 0.5).astype(int)

# 4. Assemble the two columns written to the submission file:
# the passenger id and the predicted label.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred_test,
})

# 5. Write the CSV; index=False keeps the row index out of the file.
submission.to_csv('submission.csv', index=False)

print("\n==================================")
print("★ 提出用ファイル 'submission.csv' が無事に作成されました！")
display(submission.head())

【最終モデルの学習を開始します】
使用するパラメータ: {'learning_rate': 0.09531458064696166, 'max_depth': 8, 'num_leaves': 16, 'objective': 'binary', 'metric': 'binary_error', 'verbosity': -1}

★ 提出用ファイル 'submission.csv' が無事に作成されました！


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


Kaggleに提出

In [None]:
# Submit submission.csv to the Titanic competition directly through the Kaggle API CLI
!kaggle competitions submit -c titanic -f submission.csv -m "LightGBM + Optuna + Sex feature"

100% 2.77k/2.77k [00:00<00:00, 3.97kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Reload the raw CSVs so this experiment starts from untouched data.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# 1. Preprocessing: encode Sex numerically (male -> 0, female -> 1).
sex_codes = {'male': 0, 'female': 1}
train_df['Sex'] = train_df['Sex'].map(sex_codes)
test_df['Sex'] = test_df['Sex'].map(sex_codes)

# This experiment deliberately leaves Age out and uses only these two columns.
features = ['Pclass', 'Sex']

X_train, y_train = train_df[features], train_df['Survived']

# 2. Measure the feature set with 5-fold cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
params = {'objective': 'binary', 'metric': 'binary_error', 'verbosity': -1}
accuracies = []

print(f"使用する特徴量: {features}")
print("学習開始...\n")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=100,
        valid_sets=[lgb_train, lgb_eval],
    )

    # Probabilities of 0.5 or more count as "survived".
    y_pred = (model.predict(X_val) >= 0.5).astype(int)
    acc = accuracy_score(y_val, y_pred)
    accuracies.append(acc)

print("==================================")
print(f"★ 平均正解率: {np.mean(accuracies):.4f}")

使用する特徴量: ['Pclass', 'Sex']
学習開始...

★ 平均正解率: 0.7767


In [None]:
# Reload the raw CSVs so this experiment starts from untouched data.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# 1. Preprocessing: encode Sex numerically (male -> 0, female -> 1).
sex_map = {'male': 0, 'female': 1}
train_df['Sex'] = train_df['Sex'].map(sex_map)
test_df['Sex'] = test_df['Sex'].map(sex_map)

# 2. Preprocessing: missing values (Age and Embarked).
# Age is not in this experiment's feature list but is filled anyway. Both
# frames use the TRAINING median so train and test are imputed with the same
# value, as in the notebook's first Age-imputation cell.
age_median = train_df['Age'].median()
train_df['Age'] = train_df['Age'].fillna(age_median)
test_df['Age'] = test_df['Age'].fillna(age_median)
# Missing Embarked (port of embarkation) values are filled with 'S'.
# NOTE(review): the original author describes 'S' as the most frequent port — confirm.
train_df['Embarked'] = train_df['Embarked'].fillna('S')
test_df['Embarked'] = test_df['Embarked'].fillna('S')

# Encode Embarked numerically (S=0, C=1, Q=2).
embarked_map = {'S': 0, 'C': 1, 'Q': 2}
train_df['Embarked'] = train_df['Embarked'].map(embarked_map)
test_df['Embarked'] = test_df['Embarked'].map(embarked_map)

# New feature: family size = siblings/spouses + parents/children + the passenger.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

# Feature set for this experiment (Age is intentionally left out here).
features = ['Pclass', 'Sex', 'FamilySize', 'Embarked']

# Training inputs and target.
X_train = train_df[features]
y_train = train_df['Survived']

# 5-fold cross-validation with the same split and the same LightGBM settings
# as the previous experiments.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
params = {'objective': 'binary', 'metric': 'binary_error', 'verbosity': -1}
accuracies = []
print(f"新しい特徴量セット: {features}")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval], num_boost_round=100)

    # Probabilities of 0.5 or more count as "survived".
    y_pred = np.where(model.predict(X_val) >= 0.5, 1, 0)
    accuracies.append(accuracy_score(y_val, y_pred))

print("==================================")
print(f"★ 平均正解率: {np.mean(accuracies):.4f}")

新しい特徴量セット: ['Pclass', 'Sex', 'FamilySize', 'Embarked']
★ 平均正解率: 0.7834


In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Reload the raw CSVs so this experiment starts from untouched data.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# 1. Encode Sex and Embarked (port of embarkation) numerically.
sex_map = {'male': 0, 'female': 1}
train_df['Sex'] = train_df['Sex'].map(sex_map)
test_df['Sex'] = test_df['Sex'].map(sex_map)

# Missing Embarked values are filled with 'S' before encoding (S=0, C=1, Q=2).
embarked_map = {'S': 0, 'C': 1, 'Q': 2}
train_df['Embarked'] = train_df['Embarked'].fillna('S').map(embarked_map)
test_df['Embarked'] = test_df['Embarked'].fillna('S').map(embarked_map)

# 2. Bring Age back as a feature; missing values are filled with the median.
# Both frames use the TRAINING median: Age is a model input here, so the test
# set must be imputed with the same value the model sees during training
# (this also matches the notebook's first Age-imputation cell).
age_median = train_df['Age'].median()
train_df['Age'] = train_df['Age'].fillna(age_median)
test_df['Age'] = test_df['Age'].fillna(age_median)

# 3. Family size = siblings/spouses + parents/children + the passenger.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

# Feature set for this experiment.
features = ['Pclass', 'Sex', 'Age', 'FamilySize', 'Embarked']

X_train = train_df[features]
y_train = train_df['Survived']

# 5-fold cross-validation; the LightGBM settings are the same for every fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
params = {'objective': 'binary', 'metric': 'binary_error', 'verbosity': -1}
accuracies = []

print(f"★ 使用する特徴量: {features}")
print("学習開始...\n")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval], num_boost_round=100)

    # Probabilities of 0.5 or more count as "survived".
    y_pred = np.where(model.predict(X_val) >= 0.5, 1, 0)
    accuracies.append(accuracy_score(y_val, y_pred))

print("==================================")
print(f"★ 平均正解率: {np.mean(accuracies):.4f}")

★ 使用する特徴量: ['Pclass', 'Sex', 'Age', 'FamilySize', 'Embarked']
学習開始...

★ 平均正解率: 0.8238


In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold # ★ KFoldの進化版
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Reload the raw CSVs so this experiment starts from untouched data.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
# Remember the training row count so the stacked frame can be split again.
n_train = len(train_df)

# =================================================
# 1. Preprocessing: extract the honorific (Title) from the name
# =================================================
# Stack train and test so every transformation is applied to both at once.
combined = pd.concat([train_df, test_df], sort=False)

# Capture the word directly before a period, e.g. "Mr" in " Mr.".
# The pattern is a raw string: "\." in a normal string literal is an invalid
# escape sequence and makes Python emit a warning when the cell is compiled.
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse the honorifics into five numeric groups
# (0: Mr, 1: Miss-like, 2: Mrs-like, 3: Master, 4: everything else).
title_mapping = {
    "Mr": 0, "Miss": 1, "Mrs": 2,
    "Master": 3, "Dr": 4, "Rev": 4, "Col": 4, "Major": 4, "Mlle": 1, "Countess": 2,
    "Ms": 1, "Lady": 2, "Jonkheer": 4, "Don": 4, "Dona": 4, "Mme": 2, "Capt": 4, "Sir": 4
}
combined['Title'] = combined['Title'].map(title_mapping)
# Any honorific missing from the mapping also falls into "other" (4).
combined['Title'] = combined['Title'].fillna(4)

# =================================================
# 2. Remaining preprocessing (same steps as the earlier experiments)
# =================================================
# Sex: male -> 0, female -> 1
combined['Sex'] = combined['Sex'].map({'male': 0, 'female': 1})

# Age: filled with the overall median of the stacked frame, kept simple here.
combined['Age'] = combined['Age'].fillna(combined['Age'].median())

# Family size = siblings/spouses + parents/children + the passenger.
combined['FamilySize'] = combined['SibSp'] + combined['Parch'] + 1

# Embarked: missing -> 'S', then S=0, C=1, Q=2
combined['Embarked'] = combined['Embarked'].fillna('S')
combined['Embarked'] = combined['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Fare: new feature; missing values are filled with the median.
combined['Fare'] = combined['Fare'].fillna(combined['Fare'].median())

# Split the stacked frame back into its train and test parts.
train_df = combined.iloc[:n_train]
test_df = combined.iloc[n_train:]

# =================================================
# 3. Training
# =================================================
# Title and Fare are the new features in this experiment.
features = ['Pclass', 'Sex', 'Age', 'FamilySize', 'Embarked', 'Fare', 'Title']

X_train = train_df[features]
y_train = train_df['Survived']

# StratifiedKFold splits on y_train so every fold keeps the class balance.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

# Lower learning rate with a larger round budget; early stopping picks the
# actual number of rounds for each fold.
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'verbosity': -1,
    'learning_rate': 0.05,
    'num_leaves': 31
}

print(f"★ 使用する特徴量: {features}")
print("学習開始...\n")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # NOTE(review): the round count is chosen by early stopping on the same
    # fold that is scored below, so the reported accuracy is likely optimistic.
    model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval],
                      num_boost_round=1000,        # upper bound on boosting rounds
                      callbacks=[lgb.early_stopping(stopping_rounds=50), # stop after 50 rounds without improvement
                                 lgb.log_evaluation(period=0)] # no per-iteration log
                     )

    # Probabilities of 0.5 or more count as "survived".
    y_pred = np.where(model.predict(X_val) >= 0.5, 1, 0)
    acc = accuracy_score(y_val, y_pred)
    accuracies.append(acc)

print("==================================")
print(f"★ 平均正解率: {np.mean(accuracies):.4f}")

  combined['Title'] = combined['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


★ 使用する特徴量: ['Pclass', 'Sex', 'Age', 'FamilySize', 'Embarked', 'Fare', 'Title']
学習開始...

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[70]	training's binary_error: 0.108146	valid_1's binary_error: 0.134078
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[29]	training's binary_error: 0.13885	valid_1's binary_error: 0.134831
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	training's binary_error: 0.12763	valid_1's binary_error: 0.157303
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[9]	training's binary_error: 0.16129	valid_1's binary_error: 0.162921
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[84]	training's binary_error: 0.0939691	valid_1's binary_error: 0.129213
★ 平均正解率: 0.8563


In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Reload the raw CSVs and stack them so every transformation is applied to
# train and test at once.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
# Remember the training row count so the stacked frame can be split again.
n_train = len(train_df)
combined = pd.concat([train_df, test_df], sort=False)

# =================================================
# 1. Features carried over from the previous experiment
# =================================================
# Title: the word directly before a period in the name, e.g. "Mr".
# The pattern is a raw string: "\." in a normal string literal is an invalid
# escape sequence and makes Python emit a warning when the cell is compiled.
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
title_mapping = {
    "Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3,
    "Dr": 4, "Rev": 4, "Col": 4, "Major": 4, "Mlle": 1, "Countess": 2,
    "Ms": 1, "Lady": 2, "Jonkheer": 4, "Don": 4, "Dona": 4, "Mme": 2, "Capt": 4, "Sir": 4
}
# Honorifics missing from the mapping fall into "other" (4).
combined['Title'] = combined['Title'].map(title_mapping).fillna(4)

# Sex, Age, family size, port of embarkation, fare.
combined['Sex'] = combined['Sex'].map({'male': 0, 'female': 1})
combined['Age'] = combined['Age'].fillna(combined['Age'].median())
combined['FamilySize'] = combined['SibSp'] + combined['Parch'] + 1
combined['Embarked'] = combined['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})
combined['Fare'] = combined['Fare'].fillna(combined['Fare'].median())

# =================================================
# 2. New feature: deck letter taken from Cabin
# =================================================
# The first character of Cabin is the deck (C85 -> C, D26 -> D).
# A missing Cabin stays missing through .str[0] and .map, and is then given
# the dedicated code 8, as is any letter absent from the mapping. This avoids
# depending on the string form of NaN ("nan" -> "n") to detect missing cabins.
deck_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
combined['Deck'] = combined['Cabin'].str[0].map(deck_mapping).fillna(8).astype(int)

# =================================================
# 3. New feature: number of passengers sharing a ticket (TicketGroup)
# =================================================
# Count how many rows of the stacked frame carry the same ticket number.
ticket_counts = combined['Ticket'].value_counts()
combined['TicketGroup'] = combined['Ticket'].map(ticket_counts)

# =================================================
# Split and train
# =================================================
train_df = combined.iloc[:n_train]
test_df = combined.iloc[n_train:]

# Deck and TicketGroup are the new features in this experiment.
features = ['Pclass', 'Sex', 'Age', 'FamilySize', 'Embarked', 'Fare', 'Title', 'Deck', 'TicketGroup']

X_train = train_df[features]
y_train = train_df['Survived']

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

# More features than before, so the trees are constrained to limit overfitting.
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'verbosity': -1,
    'learning_rate': 0.05,
    'num_leaves': 20,      # fewer leaves -> simpler trees
    'max_depth': 7,        # cap on tree depth
    'min_data_in_leaf': 20 # minimum number of rows per leaf
}

print(f"★ 使用する特徴量: {features}")
print("学習開始...\n")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # Up to 1000 rounds; stop after 50 rounds without improvement, no
    # per-iteration log.
    model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval],
                      num_boost_round=1000,
                      callbacks=[lgb.early_stopping(stopping_rounds=50),
                                 lgb.log_evaluation(period=0)]
                     )

    # Probabilities of 0.5 or more count as "survived".
    y_pred = np.where(model.predict(X_val) >= 0.5, 1, 0)
    acc = accuracy_score(y_val, y_pred)
    accuracies.append(acc)

print("==================================")
print(f"★ 平均正解率: {np.mean(accuracies):.4f}")

  combined['Title'] = combined['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


★ 使用する特徴量: ['Pclass', 'Sex', 'Age', 'FamilySize', 'Embarked', 'Fare', 'Title', 'Deck', 'TicketGroup']
学習開始...

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[39]	training's binary_error: 0.134831	valid_1's binary_error: 0.128492
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[54]	training's binary_error: 0.12763	valid_1's binary_error: 0.134831
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[25]	training's binary_error: 0.12763	valid_1's binary_error: 0.157303
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	training's binary_error: 0.11641	valid_1's binary_error: 0.157303
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	training's binary_error: 0.131837	valid_1's binary_error: 0.168539
★ 平均正解率: 0.8507


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Reload the raw CSVs and stack them so every transformation is applied to
# train and test at once.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
# Remember the training row count so the stacked frame can be split again.
n_train = len(train_df)
combined = pd.concat([train_df, test_df], sort=False)

# =================================================
# 1. Feature engineering (the feature set of the 0.8563 experiment)
# =================================================
# Title: the word directly before a period in the name, e.g. "Mr".
# The pattern is a raw string: "\." in a normal string literal is an invalid
# escape sequence and makes Python emit a warning when the cell is compiled.
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
title_mapping = {
    "Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3,
    "Dr": 4, "Rev": 4, "Col": 4, "Major": 4, "Mlle": 1, "Countess": 2,
    "Ms": 1, "Lady": 2, "Jonkheer": 4, "Don": 4, "Dona": 4, "Mme": 2, "Capt": 4, "Sir": 4
}
# Honorifics missing from the mapping fall into "other" (4).
combined['Title'] = combined['Title'].map(title_mapping).fillna(4)

# Sex: male -> 0, female -> 1
combined['Sex'] = combined['Sex'].map({'male': 0, 'female': 1})

# Age: missing values are filled with the median of the stacked frame.
combined['Age'] = combined['Age'].fillna(combined['Age'].median())

# Family size = siblings/spouses + parents/children + the passenger.
combined['FamilySize'] = combined['SibSp'] + combined['Parch'] + 1

# Embarked: missing -> 'S', then S=0, C=1, Q=2
combined['Embarked'] = combined['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})

# Fare: missing values are filled with the median.
combined['Fare'] = combined['Fare'].fillna(combined['Fare'].median())

# Split the stacked frame back into its train and test parts.
train_df = combined.iloc[:n_train]
test_df = combined.iloc[n_train:]

# Feature set used by both models.
features = ['Pclass', 'Sex', 'Age', 'FamilySize', 'Embarked', 'Fare', 'Title']

X_train = train_df[features]
y_train = train_df['Survived']
X_test = test_df[features]

# =================================================
# 2. Ensemble cross-validation
# =================================================
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# LightGBM settings. Defined once, before the loop, because the final fit on
# the full training set (section 3) reuses them.
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'verbosity': -1,
    'learning_rate': 0.05,
    'num_leaves': 31
}

# Per-fold accuracy of each model and of the ensemble.
lgb_accuracies = []
rf_accuracies = []
ensemble_accuracies = []

print("★ アンサンブル学習（LightGBM + Random Forest）を開始します...\n")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # --- Model 1: LightGBM ---
    lgb_train = lgb.Dataset(X_tr, y_tr)
    model_lgb = lgb.train(lgb_params, lgb_train, num_boost_round=100)
    # Predicted survival probability (0.0-1.0).
    prob_lgb = model_lgb.predict(X_val)

    # --- Model 2: Random Forest (fixed, untuned settings) ---
    model_rf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
    model_rf.fit(X_tr, y_tr)
    # Probability of the positive class (column 1).
    prob_rf = model_rf.predict_proba(X_val)[:, 1]

    # --- Ensemble: unweighted mean of the two probabilities ---
    prob_ensemble = (prob_lgb + prob_rf) / 2

    # Threshold each probability vector at 0.5 and score it.
    y_pred_lgb = np.where(prob_lgb >= 0.5, 1, 0)
    y_pred_rf = np.where(prob_rf >= 0.5, 1, 0)
    y_pred_ensemble = np.where(prob_ensemble >= 0.5, 1, 0)

    acc_lgb = accuracy_score(y_val, y_pred_lgb)
    acc_rf = accuracy_score(y_val, y_pred_rf)
    acc_ens = accuracy_score(y_val, y_pred_ensemble)

    lgb_accuracies.append(acc_lgb)
    rf_accuracies.append(acc_rf)
    ensemble_accuracies.append(acc_ens)

    print(f"Fold {fold+1} | LGB: {acc_lgb:.4f} | RF: {acc_rf:.4f} | ★Ensemble: {acc_ens:.4f}")

print("\n==================================")
print(f"LGB平均: {np.mean(lgb_accuracies):.4f}")
print(f"RF平均:  {np.mean(rf_accuracies):.4f}")
print(f"★ アンサンブル平均: {np.mean(ensemble_accuracies):.4f}")

# =================================================
# 3. Final predictions for submission
# =================================================
# Refit both models on the whole training set.
# LightGBM
lgb_train_final = lgb.Dataset(X_train, y_train)
model_lgb_final = lgb.train(lgb_params, lgb_train_final, num_boost_round=100)
prob_lgb_test = model_lgb_final.predict(X_test)

# Random Forest
model_rf_final = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
model_rf_final.fit(X_train, y_train)
prob_rf_test = model_rf_final.predict_proba(X_test)[:, 1]

# Average the two probability vectors and threshold at 0.5.
prob_ensemble_test = (prob_lgb_test + prob_rf_test) / 2
y_pred_test = np.where(prob_ensemble_test >= 0.5, 1, 0)

# Write the submission file (no index column).
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred_test
})
submission.to_csv('submission_ensemble.csv', index=False)
print("提出ファイル 'submission_ensemble.csv' を作成しました！")

  combined['Title'] = combined['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


★ アンサンブル学習（LightGBM + Random Forest）を開始します...

Fold 1 | LGB: 0.8380 | RF: 0.8324 | ★Ensemble: 0.8436
Fold 2 | LGB: 0.8315 | RF: 0.8315 | ★Ensemble: 0.8315
Fold 3 | LGB: 0.8933 | RF: 0.8708 | ★Ensemble: 0.8933
Fold 4 | LGB: 0.8202 | RF: 0.7978 | ★Ensemble: 0.8146
Fold 5 | LGB: 0.8371 | RF: 0.8483 | ★Ensemble: 0.8539

LGB平均: 0.8440
RF平均:  0.8361
★ アンサンブル平均: 0.8474
提出ファイル 'submission_ensemble.csv' を作成しました！


In [None]:
# Kaggle APIを使って、submission.csvを直接提出するコマンド
!kaggle competitions submit -c titanic -f submission_ensemble.csv -m "LightGBM + Optuna + Sex feature"

100% 2.77k/2.77k [00:00<00:00, 14.0kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# =================================================
# 1. Load and combine the data
# =================================================
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
# Capture the train row count up front: train_df is rebound further down, and the
# train/test split must not depend on the rebound frame.
n_train = len(train_df)
# ignore_index=True gives the stacked frame a unique 0..N-1 index instead of
# repeating the labels 0..417 from both files.
combined = pd.concat([train_df, test_df], sort=False, ignore_index=True)

# =================================================
# 2. Feature engineering (coarse, binned features)
# =================================================

# --- Title: the word followed by a period in Name ---
# Raw string so that "\." is a regex escape, not an invalid Python string escape.
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Collapse rare titles into one bucket and normalise spelling variants.
combined['Title'] = combined['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
combined['Title'] = combined['Title'].replace(['Mlle', 'Ms'], 'Miss')
combined['Title'] = combined['Title'].replace('Mme', 'Mrs')
# Encode as numbers; any title missing from the mapping becomes 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
combined['Title'] = combined['Title'].map(title_mapping).fillna(0)

# --- Sex ---
combined['Sex'] = combined['Sex'].map({'male': 0, 'female': 1}).astype(int)

# --- Age: impute, then bin ---
# Missing ages are filled with the median age of passengers sharing the same Title.
combined['Age'] = combined['Age'].fillna(combined.groupby('Title')['Age'].transform('median'))
# pd.cut with an integer splits the observed age range into 5 equal-width bins (0..4).
combined['AgeBin'] = pd.cut(combined['Age'], 5, labels=[0, 1, 2, 3, 4]).astype(int)

# --- Embarked: fill with the most frequent port, then encode ---
combined['Embarked'] = combined['Embarked'].fillna(combined['Embarked'].mode()[0])
combined['Embarked'] = combined['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# --- Fare: impute with the median, then split into quartile bins (0..3) ---
combined['Fare'] = combined['Fare'].fillna(combined['Fare'].median())
combined['FareBin'] = pd.qcut(combined['Fare'], 4, labels=[0, 1, 2, 3]).astype(int)

# --- FamilySize -> IsAlone ---
combined['FamilySize'] = combined['SibSp'] + combined['Parch'] + 1
# Binary flag: 1 when the passenger travels alone, else 0.
# Built in a single assignment; the previous chained form
# combined['IsAlone'].loc[...] = 1 no longer updates `combined` under pandas
# Copy-on-Write and would leave the column all zeros.
combined['IsAlone'] = (combined['FamilySize'] == 1).astype(int)

# =================================================
# 3. Build the modelling datasets
# =================================================
# Drop identifiers and the raw columns that were replaced by binned/derived features.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize']
combined = combined.drop(drop_elements, axis=1)

# Split back into train/test by position.
train_df = combined.iloc[:n_train]
test_df = combined.iloc[n_train:]

X_train = train_df.drop("Survived", axis=1)
y_train = train_df["Survived"].astype(int)
# Survived is NaN for the test rows (the column only exists in train.csv).
X_test = test_df.drop("Survived", axis=1)

# Standardise features for the linear model; the scaler is fit on train only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =================================================
# 4. Soft-voting ensemble
# =================================================
print("★ 学習開始：3つの異なる頭脳を組み合わせます...")

# Model 1: Random Forest
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# Model 2: LightGBM
clf_lgb = lgb.LGBMClassifier(objective='binary', verbose=-1, random_state=42)

# Model 3: Logistic Regression (a simple linear model alongside the two tree models)
clf_lr = LogisticRegression(random_state=42)

# Soft voting averages the predicted class probabilities of the three models.
voting_clf = VotingClassifier(
    estimators=[('rf', clf_rf), ('lgb', clf_lgb), ('lr', clf_lr)],
    voting='soft'
)

# Fit on the full training set; this fitted model produces the submission.
voting_clf.fit(X_train_scaled, y_train)

# 5-fold cross-validated accuracy.
# NOTE(review): the scaler above was fit on all training rows before this CV, so
# the validation folds are not fully unseen by the preprocessing.
scores = cross_val_score(voting_clf, X_train_scaled, y_train, cv=5, scoring='accuracy')
print("==================================")
print(f"Voting CV平均正解率: {scores.mean():.4f}")
print("==================================")

# =================================================
# 5. Submission
# =================================================
y_pred = voting_clf.predict(X_test_scaled)

# PassengerId was dropped from the features, so re-read it from the original file.
submission = pd.read_csv('test.csv')[['PassengerId']]
submission['Survived'] = y_pred

submission.to_csv('submission_best.csv', index=False)
print("提出用ファイル 'submission_best.csv' が作成されました！")

  combined['Title'] = combined['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  combined['IsAlone'].loc[combined['FamilySize'] == 1] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/

★ 学習開始：3つの異なる頭脳を組み合わせます...




Voting CV平均正解率: 0.8126
提出用ファイル 'submission_best.csv' が作成されました！




In [None]:
# Kaggle APIを使って、submission.csvを直接提出するコマンド
!kaggle competitions submit -c titanic -f submission_best.csv -m "LightGBM + Optuna + Sex feature"

100% 2.77k/2.77k [00:00<00:00, 14.6kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# =================================================
# 1. Data preparation (same features as the voting-ensemble cell)
# =================================================
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
# Capture the train row count up front: train_df is rebound further down.
n_train = len(train_df)
# ignore_index=True gives the stacked frame a unique 0..N-1 index instead of
# repeating the labels 0..417 from both files.
combined = pd.concat([train_df, test_df], sort=False, ignore_index=True)

# Title: the word followed by a period in Name.
# Raw string so that "\." is a regex escape, not an invalid Python string escape.
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
combined['Title'] = combined['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
combined['Title'] = combined['Title'].replace(['Mlle', 'Ms'], 'Miss')
combined['Title'] = combined['Title'].replace('Mme', 'Mrs')
# Titles missing from the mapping become 0.
combined['Title'] = combined['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}).fillna(0)

# Sex
combined['Sex'] = combined['Sex'].map({'male': 0, 'female': 1}).astype(int)

# Age: fill with the per-Title median, then cut into 5 equal-width bins.
combined['Age'] = combined['Age'].fillna(combined.groupby('Title')['Age'].transform('median'))
combined['AgeBin'] = pd.cut(combined['Age'], 5, labels=[0, 1, 2, 3, 4]).astype(int)

# Embarked: fill with the most frequent port, then encode.
combined['Embarked'] = combined['Embarked'].fillna(combined['Embarked'].mode()[0])
combined['Embarked'] = combined['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# Fare: fill with the median, then split into quartile bins.
combined['Fare'] = combined['Fare'].fillna(combined['Fare'].median())
combined['FareBin'] = pd.qcut(combined['Fare'], 4, labels=[0, 1, 2, 3]).astype(int)

# FamilySize & IsAlone
combined['FamilySize'] = combined['SibSp'] + combined['Parch'] + 1
# Built in a single assignment; the previous chained form
# combined['IsAlone'].loc[...] = 1 no longer updates `combined` under pandas
# Copy-on-Write and would leave the column all zeros.
combined['IsAlone'] = (combined['FamilySize'] == 1).astype(int)

# Drop identifiers and the raw columns replaced by binned/derived features.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize']
combined = combined.drop(drop_elements, axis=1)

# Split back into train/test by position.
train_df = combined.iloc[:n_train]
test_df = combined.iloc[n_train:]

X_train = train_df.drop("Survived", axis=1)
y_train = train_df["Survived"].astype(int)
X_test = test_df.drop("Survived", axis=1)

# Standardise features (SVC and LogisticRegression are scale-sensitive);
# the scaler is fit on train only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =================================================
# 2. Build the stacking model
# =================================================
print("★ スタッキング学習を開始します...")

# Level-0 (base) models.
level0 = [
    ('lr', LogisticRegression(C=0.1, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
    ('lgb', lgb.LGBMClassifier(n_estimators=100, max_depth=4, verbose=-1, random_state=42)),
    # probability=True so the SVM exposes predict_proba.
    ('svm', SVC(probability=True, random_state=42)),
]

# Level-1 (meta) model: combines the base models' predictions.
level1 = LogisticRegression()

# cv=5: the meta model is trained on 5-fold cross-validated base-model predictions.
model_stacking = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

# Fit
model_stacking.fit(X_train_scaled, y_train)

# =================================================
# 3. Submission
# =================================================
y_pred = model_stacking.predict(X_test_scaled)

# PassengerId was dropped from the features, so re-read it from the original file.
submission = pd.read_csv('test.csv')[['PassengerId']]
submission['Survived'] = y_pred
submission.to_csv('submission_stacking.csv', index=False)

print("提出用ファイル 'submission_stacking.csv' が作成されました！")
print("これが今の私たちに出せる、最強の一手です。")

  combined['Title'] = combined['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  combined['IsAlone'].loc[combined['FamilySize'] == 1] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/

★ スタッキング学習を開始します...




提出用ファイル 'submission_stacking.csv' が作成されました！
これが今の私たちに出せる、最強の一手です。




In [None]:
# Kaggle APIを使って、submission.csvを直接提出するコマンド
!kaggle competitions submit -c titanic -f submission_stacking.csv -m "LightGBM + Optuna + Sex feature"

100% 2.77k/2.77k [00:00<00:00, 14.1kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# =================================================
# 1. Data preparation (same features as the stacking cell)
# =================================================
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
# Capture the train row count up front: train_df is rebound further down.
n_train = len(train_df)
# ignore_index=True gives the stacked frame a unique 0..N-1 index instead of
# repeating the labels 0..417 from both files.
combined = pd.concat([train_df, test_df], sort=False, ignore_index=True)

# Feature engineering.
# Raw string so that "\." is a regex escape, not an invalid Python string escape.
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
combined['Title'] = combined['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
combined['Title'] = combined['Title'].replace(['Mlle', 'Ms'], 'Miss')
combined['Title'] = combined['Title'].replace('Mme', 'Mrs')
combined['Title'] = combined['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}).fillna(0)
combined['Sex'] = combined['Sex'].map({'male': 0, 'female': 1}).astype(int)
# Age: per-Title median imputation, then 5 equal-width bins.
combined['Age'] = combined['Age'].fillna(combined.groupby('Title')['Age'].transform('median'))
combined['AgeBin'] = pd.cut(combined['Age'], 5, labels=[0, 1, 2, 3, 4]).astype(int)
combined['Embarked'] = combined['Embarked'].fillna(combined['Embarked'].mode()[0])
combined['Embarked'] = combined['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
# Fare: median imputation, then quartile bins.
combined['Fare'] = combined['Fare'].fillna(combined['Fare'].median())
combined['FareBin'] = pd.qcut(combined['Fare'], 4, labels=[0, 1, 2, 3]).astype(int)
combined['FamilySize'] = combined['SibSp'] + combined['Parch'] + 1
# Built in a single assignment; the previous chained form
# combined['IsAlone'].loc[...] = 1 no longer updates `combined` under pandas
# Copy-on-Write and would leave the column all zeros.
combined['IsAlone'] = (combined['FamilySize'] == 1).astype(int)

drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize']
combined = combined.drop(drop_elements, axis=1)

# Split back into train/test by position.
train_df = combined.iloc[:n_train]
test_df = combined.iloc[n_train:]

X_train = train_df.drop("Survived", axis=1)
y_train = train_df["Survived"].astype(int)
X_test = test_df.drop("Survived", axis=1)

# Scaler is fit on train only and reused for test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =================================================
# 2. First fit (stacking)
# =================================================
print("★ Step 1: ベースモデルの学習中...")

level0 = [
    ('lr', LogisticRegression(C=0.1, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
    ('lgb', lgb.LGBMClassifier(n_estimators=100, max_depth=4, verbose=-1, random_state=42)),
    ('svm', SVC(probability=True, random_state=42))
]
level1 = LogisticRegression()
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
model.fit(X_train_scaled, y_train)

# =================================================
# 3. Pseudo labeling
# =================================================
print("★ Step 2: 擬似ラベリング（Pseudo Labeling）を実行中...")

# A test row is pseudo-labelled only when one class probability exceeds this value.
PSEUDO_LABEL_THRESHOLD = 0.9

# Predicted class probabilities for the test set (column 0 = class 0, column 1 = class 1).
y_test_proba = model.predict_proba(X_test_scaled)

# Positions of the test rows the model is confident about in either direction.
high_confidence_idx = np.where(
    (y_test_proba[:, 0] > PSEUDO_LABEL_THRESHOLD) | (y_test_proba[:, 1] > PSEUDO_LABEL_THRESHOLD)
)[0]

# Report against the actual test-set size instead of a hard-coded row count.
print(f" -> テストデータ{len(X_test_scaled)}件中、{len(high_confidence_idx)}件を「自信あり」として学習データに追加します。")

# Confident test rows become extra training rows, labelled with the predicted class.
X_pseudo = X_test_scaled[high_confidence_idx]
y_pseudo = np.argmax(y_test_proba[high_confidence_idx], axis=1)

# Append them to the original training data.
X_train_pseudo = np.vstack((X_train_scaled, X_pseudo))
y_train_pseudo = np.concatenate((y_train, y_pseudo))

# =================================================
# 4. Re-fit and final prediction
# =================================================
print("★ Step 3: 増量したデータで再学習中...")

# Re-fit the same stacking model on the augmented data, then predict the full test set.
model.fit(X_train_pseudo, y_train_pseudo)
y_pred_final = model.predict(X_test_scaled)

# Submission: PassengerId was dropped from the features, so re-read it from the file.
submission = pd.read_csv('test.csv')[['PassengerId']]
submission['Survived'] = y_pred_final
submission.to_csv('submission_pseudo.csv', index=False)

print("提出用ファイル 'submission_pseudo.csv' が作成されました！")
print("さあ、0.8の壁を超えられるか……！？")

  combined['Title'] = combined['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  combined['IsAlone'].loc[combined['FamilySize'] == 1] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/

★ Step 1: ベースモデルの学習中...




★ Step 2: 擬似ラベリング（Pseudo Labeling）を実行中...
 -> テストデータ418件中、85件を「自信あり」として学習データに追加します。
★ Step 3: 増量したデータで再学習中...




提出用ファイル 'submission_pseudo.csv' が作成されました！
さあ、0.8の壁を超えられるか……！？




In [None]:
# Kaggle APIを使って、submission.csvを直接提出するコマンド
!kaggle competitions submit -c titanic -f submission_pseudo.csv -m "LightGBM + Optuna + Sex feature"

100% 2.77k/2.77k [00:00<00:00, 15.7kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# =================================================
# 1. Data preparation (same features as the stacking cell)
# =================================================
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
# Capture the train row count up front: train_df is rebound further down.
n_train = len(train_df)
# ignore_index=True gives the stacked frame a unique 0..N-1 index instead of
# repeating the labels 0..417 from both files.
combined = pd.concat([train_df, test_df], sort=False, ignore_index=True)

# Feature engineering.
# Raw string so that "\." is a regex escape, not an invalid Python string escape.
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
combined['Title'] = combined['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
combined['Title'] = combined['Title'].replace(['Mlle', 'Ms'], 'Miss')
combined['Title'] = combined['Title'].replace('Mme', 'Mrs')
combined['Title'] = combined['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}).fillna(0)
combined['Sex'] = combined['Sex'].map({'male': 0, 'female': 1}).astype(int)
# Age: per-Title median imputation, then 5 equal-width bins.
combined['Age'] = combined['Age'].fillna(combined.groupby('Title')['Age'].transform('median'))
combined['AgeBin'] = pd.cut(combined['Age'], 5, labels=[0, 1, 2, 3, 4]).astype(int)
combined['Embarked'] = combined['Embarked'].fillna(combined['Embarked'].mode()[0])
combined['Embarked'] = combined['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
# Fare: median imputation, then quartile bins.
combined['Fare'] = combined['Fare'].fillna(combined['Fare'].median())
combined['FareBin'] = pd.qcut(combined['Fare'], 4, labels=[0, 1, 2, 3]).astype(int)
combined['FamilySize'] = combined['SibSp'] + combined['Parch'] + 1
# Built in a single assignment; the previous chained form
# combined['IsAlone'].loc[...] = 1 no longer updates `combined` under pandas
# Copy-on-Write and would leave the column all zeros.
combined['IsAlone'] = (combined['FamilySize'] == 1).astype(int)

drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize']
combined = combined.drop(drop_elements, axis=1)

# Split back into train/test by position.
train_df = combined.iloc[:n_train]
test_df = combined.iloc[n_train:]

X_train = train_df.drop("Survived", axis=1)
y_train = train_df["Survived"].astype(int)
X_test = test_df.drop("Survived", axis=1)

# Scaler is fit on train only and reused for test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =================================================
# 2. Seed averaging (one stacking model per seed)
# =================================================
print("★ Seed Averagingを開始します（少し時間がかかります）...")

# Running sum of predict_proba outputs: one row per test passenger, one column per class.
test_pred_prob_sum = np.zeros((len(X_test), 2))

# The same stacking model is refit once per seed.
seeds = [42, 2023, 123, 999, 7, 55, 101, 333, 777, 888]

for i, seed in enumerate(seeds):
    print(f" -> Round {i+1}/{len(seeds)} (Seed: {seed})")

    # Same architecture every round; only random_state of the base models changes.
    level0 = [
        ('lr', LogisticRegression(C=0.1, random_state=seed)),
        ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=seed)),
        ('lgb', lgb.LGBMClassifier(n_estimators=100, max_depth=4, verbose=-1, random_state=seed)),
        ('svm', SVC(probability=True, random_state=seed))
    ]
    level1 = LogisticRegression()  # meta model keeps its defaults (no seed passed)

    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    model.fit(X_train_scaled, y_train)

    # Accumulate this round's class probabilities.
    test_pred_prob_sum += model.predict_proba(X_test_scaled)

# =================================================
# 3. Average and submit
# =================================================
# Mean probability over all seeds.
final_proba = test_pred_prob_sum / len(seeds)

# Pick the class with the larger mean probability. With two classes this matches
# thresholding P(class 1) at 0.5, except that an exact tie resolves to class 0.
y_pred_final = np.argmax(final_proba, axis=1)

# PassengerId was dropped from the features, so re-read it from the original file.
submission = pd.read_csv('test.csv')[['PassengerId']]
submission['Survived'] = y_pred_final
submission.to_csv('submission_seed_avg.csv', index=False)

print("\n提出用ファイル 'submission_seed_avg.csv' が作成されました！")
print("これが確率のブレを極限までなくした、最も信頼できる答えです。")

  combined['Title'] = combined['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  combined['IsAlone'].loc[combined['FamilySize'] == 1] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/

★ Seed Averagingを開始します（少し時間がかかります）...
 -> Round 1/10 (Seed: 42)




 -> Round 2/10 (Seed: 2023)




 -> Round 3/10 (Seed: 123)




 -> Round 4/10 (Seed: 999)




 -> Round 5/10 (Seed: 7)




 -> Round 6/10 (Seed: 55)




 -> Round 7/10 (Seed: 101)




 -> Round 8/10 (Seed: 333)




 -> Round 9/10 (Seed: 777)




 -> Round 10/10 (Seed: 888)





提出用ファイル 'submission_seed_avg.csv' が作成されました！
これが確率のブレを極限までなくした、最も信頼できる答えです。




In [None]:
# Kaggle APIを使って、submission.csvを直接提出するコマンド
!kaggle competitions submit -c titanic -f submission_seed_avg.csv -m "LightGBM + Optuna + Sex feature"

100% 2.77k/2.77k [00:00<00:00, 14.1kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster