<a href="https://colab.research.google.com/github/tatsutatsutatsu611803/Kaggletest/blob/main/Titanic_ipynb_%E3%81%AE%E3%82%B3%E3%83%94%E3%83%BC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
# Mount Google Drive first; the CSV files read later in this cell live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Install and import LightGBM
# NOTE(review): the install is unpinned (`--upgrade`), so the LightGBM version may
# differ from one run to the next — consider pinning a version.
!pip install lightgbm --upgrade
import lightgbm as lgb
from lightgbm import callback

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by pandas / LightGBM.
warnings.filterwarnings('ignore')

# Load the data (adjust the paths to match your own environment)
df = pd.read_csv("/content/drive/MyDrive/csvdata/train.csv")
df_test = pd.read_csv("/content/drive/MyDrive/csvdata/test.csv")

print('訓練データのデータ数は{}、変数は{}種類です。'.format(df.shape[0], df.shape[1]))
print('テストデータのデータ数は{}、変数は{}種類です'.format(df_test.shape[0], df_test.shape[1]))

# Stack train and test into one frame so that the same preprocessing is applied
# to both; the index is rebuilt so rows are numbered 0..len(df)+len(df_test)-1.
combined = pd.concat([df, df_test], sort=False).reset_index(drop=True)

# Report the number of missing values per column
print("欠損値の確認:\n", combined.isnull().sum())

# Impute missing values: median for the numeric columns, most frequent value
# for 'Embarked'. Only the listed columns are filled; the labels that are
# missing for the test rows are left untouched.
# A single DataFrame.fillna call is used instead of
# `combined[col].fillna(..., inplace=True)`: an in-place call on a column
# selection is chained assignment, which pandas deprecates and which no longer
# updates `combined` once Copy-on-Write is enabled.
combined = combined.fillna({
    'Age': combined['Age'].median(),
    'Fare': combined['Fare'].median(),
    'Embarked': combined['Embarked'].mode()[0],
})


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
訓練データのデータ数は891、変数は12種類です。
テストデータのデータ数は418、変数は11種類です
欠損値の確認:
 PassengerId       0
Perished        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


In [None]:
# Drop 'Cabin' because most of its values are missing.
# errors='ignore' keeps this cell re-runnable after the column is already gone.
combined = combined.drop(columns='Cabin', errors='ignore')

# Extract the title from 'Name': the run of letters between a space and a period.
# The pattern is a raw string because '\.' in a normal string literal is an
# invalid escape sequence (deprecated; a SyntaxWarning on recent Python versions).
raw_title = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Normalise the titles: uncommon ones are grouped as 'Rare', and the
# alternative spellings are folded into 'Miss' / 'Mrs'.
title_aliases = {
    **dict.fromkeys(['Lady', 'Countess', 'Capt', 'Col',
                     'Don', 'Dr', 'Major', 'Rev', 'Sir',
                     'Jonkheer', 'Dona'], 'Rare'),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}

# Integer code per normalised title; anything not listed here (including rows
# where no title was extracted) becomes 0.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
combined['Title'] = (
    raw_title.replace(title_aliases)
    .map(title_mapping)
    .fillna(0)
    .astype(int)
)


In [None]:

# Encode the remaining columns as integers and add the family features in one
# pass, then drop the free-text columns that are no longer needed.
combined = (
    combined
    .assign(
        # Sex -> 0 (male) / 1 (female)
        Sex=lambda d: d['Sex'].map({'male': 0, 'female': 1}).astype(int),
        # Family features: siblings/spouses + parents/children + the passenger
        FamilySize=lambda d: d['SibSp'] + d['Parch'] + 1,
        IsAlone=lambda d: (d['FamilySize'] == 1).astype(int),
        # Port of embarkation -> integer code
        Embarked=lambda d: d['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int),
        # Age -> index of one of 5 equal-width bins
        Age=lambda d: pd.cut(d['Age'], 5, labels=False).astype(int),
        # Fare -> index of one of 4 quantile bins
        Fare=lambda d: pd.qcut(d['Fare'], 4, labels=False).astype(int),
    )
    # Remove the columns that are not used as features
    .drop(['Name', 'Ticket'], axis=1)
)


In [None]:
# Split the combined frame back into training and test rows.
# The first len(df) rows came from the training file; copies are taken so later
# edits to `train` / `test` never write through to `combined`.
train = combined.iloc[:len(df)].copy()
test = combined.iloc[len(df):].copy()

X = train.drop('Perished', axis=1)

# Fill any missing label with the most frequent label.
# The mode is taken from the label column itself: the previous version called
# `y.mode()` while `y` was still being defined, which raises NameError on a
# fresh kernel.
perished = train['Perished']
y = perished.fillna(perished.mode()[0])

X_test = test.drop('Perished', axis=1)


In [None]:
# Candidate hyperparameter values: 9 parameters with 3 candidates each.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0 —
# confirm the estimator sets it, otherwise this axis has no effect.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[20, 31, 50],
    max_depth=[-1, 5, 10],
    min_child_samples=[10, 20, 30],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[0, 0.1, 0.5],
    n_estimators=[100, 200, 500],
)


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Base estimator and CV splitter for the search. They are defined here because
# no other cell creates `model` or `kf`, so this cell previously failed with
# NameError on Restart & Run All.
# NOTE(review): 5 stratified folds match the logged "Fitting 5 folds" output;
# the shuffle setting is an assumption — adjust if the original run differed.
model = lgb.LGBMClassifier(random_state=42)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized search over `param_grid`, scored by cross-validated accuracy.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,  # number of sampled parameter settings (reduced to keep the run short)
    scoring='accuracy',
    cv=kf,
    n_jobs=-1,
    verbose=1,
    random_state=42
)
random_search.fit(X, y)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[LightGBM] [Info] Number of positive: 549, number of negative: 342
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 305
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.616162 -> initscore=0.473288
[LightGBM] [Info] Start training from score 0.473288


In [None]:
# Build a model from the best parameters found by the randomized search,
# fit it on all training rows, then report its cross-validated accuracy.
tuned_params = dict(random_search.best_params_)
best_model = lgb.LGBMClassifier(random_state=42, **tuned_params)
best_model.fit(X, y)

# Accuracy on each fold of `kf`, followed by the mean across folds.
scores = cross_val_score(estimator=best_model, X=X, y=y, scoring='accuracy', cv=kf)
print('クロスバリデーションの精度:', scores)
print('平均精度:', scores.mean())

[LightGBM] [Info] Number of positive: 549, number of negative: 342
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 305
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.616162 -> initscore=0.473288
[LightGBM] [Info] Start training from score 0.473288
[LightGBM] [Info] Number of positive: 439, number of negative: 273
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 287
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 11
[LightGBM] [Info] [binary:BoostFro

In [None]:
# Predict on the test rows with the tuned model
predictions = best_model.predict(X_test)


In [None]:
# Build the submission file: the passenger ids from the test file next to the
# predicted label cast to int, written without the index column.
submission = df_test[['PassengerId']].assign(Perished=predictions.astype(int))
submission.to_csv('submission.csv', index=False)

# Download the submission file (Colab only)
from google.colab import files
files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>