In [None]:
# Import the wandb (Weights & Biases) library
import wandb

# Log in to wandb
wandb.login()

In [2]:
# Import the pandas library
import pandas as pd

# Read the training data into the variable `train`
train = pd.read_csv('./train.csv')

# Display the first rows of the training data
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Read the test data into the variable `test`
test = pd.read_csv('./test.csv')

# Display the first rows of the test data
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Check the number of rows and columns in the training and test data
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [5]:
# Count the missing values in each column of train and display the counts
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Count the missing values in each column of test and display the counts
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
# Scratch DataFrame that collects columns pooled across train and test
temp = pd.DataFrame()

# Pool the Age values of both data sets and store them in temp as column 'Age'
temp['Age'] = pd.concat([train['Age'], test['Age']])

# Replace the missing ages in train and in test with the mean of the pooled column
train['Age'], test['Age'] = (
    frame['Age'].fillna(value=temp['Age'].mean()) for frame in (train, test)
)

In [8]:
# Concatenate the Fare columns of the training and test data and add the result
# to temp as column 'Fare'
temp['Fare'] = pd.concat([train['Fare'], test['Fare']])

# Fare is missing only in test (see the missing-value counts displayed earlier);
# fill it with the mean of temp's Fare column
test['Fare'] = test['Fare'].fillna(temp['Fare'].mean())

In [9]:
# Concatenate the Embarked columns of the training and test data and add the
# result to temp as column 'Embarked'
temp['Embarked'] = pd.concat([train['Embarked'], test['Embarked']])

# Count how often each Embarked value occurs in temp
temp['Embarked'].value_counts()

Embarked
S    914
C    270
Q    123
Name: count, dtype: int64

In [10]:
# Embarked is missing only in train; fill it with 'S', the most frequent value
# in the pooled counts displayed by the previous cell
train['Embarked'] = train['Embarked'].fillna('S')

In [11]:
# Remove the Cabin, Name and Ticket columns from both train and test
train = train.drop(['Cabin', 'Name', 'Ticket'], axis='columns')
test = test.drop(['Cabin', 'Name', 'Ticket'], axis='columns')

In [12]:
# One-hot encode the Sex and Embarked columns of train and store the result in train2
train2 = pd.get_dummies(data=train, columns=['Sex', 'Embarked'])

# One-hot encode test the same way, then align its columns with train2's feature
# columns (every train2 column except the target 'Survived').
# The two sets are encoded independently, so their dummy columns only match when
# every category occurs in both. Reindexing makes the match explicit: a dummy
# column missing from test is added as all-False, a column that exists only in
# test is dropped, and the column order follows train2.
test2 = pd.get_dummies(data=test, columns=['Sex', 'Embarked'])
test2 = test2.reindex(columns=train2.columns.drop('Survived'), fill_value=False)

In [13]:
# Count the missing values in each column of train2 and display the counts
train2.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [14]:
# Count the missing values in each column of test2 and display the counts
test2.isnull().sum()

PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [15]:
# Display the dtype of each column of train2
train2.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Sex_female        bool
Sex_male          bool
Embarked_C        bool
Embarked_Q        bool
Embarked_S        bool
dtype: object

In [16]:
# Import numpy
import numpy as np

# Split train2 into the feature matrix X_train (everything except 'Survived')
# and the label vector Y_train, both converted to float32.
# NOTE(review): only 'Survived' is removed, so the PassengerId column is part of
# the features fed to the model; confirm that this is intended.
X_train = np.asarray(train2.drop(columns=['Survived']), dtype='float32')
Y_train = np.asarray(train2['Survived'], dtype='float32')

# Convert every column of test2 to float32 and store the matrix in X_test
X_test = np.asarray(test2, dtype='float32')

In [17]:
# Hold out 30% of X_train / Y_train as the validation set X_valid / Y_valid
from sklearn.model_selection import train_test_split

# NOTE(review): the result overwrites X_train / Y_train with their own subsets,
# so re-running this cell on its own splits the already-reduced arrays again.
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size=0.3, random_state=0)

In [18]:
# Check the shapes of the training, validation and test arrays
print("X_train=", X_train.shape, ", Y_train=", Y_train.shape)
print("X_valid=", X_valid.shape, ", Y_valid=", Y_valid.shape)
print("X_test=", X_test.shape)

X_train= (623, 11) , Y_train= (623,)
X_valid= (268, 11) , Y_valid= (268,)
X_test= (418, 11)


In [19]:
# Import tensorflow
import tensorflow as tf

In [20]:
# Build and train the model, logging the run to wandb
def train_model():
    """Build, compile and fit the classifier while logging the run to wandb.

    Reads the module-level arrays ``X_train``, ``Y_train``, ``X_valid`` and
    ``Y_valid``. Layer widths, optimizer and batch size are read from the
    run's config; the values below are the defaults recorded with the run.

    Returns:
        None. Per-epoch metrics are sent to wandb by ``WandbMetricsLogger``.
    """
    # Hyperparameter values that wandb should record for this run
    default_config = {
        "input_dense_shape": 8,
        "hidden_dense_shape": 8,
        "optimizer": "rmsprop",
        "batch_size": 32
    }

    # Using the run as a context manager finishes it when training completes
    # or raises, so a failed fit does not leave the run open.
    with wandb.init(project="kaggle-titanic", config=default_config) as run:
        config = run.config

        # Model definition
        model = tf.keras.Sequential([
            # Input width follows the training matrix instead of a hard-coded 11
            tf.keras.Input(shape=(X_train.shape[1],)),
            tf.keras.layers.Dense(config.input_dense_shape, activation='relu'),
            # Hidden layer
            tf.keras.layers.Dense(config.hidden_dense_shape, activation='relu'),
            # Output layer: a single sigmoid unit
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

        # Compile the model
        model.compile(optimizer=config.optimizer,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        # Train for at most 5000 epochs; EarlyStopping ends training once
        # val_loss has not improved for 100 consecutive epochs.
        model.fit(
            X_train, Y_train,
            epochs=5000,
            batch_size=config.batch_size,
            verbose=True,
            callbacks=[
                tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                 min_delta=0, patience=100,
                                                 verbose=1),
                wandb.keras.WandbMetricsLogger(log_freq='epoch')
            ],
            validation_data=(X_valid, Y_valid))

In [None]:
# Run train_model
train_model()

In [None]:
# Finish the current wandb run
wandb.finish()

In [None]:
# Configuration for a wandb sweep (grid search)
sweep_config = {
    'method': 'grid',
    'name': 'kaggle-titanic-sweep',
    # train_model logs through WandbMetricsLogger(log_freq='epoch'), which
    # records the Keras metrics under an 'epoch/' prefix, so a metric named
    # plain 'accuracy' is never logged. Validation accuracy is used so that
    # runs are compared on held-out data rather than on the training set.
    'metric': {
        'goal': 'maximize',
        'name': 'epoch/val_accuracy'
    },
    # Every combination of the values below is tried (3 * 3 * 3 * 3 = 81 runs)
    'parameters': {
        'input_dense_shape': {'values': [8, 16, 24]},
        'hidden_dense_shape': {'values': [8, 16, 24]},
        'optimizer': {'values': ['sgd', 'rmsprop', 'adam']},
        'batch_size': {'values': [16, 32, 64]}
    }
}

# Register the sweep described by sweep_config and keep its id
sweep_id = wandb.sweep(sweep=sweep_config, project="kaggle-titanic")

In [None]:
# Start the sweep: the agent calls train_model for the runs of sweep_id
wandb.agent(sweep_id, function=train_model)

In [25]:
# Rebuild the model with the best hyperparameter values found by the grid search
# (wandb is not used here) and predict on the test data
def train_model():
    """Train the final classifier and return its predictions for ``X_test``.

    Reads the module-level arrays ``X_train``, ``Y_train``, ``X_valid``,
    ``Y_valid`` and ``X_test``.

    Returns:
        An int32 array with one row per row of ``X_test`` and a single
        column: 1 where the predicted probability is above 0.5, otherwise 0.
    """
    # Model definition
    model = tf.keras.Sequential([
        # Input width follows the training matrix instead of a hard-coded 11
        tf.keras.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(16, activation='relu'),
        # Hidden layer
        tf.keras.layers.Dense(16, activation='relu'),
        # Output layer: a single sigmoid unit
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train for at most 5000 epochs, stopping once val_loss has not improved
    # for 100 consecutive epochs. restore_best_weights=True rolls the model
    # back to the epoch with the lowest val_loss; without it the predictions
    # below would come from weights trained 100 epochs past that point.
    model.fit(X_train, Y_train,
              epochs=5000,
              batch_size=64,
              verbose=True,
              callbacks=[
                  tf.keras.callbacks.EarlyStopping(
                      monitor='val_loss',
                      min_delta=0,
                      patience=100,
                      verbose=1,
                      restore_best_weights=True
                  )
              ],
              validation_data=(X_valid, Y_valid))

    # Predict on the test data and threshold the probabilities at 0.5
    Y_pred_proba = model.predict(X_test)
    Y_pred = (Y_pred_proba > 0.5).astype("int32")

    # Return the predicted labels
    return Y_pred

In [26]:
# Run train_model and display the predictions it returns
Y_pred = train_model()
Y_pred

Epoch 1/5000
10/10 ━━━━━━━━━━━━━━━━━━━━ 3s 431ms/step - accuracy: 0.6406 - loss: 8.30 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step - accuracy: 0.5946 - loss: 4.5235 - val_accuracy: 0.4701 - val_loss: 1.1598
Epoch 2/5000
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 20ms/step - accuracy: 0.3906 - loss: 1.422 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.5273 - loss: 1.0452 - val_accuracy: 0.5037 - val_loss: 1.1344
Epoch 3/5000
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 88ms/step - accuracy: 0.5000 - loss: 1.000 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.5622 - loss: 0.9239 - val_accuracy: 0.5187 - val_loss: 1.0290
Epoch 4/5000
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.6250 - loss: 0.758 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6359 - loss: 0.7254 - val_accuracy: 0.6119 - val_loss: 0.7659
Epoch 5/5000
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step - accuracy: 0.5781 - loss: 0.670 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6057 - loss: 0.7299 - val_accuracy: 0.6828 - val_loss: 0.6972
Epoch 6/5000
10/10 ━━━━━

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
    

In [27]:
# Turn X_test back into a DataFrame with test2's column names and store it in X_test2
X_test2 = pd.DataFrame(X_test, columns=test2.columns)

In [28]:
# Build the DataFrame that holds the data to submit to Kaggle.
# PassengerId is read from test2 rather than from the float32 copy in X_test2:
# float32 cannot represent every integer above 2**24, so ids should not make a
# round trip through it. Y_pred has a single column (shape (n, 1)), so it is
# flattened to 1-D before being used as the Survived column.
submission_data = pd.DataFrame({
    "PassengerId": test2["PassengerId"].astype("int32"),
    "Survived": Y_pred.ravel(),
})
submission_data

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [29]:
# Write the submission data to a CSV file for Kaggle, without the index column
submission_data.to_csv("my_submission_titanic.csv", index=False)