In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import lightgbm as lgb


In [2]:
# Read the training and test CSV files (paths are relative to the working directory).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Summarise the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Model inputs: three numeric columns taken straight from the raw frame.
train_x = train[['Pclass', 'Fare', 'Age']]
# Target is kept as a one-column DataFrame (double brackets), not a Series.
train_y = train[['Survived']]

In [5]:
# Stratified 80/20 hold-out split with a fixed seed.
# NOTE: the cross-validation cell further down rebinds x_tr / x_va / y_tr / y_va
# on every fold, so the values produced here are overwritten there.
x_tr, x_va, y_tr, y_va = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    shuffle=True,
    stratify=train_y,
    random_state=123,
)


In [6]:
# LightGBM hyper-parameters shared by every fold.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',           # metric evaluated on eval_set during training
    'learning_rate': 0.1,
    'num_leaves': 16,
    'n_estimators': 100000,    # upper bound only; early stopping ends training sooner
    'random_state': 123,
    'importance_type': 'gain',
}

# Per-fold rows of [fold number, train accuracy, validation accuracy].
metrics = []
# NOTE(review): `imp` is never populated in this cell; kept in case other cells expect it.
imp = pd.DataFrame()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

for fold, (idx_tr, idx_va) in enumerate(cv.split(train_x, train_y), start=1):
    print('-' * 40)
    # split() yields *positional* indices, so select with .iloc; .loc would only
    # be correct while the frames carry a default RangeIndex.
    # The target is taken as a 1-D Series (column 0) instead of a column-vector frame.
    x_tr, y_tr = train_x.iloc[idx_tr], train_y.iloc[idx_tr, 0]
    x_va, y_va = train_x.iloc[idx_va], train_y.iloc[idx_va, 0]

    model = lgb.LGBMClassifier(**params)
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
    # removed in LightGBM 4.0.
    model.fit(
        x_tr,
        y_tr,
        eval_set=[(x_va, y_va)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )

    # Accuracy of the predicted class labels on the training and validation parts.
    y_tr_pred = model.predict(x_tr)
    y_va_pred = model.predict(x_va)
    metrics_tr = accuracy_score(y_tr, y_tr_pred)
    metrics_va = accuracy_score(y_va, y_va_pred)
    print(metrics_tr, metrics_va)
    metrics.append([fold, metrics_tr, metrics_va])

print('-' * 40)
metrics = np.array(metrics)
print(metrics)


----------------------------------------
[100]	valid_0's auc: 0.721937
0.824438202247191 0.7039106145251397
----------------------------------------
[100]	valid_0's auc: 0.736096
0.7573632538569425 0.7191011235955056
----------------------------------------
[100]	valid_0's auc: 0.75488
0.7026647966339411 0.6573033707865169
----------------------------------------
[100]	valid_0's auc: 0.730348
0.7685834502103787 0.6741573033707865
----------------------------------------
[100]	valid_0's auc: 0.746177
0.7769985974754559 0.7134831460674157
----------------------------------------
[[1.         0.8244382  0.70391061]
 [2.         0.75736325 0.71910112]
 [3.         0.7026648  0.65730337]
 [4.         0.76858345 0.6741573 ]
 [5.         0.7769986  0.71348315]]


In [7]:
# Feature importances of `model`, i.e. the classifier fitted in the last CV fold
# (importance_type is 'gain' per `params`).
print(model.feature_importances_)

[336.67414913 643.20775247 529.5688886 ]


In [8]:
# Predict with `model`, which at this point is the classifier fitted in the last CV fold.
# The feature columns are taken from train_x so the test features always match
# the ones the model was trained on.
y_test_pred = model.predict(test[list(train_x.columns)])
sub = pd.DataFrame({"PassengerId": test['PassengerId'], "Survived": y_test_pred})

In [9]:
# Write the submission file without the DataFrame index column
# (`index` is documented as a bool, so pass False rather than None).
sub.to_csv('second_submission.csv', index=False)