In [1]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the available dataset files show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


In [2]:
import random
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from contextlib import contextmanager
from pathlib import Path
from typing import Optional
from tqdm.notebook import tqdm
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

import math

In [3]:
@contextmanager
def timer(name: str):
    """Context manager that prints a start line and an elapsed-time line.

    Prints ``[name] start`` on entry and ``[name] done in N s`` (whole
    seconds) once the ``with`` body finishes normally.

    Note: a later cell in this notebook defines another ``timer`` with a
    different signature; once that cell runs, this definition is shadowed.

    Args:
        name: Label placed inside the square brackets of both messages.
    """
    started_at = time.time()
    print(f"[{name}] start")
    yield
    # Only reached when the body did not raise.
    elapsed = time.time() - started_at
    print(f"[{name}] done in {elapsed:.0f} s")
    
    
def set_seed(seed=42):
    """Seed the Python and NumPy global random generators.

    Also stores ``seed`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable of the current process.

    Args:
        seed: Seed value passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [4]:
from pandas_profiling import ProfileReport
from matplotlib_venn import venn2
sns.set_style('darkgrid')

In [5]:
# Directory holding the competition CSV files (relative to the notebook's working directory).
INPUT_DIR = '../input/titanic'
# Directory where generated artifacts are written; created if it does not exist yet.
OUTPUT_DIR = './'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [6]:
# Read the train / test tables from INPUT_DIR so the data location is
# configured in exactly one place (the resulting paths are unchanged).
train_df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))

In [7]:
# Stack train and test rows into one frame with a fresh 0..n-1 index.
# Within the visible cells it is only referenced by a commented-out profiling report.
whole_df=pd.concat([train_df, test_df], ignore_index=True)

In [8]:
#train_df.dtypes

In [9]:
#report = ProfileReport(train_df)
#report.to_file(os.path.join(OUTPUT_DIR, 'train_report.html'))

In [10]:
#report = ProfileReport(test_df)
#report.to_file(os.path.join(OUTPUT_DIR, 'test_report.html'))

In [11]:
#report = ProfileReport(whole_df)
#report.to_file(os.path.join(OUTPUT_DIR, 'whole_report.html'))

In [12]:
#!pip install sweetviz
#import sweetviz as sv

In [13]:
#compare_report = sv.compare([train_df, 'Training Data'],
#                            [test_df, 'Test Data'])
#compare_report.show_html('./compare.html')

In [14]:

def create_sex_value(input_df):
    """Encode the ``Sex`` column as a 0/1 feature.

    ``'male'`` becomes 0; every other value (including missing) becomes 1.

    Args:
        input_df: DataFrame containing a ``Sex`` column. Any index is
            accepted; rows are processed by position.

    Returns:
        A new single-column DataFrame ``sex_value`` with a fresh 0..n-1
        index and one row per row of ``input_df``.
    """
    # Compare by position rather than looking rows up by label, so frames
    # with a non-default index (filtered / shuffled) work as well.
    is_male = input_df['Sex'].eq('male').to_numpy(dtype=bool)
    return pd.DataFrame({'sex_value': (~is_male).astype('int64')})


In [15]:

def create_family_size(input_df):
    """Build a scaled family-size feature.

    The value is ``(SibSp + Parch + 1) / 10`` for each row.

    Args:
        input_df: DataFrame containing ``SibSp`` and ``Parch`` columns. Any
            index is accepted; rows are processed by position.

    Returns:
        A new single-column DataFrame ``family_size`` with a fresh 0..n-1
        index and one row per row of ``input_df``.
    """
    # Work on the raw arrays so the result does not depend on index labels.
    siblings = input_df['SibSp'].to_numpy()
    parents = input_df['Parch'].to_numpy()
    return pd.DataFrame({'family_size': (siblings + parents + 1) / 10})


In [16]:
def create_sibsp_size(input_df):
    """Build a scaled sibling/spouse-count feature: ``SibSp / 4``.

    Args:
        input_df: DataFrame containing a ``SibSp`` column. Any index is
            accepted; rows are processed by position.

    Returns:
        A new single-column DataFrame ``sibsp_size`` with a fresh 0..n-1
        index and one row per row of ``input_df``.
    """
    # Positional array access keeps this independent of the frame's index labels.
    return pd.DataFrame({'sibsp_size': input_df['SibSp'].to_numpy() / 4})

In [17]:
def create_parch_size(input_df):
    """Build a scaled parent/child-count feature: ``Parch / 4``.

    Args:
        input_df: DataFrame containing a ``Parch`` column. Any index is
            accepted; rows are processed by position.

    Returns:
        A new single-column DataFrame ``parch_size`` with a fresh 0..n-1
        index and one row per row of ``input_df``.
    """
    # Positional array access keeps this independent of the frame's index labels.
    return pd.DataFrame({'parch_size': input_df['Parch'].to_numpy() / 4})

In [18]:

def create_embarked_value(input_df):
    """Encode the ``Embarked`` port as a numeric feature.

    Mapping: ``'S'`` -> 1, ``'Q'`` -> 0.6, ``'C'`` -> 0.2. Missing or any
    other value falls back to 1 (the same code as ``'S'``).

    Args:
        input_df: DataFrame containing an ``Embarked`` column. Any index is
            accepted; rows are processed by position.

    Returns:
        A new single-column float DataFrame ``embarked_value`` with a fresh
        0..n-1 index and one row per row of ``input_df``.
    """
    port_codes = {'S': 1.0, 'Q': 0.6, 'C': 0.2}
    # Unmapped / missing ports come back as NaN from map(); give them the fallback code.
    encoded = input_df['Embarked'].map(port_codes).fillna(1.0)
    # to_numpy() drops the original index so the result always starts at 0.
    return pd.DataFrame({'embarked_value': encoded.to_numpy(dtype=float)})

In [19]:

def create_fare_value(input_df):
    """Build a scaled fare feature: ``Fare / 100``.

    Missing fares stay missing (NaN) in the output.

    Args:
        input_df: DataFrame containing a ``Fare`` column. Any index is
            accepted; rows are processed by position.

    Returns:
        A new single-column DataFrame ``fare_value`` with a fresh 0..n-1
        index and one row per row of ``input_df``.
    """
    # Positional array access keeps this independent of the frame's index labels.
    return pd.DataFrame({'fare_value': input_df['Fare'].to_numpy() / 100})

In [20]:

def create_age_value(input_df):
    """Build a scaled age feature: ``Age / 50``.

    Missing ages stay missing (NaN) in the output.

    Args:
        input_df: DataFrame containing an ``Age`` column. Any index is
            accepted; rows are processed by position.

    Returns:
        A new single-column DataFrame ``age_value`` with a fresh 0..n-1
        index and one row per row of ``input_df``.
    """
    # Positional array access keeps this independent of the frame's index labels.
    return pd.DataFrame({'age_value': input_df['Age'].to_numpy() / 50})

In [21]:

def create_pclass_value(input_df):
    """Encode the ticket class ``Pclass`` as a numeric feature.

    Mapping: 3 -> 0, 2 -> 0.4, 1 -> 0.8. Missing or any other value falls
    back to 0 (the same code as class 3).

    Args:
        input_df: DataFrame containing a ``Pclass`` column. Any index is
            accepted; rows are processed by position.

    Returns:
        A new single-column float DataFrame ``pclass_value`` with a fresh
        0..n-1 index and one row per row of ``input_df``.
    """
    class_codes = {3: 0.0, 2: 0.4, 1: 0.8}
    # Unmapped / missing classes come back as NaN from map(); give them the fallback code.
    encoded = input_df['Pclass'].map(class_codes).fillna(0.0)
    # to_numpy() drops the original index so the result always starts at 0.
    return pd.DataFrame({'pclass_value': encoded.to_numpy(dtype=float)})

In [22]:
def create_cabin_value(input_df):
    """Encode the first letter of ``Cabin`` as a numeric feature.

    Mapping of the first character: ``B`` -> 0, ``C`` -> 0.2, ``G`` -> 0.4,
    ``D`` -> 0.6, ``E`` -> 0.8. Everything else (other letters, missing
    cabins, empty strings) becomes 1.

    Args:
        input_df: DataFrame containing a ``Cabin`` column. Any index is
            accepted; rows are processed by position.

    Returns:
        A new single-column float DataFrame ``cabin_value`` with a fresh
        0..n-1 index and one row per row of ``input_df``.
    """
    deck_codes = {'B': 0.0, 'C': 0.2, 'G': 0.4, 'D': 0.6, 'E': 0.8}
    # .str[0] yields NaN (instead of raising) for empty strings; missing
    # cabins never match a key either, so both end up with the fallback 1.0.
    first_letter = input_df['Cabin'].astype(str).str[0]
    encoded = first_letter.map(deck_codes).fillna(1.0)
    # to_numpy() drops the original index so the result always starts at 0.
    return pd.DataFrame({'cabin_value': encoded.to_numpy(dtype=float)})

In [23]:
from contextlib import contextmanager
from time import time

@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Context manager that reports the wall-clock time of its ``with`` body.

    The elapsed time is reported when the block exits, whether it finished
    normally or raised; an exception from the body still propagates.

    Args:
        logger: Object with an ``info(str)`` method. When falsy, the message
            is printed instead.
        format_str: ``str.format`` template that receives the elapsed
            seconds as its single positional argument.
        prefix: Optional value prepended (via ``str()``) to ``format_str``.
        suffix: Optional value appended (via ``str()``) to ``format_str``.
    """
    if prefix:
        format_str = str(prefix) + format_str
    if suffix:
        format_str = format_str + str(suffix)
    start = time()
    try:
        yield
    finally:
        # Report even when the timed block fails, so slow failures are visible.
        out_str = format_str.format(time() - start)
        if logger:
            logger.info(out_str)
        else:
            print(out_str)

In [24]:
from tqdm import tqdm

def to_feature(input_df):
    """Run every feature builder on ``input_df`` and join the results.

    Each builder is timed with ``timer`` and must return a frame with
    exactly as many rows as ``input_df``.

    Args:
        input_df: Raw passenger DataFrame passed unchanged to each builder.

    Returns:
        A DataFrame whose columns are the builders' outputs placed side by
        side, in the order the builders are listed.

    Raises:
        AssertionError: If a builder returns a frame of a different length;
            the message is the builder's name.
    """
    processors = [
        create_sex_value,
        create_family_size,
        create_sibsp_size,
        create_parch_size,
        create_embarked_value,
        create_fare_value,
        create_age_value,
        create_pclass_value,
        create_cabin_value,
    ]

    # Collect the pieces and concatenate once at the end; concatenating
    # inside the loop re-copies every previously built column each time.
    feature_frames = []

    for func in tqdm(processors, total=len(processors)):
        with timer(prefix='create ' + func.__name__ + ' '):
            _df = func(input_df)

        assert len(_df) == len(input_df), func.__name__
        feature_frames.append(_df)

    return pd.concat(feature_frames, axis=1)

In [25]:
# Build the model input features for both splits with the same builder pipeline.
train_feat_df = to_feature(train_df)
test_feat_df = to_feature(test_df)

100%|██████████| 9/9 [00:00<00:00, 62.12it/s]
100%|██████████| 9/9 [00:00<00:00, 122.17it/s]

create create_sex_value 0.013[s]
create create_family_size 0.024[s]
create create_sibsp_size 0.012[s]
create create_parch_size 0.012[s]
create create_embarked_value 0.016[s]
create create_fare_value 0.013[s]
create create_age_value 0.012[s]
create create_pclass_value 0.020[s]
create create_cabin_value 0.012[s]
create create_sex_value 0.006[s]
create create_family_size 0.011[s]
create create_sibsp_size 0.006[s]
create create_parch_size 0.006[s]
create create_embarked_value 0.009[s]
create create_fare_value 0.006[s]
create create_age_value 0.006[s]
create create_pclass_value 0.009[s]
create create_cabin_value 0.006[s]





In [26]:
# Persist the engineered features as CSV files in the working directory.
# The row index and the header line are both written (to_csv defaults).
train_feat_df2 = pd.DataFrame(train_feat_df)
train_feat_df2.to_csv('train_feat_df.csv')
# Alternative kept for reference: write bare values without index or header.
#train_feat_df2.to_csv('train_feat_df.csv', index=False, header=False)
test_feat_df2 = pd.DataFrame(test_feat_df)
test_feat_df2.to_csv('test_feat_df.csv')
# Alternative kept for reference: write bare values without index or header.
#test_feat_df2.to_csv('test_feat_df.csv', index=False, header=False)

In [27]:
train_feat_df

Unnamed: 0,sex_value,family_size,sibsp_size,parch_size,embarked_value,fare_value,age_value,pclass_value,cabin_value
0,0,0.2,0.25,0.0,1.0,0.072500,0.44,0.0,1.0
1,1,0.2,0.25,0.0,0.2,0.712833,0.76,0.8,0.2
2,1,0.1,0.00,0.0,1.0,0.079250,0.52,0.0,1.0
3,1,0.2,0.25,0.0,1.0,0.531000,0.70,0.8,0.2
4,0,0.1,0.00,0.0,1.0,0.080500,0.70,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,0,0.1,0.00,0.0,1.0,0.130000,0.54,0.4,1.0
887,1,0.1,0.00,0.0,1.0,0.300000,0.38,0.8,0.0
888,1,0.4,0.25,0.5,1.0,0.234500,,0.0,1.0
889,0,0.1,0.00,0.0,0.2,0.300000,0.52,0.8,0.2


In [28]:
from sklearn.metrics import average_precision_score
import lightgbm as lgbm

def pr_auc(y_true, y_pred):
    """Custom evaluation metric: average precision.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted scores for the same samples.

    Returns:
        A ``(name, value, is_higher_better)`` tuple: the literal metric name
        ``"pr_auc"``, the average-precision score, and ``True``.
    """
    return "pr_auc", average_precision_score(y_true, y_pred), True

def fit_lgbm(X, y, cv, params: dict=None, verbose=100, early_stopping_rounds=None):
    """Train one LightGBM classifier per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature matrix indexed with integer index arrays (``X[idx]``);
            the visible caller passes a NumPy array.
        y: Binary target array aligned with ``X``.
        cv: Iterable of ``(train_indices, valid_indices)`` pairs.
        params: Keyword arguments for ``LGBMClassifier``; ``None`` means
            library defaults.
        verbose: Evaluation logging period in boosting rounds.
        early_stopping_rounds: Rounds without validation improvement before
            training stops. ``None`` (default) reuses ``verbose``, which is
            what this function has always done.

    Returns:
        ``(oof_pred, models)``: the out-of-fold probability of class 1 for
        every sample, and the list of fitted classifiers in fold order.
    """
    if params is None:
        params = {}

    patience = verbose if early_stopping_rounds is None else early_stopping_rounds

    # Newer LightGBM releases configure early stopping / logging through
    # callbacks (the fit() keyword arguments were removed in 4.0); older
    # releases without `log_evaluation` still need the keyword arguments.
    if hasattr(lgbm, 'log_evaluation'):
        callbacks = []
        if patience and patience > 0:
            callbacks.append(lgbm.early_stopping(int(patience)))
        callbacks.append(lgbm.log_evaluation(int(verbose) if verbose else 0))
        fit_kwargs = {'callbacks': callbacks}
    else:
        fit_kwargs = {'early_stopping_rounds': patience, 'verbose': verbose}

    models = []

    # `np.float` was removed from NumPy; the builtin `float` is the same dtype.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv):

        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgbm.LGBMClassifier(**params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            clf.fit(x_train, y_train,
                    eval_set=[(x_valid, y_valid)],
                    eval_metric=pr_auc,
                    **fit_kwargs)

        # Each sample is in exactly one validation fold when `cv` is a
        # partition, so every slot of oof_pred is written once.
        pred_i = clf.predict_proba(x_valid)[:, 1]
        oof_pred[idx_valid] = pred_i
        models.append(clf)

        # Use the same 1-based fold number as the timer line above.
        print(f'Fold {i + 1} PR-AUC: {average_precision_score(y_valid, pred_i):.4f}')

    score = average_precision_score(y, oof_pred)
    # Backslash escaped explicitly; the printed text is unchanged.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models

In [29]:
# Hyper-parameters forwarded to lgbm.LGBMClassifier by fit_lgbm.
params = {
    'objective': 'binary',
    'learning_rate': 0.05,
    'max_depth': 8,
    # Deliberately huge upper bound: fit_lgbm trains with early stopping,
    # which is what actually ends training.
    'n_estimators': 10000000,
    'colsample_bytree': .5,
}

# Target labels as a plain NumPy array, so fit_lgbm can index it with fold index arrays.
y = train_df['Survived'].values

In [30]:
train_df['Survived']

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [31]:
y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [32]:
from sklearn.model_selection import StratifiedKFold

# 8-fold stratified split, shuffled with a fixed seed so the folds are repeatable.
fold = StratifiedKFold(n_splits=8, shuffle=True, random_state=71)
# Materialise the (train_idx, valid_idx) pairs as a list so they can be iterated more than once.
cv = list(fold.split(train_feat_df, y))

In [33]:
train_feat_df

Unnamed: 0,sex_value,family_size,sibsp_size,parch_size,embarked_value,fare_value,age_value,pclass_value,cabin_value
0,0,0.2,0.25,0.0,1.0,0.072500,0.44,0.0,1.0
1,1,0.2,0.25,0.0,0.2,0.712833,0.76,0.8,0.2
2,1,0.1,0.00,0.0,1.0,0.079250,0.52,0.0,1.0
3,1,0.2,0.25,0.0,1.0,0.531000,0.70,0.8,0.2
4,0,0.1,0.00,0.0,1.0,0.080500,0.70,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,0,0.1,0.00,0.0,1.0,0.130000,0.54,0.4,1.0
887,1,0.1,0.00,0.0,1.0,0.300000,0.38,0.8,0.0
888,1,0.4,0.25,0.5,1.0,0.234500,,0.0,1.0
889,0,0.1,0.00,0.0,0.2,0.300000,0.52,0.8,0.2


In [34]:
# Train one model per fold; `oof` holds the out-of-fold predictions, `models` the fitted classifiers.
oof, models = fit_lgbm(train_feat_df.values, y, cv, params=params)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.462459	valid_0's pr_auc: 0.866014
Early stopping, best iteration is:
[78]	valid_0's binary_logloss: 0.45297	valid_0's pr_auc: 0.866299
fit fold=1 0.273[s]
Fold 0 PR-AUC: 0.8663
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.41347	valid_0's pr_auc: 0.854921
Early stopping, best iteration is:
[50]	valid_0's binary_logloss: 0.432949	valid_0's pr_auc: 0.861536
fit fold=2 0.202[s]
Fold 1 PR-AUC: 0.8615
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.393689	valid_0's pr_auc: 0.857917
[200]	valid_0's binary_logloss: 0.38807	valid_0's pr_auc: 0.870942
Early stopping, best iteration is:
[163]	valid_0's binary_logloss: 0.3826	valid_0's pr_auc: 0.874331
fit fold=3 0.348[s]
Fold 2 PR-AUC: 0.8743
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.325332	valid_0's p

In [35]:
def visualize_importance(models, feat_train_df):
    """Plot the per-fold feature importances of the fitted models.

    Args:
        models: Fitted models exposing ``feature_importances_``, one per fold.
        feat_train_df: Training feature frame; its column names label the
            importances, so its column order must match the training data.

    Returns:
        ``(fig, ax)`` of a boxen plot with one box per feature, ordered by
        total importance across folds (at most the top 50 features).
    """
    # Build one small frame per fold and concatenate once, instead of
    # growing a frame inside the loop.
    per_fold = []
    for i, model in enumerate(models):
        _df = pd.DataFrame()
        _df['feature_importance'] = model.feature_importances_
        _df['column'] = feat_train_df.columns
        _df['fold'] = i + 1
        per_fold.append(_df)
    feature_importance_df = pd.concat(per_fold, axis=0, ignore_index=True)

    # Rank features by importance summed over folds; keep the 50 largest.
    order = feature_importance_df.groupby('column')['feature_importance']\
        .sum()\
        .sort_values(ascending=False).index[:50]

    # Widen the figure with the number of features, but never below 6 inches.
    fig, ax = plt.subplots(figsize=(max(6, len(order) * .4), 7))
    sns.boxenplot(data=feature_importance_df, x='column', y='feature_importance', order=order, ax=ax, palette='viridis')
    ax.tick_params(axis='x', rotation=90)
    ax.grid()
    fig.tight_layout()
    return fig, ax

In [36]:
# Predict the class-1 probability on the test features with every fold model
# (one row per model) ...
pred = np.array([model.predict_proba(test_feat_df.values)[:, 1] for model in models])
# ... then average over the models to get one probability per test row.
pred = np.mean(pred, axis=0)

#sub_df = pd.DataFrame({ 'target': pred })
#sub_df.to_csv(os.path.join(OUTPUT_DIR, 'titanic_submission.csv'), index=False)

In [37]:
pred

array([0.11279764, 0.24863331, 0.13173747, 0.2230103 , 0.42880179,
       0.2006597 , 0.49321524, 0.32452117, 0.64008459, 0.13196264,
       0.07125503, 0.191988  , 0.87419613, 0.12328849, 0.83009119,
       0.75200365, 0.17383175, 0.24757666, 0.45751579, 0.3840811 ,
       0.41918787, 0.31771886, 0.7810322 , 0.34136454, 0.81159223,
       0.09436733, 0.88148153, 0.24805027, 0.51396402, 0.3005384 ,
       0.16742362, 0.23643679, 0.57129338, 0.44998355, 0.52111966,
       0.23664653, 0.28646354, 0.25254914, 0.1342583 , 0.35890082,
       0.16832271, 0.59168231, 0.11592595, 0.78085266, 0.85520677,
       0.1809374 , 0.39217949, 0.17329265, 0.84630417, 0.59340421,
       0.42251249, 0.19713388, 0.76615955, 0.69082651, 0.21005001,
       0.12517792, 0.07308582, 0.15034035, 0.16333315, 0.92230072,
       0.09883457, 0.25631516, 0.15168189, 0.63674446, 0.59502232,
       0.7333892 , 0.66520241, 0.20569645, 0.56528205, 0.71952847,
       0.65334719, 0.07643446, 0.46032513, 0.64471002, 0.89787

In [38]:
# Turn the averaged probabilities into hard labels: below 0.5 -> 0, otherwise 1.
pred_life = np.where(np.asarray(pred) < 0.5, 0, 1).tolist()

# Take the ids from the test table itself rather than assuming they start at
# 892 and are contiguous, so every prediction stays attached to the passenger
# it was computed for.
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].to_numpy(),
    'Survived': pred_life,
})

# Written with a real header row and no index column.
submission_df.to_csv('titanic_lgb6_submission.csv', index=False)