In [1]:
# Neptune

In [2]:
import os
# !pip install swifter

In [3]:
# Path 

In [4]:
train_path = "../../data/train.csv"
test_path = "../../data/test.csv"

In [5]:
print("data")
print(train_path)
print(test_path)

data
../../data/train.csv
../../data/test.csv


In [6]:
# Load data 

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import precision_recall_curve, accuracy_score
import seaborn as sns
from tqdm import tqdm
import swifter

In [8]:
import re


In [9]:
df_train = pd.read_csv(train_path)

In [10]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [11]:
df_train.count()

PassengerId    100000
Survived       100000
Pclass         100000
Name           100000
Sex            100000
Age             96708
SibSp          100000
Parch          100000
Ticket          95377
Fare            99866
Cabin           32134
Embarked        99750
dtype: int64

In [12]:
df_test = pd.read_csv(test_path)

In [13]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,100000,3,"Holliday, Daniel",male,19.0,0,0,24745,63.01,,S
1,100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,5.81,,S
2,100002,1,"Harris, Heather",female,19.0,0,0,25990,38.91,B15315,C
3,100003,2,"Larsen, Eric",male,25.0,0,0,314011,12.93,,S
4,100004,1,"Cleary, Sarah",female,17.0,0,2,26203,26.89,B22515,C


# Feature Extraction 

In [14]:
from sklearn import preprocessing

## Encoding Sex

In [15]:
sex_encoder = preprocessing.LabelEncoder()

In [16]:
sex_encoder.fit(list(df_train["Sex"]) + list(df_test["Sex"]))

LabelEncoder()

In [17]:
df_train["Sex_feature"] = sex_encoder.transform(df_train["Sex"])

In [18]:
df_test["Sex_feature"] = sex_encoder.transform(df_test["Sex"])

## Encoding Embarked

In [19]:
embarked_encoder = preprocessing.LabelEncoder()
df_train["Embarked"] = df_train["Embarked"].fillna("UNK")
df_test["Embarked"] = df_test["Embarked"].fillna("UNK")

In [20]:
embarked_encoder.fit(list(df_train["Embarked"]) + list(df_test["Embarked"]))

LabelEncoder()

In [21]:
df_train["Embarked_feature"] = embarked_encoder.transform(df_train["Embarked"])

In [22]:
df_test["Embarked_feature"] = embarked_encoder.transform(df_test["Embarked"])

## Extract Cabin

In [23]:
cabin_encoder = preprocessing.LabelEncoder()

In [24]:
def extract_cabin_type(cabin_text):
    if cabin_text:
        cabin_type = cabin_text[0]
        return cabin_type
    else:
        return "U"

In [25]:
df_train["Cabin_type"] = df_train["Cabin"].fillna("U").swifter.apply(extract_cabin_type)


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

In [26]:
df_test["Cabin_type"] = df_test["Cabin"].fillna("U").swifter.apply(extract_cabin_type)

Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

In [27]:
cabin_encoder.fit(list(df_train["Cabin_type"]) + list(df_test["Cabin_type"]))

LabelEncoder()

In [28]:
df_train["Cabin_type_feature"] = cabin_encoder.transform(df_train["Cabin_type"])

In [29]:
df_test["Cabin_type_feature"] = cabin_encoder.transform(df_test["Cabin_type"])

## Extract Ticket text

In [30]:
def extract_text(text):
    result = ""
    if text:
        text = str(text)
        result = "".join(re.findall("[a-zA-Z]+", text))
    else:
        result = "NULL"
    if result == "":
        result = "NO_TEXT"
    return result

In [31]:
df_train["Ticket_type"] = df_train["Ticket"].fillna("UNK").swifter.apply(extract_text)

Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

In [32]:
df_test["Ticket_type"] = df_test["Ticket"].fillna("UNK").swifter.apply(extract_text)

Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

In [33]:
ticket_encoder = preprocessing.LabelEncoder()

In [34]:
ticket_encoder.fit(list(df_train["Ticket_type"]) + list(df_test["Ticket_type"]))

LabelEncoder()

In [35]:
df_train["Ticket_type_feature"] = ticket_encoder.transform(df_train["Ticket_type"])

In [36]:
df_test["Ticket_type_feature"] = ticket_encoder.transform(df_test["Ticket_type"])

## Choice feature

In [37]:
choice_feature = ["Sex_feature", "Age", "Fare", "SibSp", "Pclass", "Embarked_feature", "Cabin_type_feature", "Ticket_type_feature"]

In [38]:
X = df_train[choice_feature]
y = df_train["Survived"]

In [39]:
X_test = df_test[choice_feature]


In [40]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=3041975)

In [41]:
lgb_train_data = lgb.Dataset(data=X_train, label=y_train, categorical_feature=["Sex_feature", "Embarked_feature", "Pclass", "SibSp", "Cabin_type_feature"])

In [42]:
lgb_val_data = lgb.Dataset(data=X_val, label=y_val, reference=lgb_train_data)

In [43]:
# lgbm_params = {
#     'boosting': 'gbdt',          # dart (drop out trees) often performs better
#     'application': 'binary',     # Binary classification
#     'learning_rate': 0.02,       # Learning rate, controls size of a gradient descent step
#     'min_data_in_leaf': 100,      # Data set is quite small so reduce this a bit
#     'feature_fraction': 0.7,     # Proportion of features in each boost, controls overfitting
#     'metric': 'auc',  # Area under ROC curve as the evaulation metric,
#     'lambda_l1': 5e-05, 
#     'lambda_l2': 1.35e-08, 
#     'num_leaves': 216, 
#     'feature_fraction': 0.7458519562366559, 
#     'bagging_fraction': 0.7835116194444349, 
#     'bagging_freq': 2, 
#     'min_child_samples': 95
# }

In [44]:
lgbm_params = {
    'boosting': 'gbdt',          # dart (drop out trees) often performs better
    'application': 'binary',     # Binary classification
    'learning_rate': 0.001,       # Learning rate, controls size of a gradient descent step
    # 'feature_fraction': 0.7,     # Proportion of features in each boost, controls overfitting
    'metric': 'auc',  # Area under ROC curve as the evaulation metric,
    'lambda_l1': 5e-05, 
    'lambda_l2': 1.35e-08, 
    'num_leaves': 216, 
}

In [45]:
# lgbm_params = {
#     'boosting': 'gbdt',          # dart (drop out trees) often performs better
#     'objective': 'binary',     # Binary classification
# }

In [46]:
# experiment = neptune.create_experiment(name='LightGBM-training', params=lgbm_params)

In [47]:
num_boost_round = 5000
# from neptunecontrib.monitoring.lightgbm import neptune_monitor
model = lgb.train(lgbm_params, lgb_train_data, valid_sets = [lgb_train_data, lgb_val_data], valid_names=["train_data", "val_data"], verbose_eval=500, num_boost_round=num_boost_round)

New categorical_feature is ['Cabin_type_feature', 'Embarked_feature', 'Pclass', 'Sex_feature', 'SibSp']


[LightGBM] [Info] Number of positive: 38563, number of negative: 51437
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 471
[LightGBM] [Info] Number of data points in the train set: 90000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.428478 -> initscore=-0.288064
[LightGBM] [Info] Start training from score -0.288064
[500]	train_data's auc: 0.860491	val_data's auc: 0.848766
[1000]	train_data's auc: 0.862735	val_data's auc: 0.849931
[1500]	train_data's auc: 0.865083	val_data's auc: 0.850717
[2000]	train_data's auc: 0.867171	val_data's auc: 0.850858
[2500]	train_data's auc: 0.869234	val_data's auc: 0.850798
[3000]	train_data's auc: 0.871457	val_data's auc: 0.850651
[3500]	train_data's auc: 0.873722	val_data's auc: 0.850343
[4000]	train_data's auc: 0.876056	val_data's auc: 0.850222
[4500]	train_data's auc: 0.878213	val_data's auc: 0.850173
[5000]	train_data'

# Evaluation 

In [48]:
flag = False

In [49]:
if "local" in train_path:
    flag = True

In [50]:
flag

False

In [51]:
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [52]:
if flag:
    y_test = df_test["Survived"]
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    df_curve = pd.DataFrame({"precision": precision, "recall": recall})
    

In [53]:
if flag:
    sns.lineplot(data=df_curve)

In [54]:
thresholds_meow = [i*0.01 for i in range(1,100)]

In [55]:
if flag: 
    acc_list = []
    for threshold in thresholds_meow:
        y_pred_one = [1 if y > threshold else 0 for y in y_pred]
        accuracy = accuracy_score(y_test, y_pred_one)
        acc_list.append(accuracy)
    df_acc = pd.DataFrame({"threshold":thresholds_meow, "accuracy":acc_list})
    ax = sns.lineplot(data=df_acc, x="threshold", y="accuracy")
    ax.set_ylim(0, 1)

In [56]:
if flag:
    max_acc = max(acc_list)
    max_index = acc_list.index(max_acc)
    best_threshold = thresholds_meow[max_index]
    print("Best Acc ", max_acc)
    print("Best threshold ", best_threshold)



## Online submission

In [58]:
if not flag:
    best_threshold = 0.5
    y_pred_one = [1 if y > best_threshold else 0 for y in y_pred]
    df_submission = pd.DataFrame({"PassengerId":df_test["PassengerId"], "Survived": y_pred_one})
    display(df_submission)
    df_submission.to_csv("submission_lgb5.csv", index=False)

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,0
2,100002,1
3,100003,0
4,100004,1
...,...,...
99995,199995,1
99996,199996,0
99997,199997,0
99998,199998,1
