In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# データ読み込み

In [None]:
# Load the competition files: labelled train set, unlabelled test set and
# the sample submission that serves as the output template.
train, test, submit = (
    pd.read_csv('../input/spaceship-titanic/train.csv'),
    pd.read_csv('../input/spaceship-titanic/test.csv'),
    pd.read_csv('../input/spaceship-titanic/sample_submission.csv'),
)

# EDA

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train.info()

In [None]:
# Column dtypes and non-null counts of the raw test frame.
test.info()

In [None]:
# Peek at the sample submission to see the expected output layout.
submit.head()

In [None]:
!pip install sweetviz
import sweetviz as sv

In [None]:
# # train のEDA
# my_report_train = sv.analyze(train)
# my_report_train.show_html("sweetviz_report_Spaceship_train_V1.html")

# # train と test の関係
# my_report_trainVStest = sv.compare([train, "Train"], [test, "Test"], "Transported")
# my_report_trainVStest.show_html("sweetviz_report_Spaceship_trainVStest_V1.html")

# 前処理 
1. NaNがあるかないか
2. Cabin分裂(deck(encoding),side(encoding),num(そのまま))
3. サービス系(RoomService, FoodCourt, ShoppingMall, Spa, VRDeck)の欠損値をLightGBMで予測して補完
4. HomePlanetとDestination合併
5. 家族(nameから)
6. 同室人数
7. カテゴリ変数の欠損値補完
8. カテゴリ変数の変換 (HomePlanet・Destination・CryoSleep・VIP・Transportedを数値変換)
9. 同室確認
10. サービス料合計
11. 不要な列を削除
12. clipping
13. binning

In [None]:
# Stack train (without the target column) on top of test so that features are
# engineered on both sets at once; the combined frame gets a fresh 0..n-1 index.
df = pd.concat(
    [train.drop(columns="Transported"), test],
    ignore_index=True,
)

In [None]:
# Missing-value indicators: add one 0/1 column named "Nan_<column>" for every
# column except PassengerId (1 = the value is missing in that row).
nan_flags = df.drop(columns="PassengerId").isna().astype(int).add_prefix("Nan_")
df = pd.concat([df, nan_flags], axis=1)

In [None]:
# Split the "deck/num/side" Cabin string into three columns
# (deck and side are encoded later, num is kept as a number).
cabin_parts = df["Cabin"].str.split("/", expand=True)

df = df.assign(
    Cabin_Deck=cabin_parts[0],
    # The split yields object dtype, which LightGBM cannot consume, so the
    # cabin number is cast to float (float rather than int because of NaN).
    Cabin_Num=cabin_parts[1].astype(float),
    Cabin_Side=cabin_parts[2],
)

In [None]:
import lightgbm as lgbm
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.simplefilter('ignore')

In [None]:
# Label-encode the categorical features used by the missing-value imputation models.

NaN_cat_columns_df = ["HomePlanet", "CryoSleep", "Cabin_Deck"]

# Columns that are not used as features when learning the missing values.
NaN_drop_list = [
    "PassengerId", "Cabin",
    'Nan_HomePlanet', 'Nan_CryoSleep', 'Nan_Cabin', 'Nan_Destination',
    'Nan_Age', 'Nan_VIP', 'Nan_RoomService', 'Nan_FoodCourt', 'Nan_ShoppingMall',
    'Nan_Spa', 'Nan_VRDeck', 'Nan_Name',
    "Cabin_Side", "Destination", "VIP", "Name",
]
df_NaN = df.drop(columns=NaN_drop_list)

# A fresh encoder per column: each column gets its own integer code book.
for cat_col in NaN_cat_columns_df:
    df_NaN[cat_col] = LabelEncoder().fit_transform(df_NaN[cat_col])

In [None]:
# Impute missing service spending (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck):
# for each column a LightGBM regressor is trained on the rows where the column is
# observed and then used to predict the rows where it is missing.

Service_list = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# The hyper-parameters do not depend on the column, so they are built once
# instead of on every loop iteration.
params = {
    "objective": "regression",
    'metric': 'rmse',
    "learning_rate": .1,
    "reg_lambda": .1,
    "reg_alpha": 0,
    "max_depth": 5,
    "n_estimators": 10000,  # upper bound on boosting rounds; early stopping ends training sooner
    "colsample_bytree": .5,
    "min_child_samples": 10,
    "subsample_freq": 3,
    "subsample": .9,
    "random_state": 1,
    'verbose': -1
}

for column_service in Service_list:
    NonExist_NaN_df = df_NaN[df_NaN[column_service].notna()]  # rows where the column is observed
    Only_NaN_df = df_NaN[df_NaN[column_service].isna()]  # rows where the column is missing

    if Only_NaN_df.empty:
        # Nothing to impute: skip training and avoid predicting on an empty frame.
        continue

    NaN_train_X_row = NonExist_NaN_df.drop([column_service], axis=1)
    NaN_train_y_row = NonExist_NaN_df[column_service]
    NaN_test_X = Only_NaN_df.drop([column_service], axis=1)

    # Hold out 25% of the observed rows for early stopping and for the RMSE report.
    NaN_train_X, NaN_valid_X, NaN_train_y, NaN_valid_y = train_test_split(
        NaN_train_X_row, NaN_train_y_row, test_size=0.25, random_state=42
    )

    lgb_NaN_train = lgbm.Dataset(NaN_train_X, NaN_train_y)
    lgb_NaN_valid = lgbm.Dataset(NaN_valid_X, NaN_valid_y)

    gbm = lgbm.train(
        params,
        train_set=lgb_NaN_train,
        valid_sets=[lgb_NaN_valid],
        callbacks=[early_stopping(stopping_rounds=100, verbose=False),
                   log_evaluation(0)],
    )

    NaN_valid_y_pred = gbm.predict(NaN_valid_X)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and no longer exists in recent scikit-learn.
    NaN_score = np.sqrt(mean_squared_error(y_true=NaN_valid_y, y_pred=NaN_valid_y_pred))
    print(f'{column_service}:RMSE={NaN_score}\n')

    # Show the gain-based feature importance of the imputation model.
    importance = pd.DataFrame(
        gbm.feature_importance(importance_type='gain'),
        index=NaN_train_X.columns,
        columns=['importance'],
    )
    importance = importance.sort_values('importance', ascending=False)
    display(importance)
    print("-" * 50)

    # Predictions keep the index of the missing rows so fillna aligns them
    # with the right rows of df.
    NaN_test_y = pd.Series(data=gbm.predict(NaN_test_X), index=NaN_test_X.index)

    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form works on a temporary object and does not
    # update df under pandas Copy-on-Write.
    df[column_service] = df[column_service].fillna(NaN_test_y)

In [None]:
# Number of missing values left in each column after the service-column imputation.
df.isna().sum()

In [None]:
# Combine HomePlanet and Destination into a single route feature by string
# concatenation; the result is missing whenever either part is missing.

df["Home×Dest"] = df["HomePlanet"].str.cat(df["Destination"])

In [None]:
# Family feature taken from Name: the second space-separated token.

name_parts = df["Name"].str.split(" ", expand=True)
df = df.assign(Family=name_parts[1])

In [None]:
# Number of passengers sharing the same cabin.
# Rows whose Cabin is missing belong to no group and therefore get NaN.

room_sizes = df.groupby("Cabin").size().rename("SameRoomNum")
df = df.join(room_sizes, on="Cabin")

In [None]:
# Shared-cabin flag: 1 when more than one passenger is in the cabin,
# 0 for a single occupant or an unknown cabin (NaN > 1 evaluates to False).

df["SameRoomBinary"] = df["SameRoomNum"].gt(1).astype(int)

In [None]:
# Total spending per passenger across all service columns.

df = df.assign(Service_Sum=df.loc[:, Service_list].sum(axis="columns"))

In [None]:
# Convert the categorical columns to integer codes
# (HomePlanet, Destination, CryoSleep, VIP, Cabin_Deck, Cabin_Side, Home×Dest, Family).

from sklearn.preprocessing import LabelEncoder

cat_columns_df = [
    "HomePlanet", "Destination", "CryoSleep", "VIP",
    "Cabin_Deck", "Cabin_Side", "Home×Dest", "Family",
]

# A fresh encoder per column: each column gets its own integer code book.
for cat_col in cat_columns_df:
    df[cat_col] = LabelEncoder().fit_transform(df[cat_col])

In [None]:
# 前処理後の train のEDA
my_report_train = sv.analyze(train)
my_report_train.show_html("sweetviz_report_Spaceship_train_V2.html")

# 前処理後の train と test の関係
my_report_trainVStest = sv.compare([train, "Train"], [test, "Test"], "Transported")
my_report_trainVStest.show_html("sweetviz_report_Spaceship_trainVStest_V2.html")

In [None]:
# List every column present after feature engineering (used to pick the columns to drop).
df.columns

In [None]:
# Remove the columns that are not used as model features.
drop_list = ['PassengerId', 'Cabin', 'Name', "VIP", "Destination", "HomePlanet", "SameRoomNum"]

df = df.drop(columns=drop_list)

In [None]:
# Display the final feature frame (train rows followed by test rows).
df

In [None]:
# clipping

In [None]:
# binning

# 学習

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, log_loss

In [None]:
# Split the engineered frame back by position: the first len(train) rows came
# from train, the rest from test. Note that `test` is rebound here from the
# raw test frame to the engineered test features.
train_X, test = df.iloc[:len(train)], df.iloc[len(train):]
train_y = train["Transported"].astype(int)

In [None]:
class LightGBMCV:
    """Train one LightGBM model per cross-validation fold and ensemble them.

    Parameters
    ----------
    fold : object
        Splitter exposing ``split(X, y)`` that yields ``(train_idx, valid_idx)``
        index pairs (for example a scikit-learn ``StratifiedKFold``).
    params : dict, optional
        Keyword arguments forwarded to ``lgbm.LGBMModel``. When omitted, a
        default configuration with a binary objective is used.
    """

    def __init__(self, fold, params=None):
        if params is None:
            self.params = {
                "objective": "binary", 
                "learning_rate": .1,
                "reg_lambda": .1,
                "reg_alpha": 0,
                "max_depth": 5, 
                "n_estimators": 10000, 
                "colsample_bytree": .5, 
                "min_child_samples": 10,
                "subsample_freq": 3,
                "subsample": .9,
                "importance_type": "gain", 
                "random_state": 1
            }
        else:
            self.params = params
        self.fold = fold

    @property
    def models(self):
        """List of fitted per-fold models (available after ``fit``)."""
        return self._models

    @property
    def pred_array(self):
        """Out-of-fold predictions for the training rows (available after ``fit``)."""
        return self._pred_array

    @staticmethod
    def _score(score_func, y_true, y_pred, **kwargs):
        """Evaluate ``score_func``, converting scores to 0/1 labels when it needs labels."""
        if score_func in (accuracy_score, f1_score):
            # Same 0.5 threshold everywhere (np.rint would send exactly 0.5 to 0).
            y_pred = np.where(y_pred >= 0.5, 1, 0)
        return score_func(y_true, y_pred, **kwargs)

    def fit(self, X, y, early_stopping, score_func, **kwargs):
        """Fit one model per fold and print per-fold and overall scores.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features; the column names are kept for the importance plot.
        y : pandas.Series
            Training target.
        early_stopping : callable
            LightGBM callback placed in the ``callbacks`` list of every fold's fit.
        score_func : callable
            Metric called as ``score_func(y_true, y_pred, **kwargs)``.
            ``accuracy_score`` and ``f1_score`` receive labels thresholded at
            0.5; any other metric receives the raw predictions.
        **kwargs
            Extra arguments for ``score_func`` only. They are deliberately not
            passed to ``model.predict``, so scorer options cannot leak into
            the prediction call.
        """
        self._feature_name = X.columns
        X, y = X.values, y.values
        self._models = []
        self._pred_array = np.zeros(len(y), dtype=np.float32)

        for i, (idx_train, idx_valid) in enumerate(self.fold.split(X, y)):
            X_train, y_train = X[idx_train], y[idx_train]
            X_valid, y_valid = X[idx_valid], y[idx_valid]

            model = lgbm.LGBMModel(**self.params)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_valid, y_valid)],
                callbacks=[early_stopping, log_evaluation(period=0, show_stdv=False)]
            )
            self._models.append(model)

            y_pred = model.predict(X_valid)
            # Each row is predicted by the one model that did not train on it.
            self._pred_array[idx_valid] = y_pred
            score = self._score(score_func, y_valid, y_pred, **kwargs)
            print(f" - fold{i + 1} - {score: .4f}")

        total_score = self._score(score_func, y, self._pred_array, **kwargs)
        print(f": {total_score: .4f}")

    def predict(self, test):
        """Return the mean of the per-fold model predictions for ``test``.

        Parameters
        ----------
        test : pandas.DataFrame
            Features with the same columns, in the same order, as used in ``fit``.
        """
        test = test.values
        pred = np.array([model.predict(test) for model in self._models])
        return np.mean(pred, axis=0)

    @property
    def df_feature_importance(self):
        """Long-format importance table (available after ``visualize_importance``)."""
        return self._df_feature_importance

    def visualize_importance(self, top_num=10):
        """Plot the per-fold importance distribution of the top features.

        Features are ranked by their importance summed over all folds and the
        ``top_num`` highest ones are drawn as a boxen plot.
        """
        fig, ax = plt.subplots(1, 1, figsize=(max(8, 1.2*top_num), 20))

        # Build every fold's frame first and concatenate once, rather than
        # growing a DataFrame inside the loop.
        fold_frames = [
            pd.DataFrame({
                "feature_importance": clf.feature_importances_,
                "feature_name": self._feature_name,
                "fold": idx + 1,
            })
            for idx, clf in enumerate(self._models)
        ]
        self._df_feature_importance = pd.concat(fold_frames)

        order = (
            self._df_feature_importance
            .groupby("feature_name")["feature_importance"]
            .sum()
            .sort_values(ascending=False)
            .index[:top_num]
        )

        sns.boxenplot(
            x="feature_importance", 
            y="feature_name", 
            data=self._df_feature_importance, 
            order=order, ax=ax
        )
        ax.grid()

        plt.show()

# fit

In [None]:
# 5-fold stratified, shuffled split with a fixed seed; LightGBMCV uses its default parameters.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
model = LightGBMCV(fold=fold)

In [None]:
%%time
# Train one model per fold with early stopping (100 rounds, silent) and report accuracy.
model.fit(
    train_X, 
    train_y, 
    early_stopping=early_stopping(100, verbose=False),
    score_func=accuracy_score
)

In [None]:
# Fold-averaged predictions for the test rows.
predict_prob = model.predict(test)

# Overlay the out-of-fold (train) and test score distributions to check
# that the two look alike.
fig, ax = plt.subplots(figsize=(10, 6))
shared_hist_kwargs = dict(bins=50, alpha=0.5, ax=ax, stat="density")
for scores, label, color in (
    (model.pred_array, "Out Of Fold (Train)", "tab:blue"),
    (predict_prob, "Test", "tab:red"),
):
    sns.histplot(x=scores, label=label, color=color, **shared_hist_kwargs)

ax.set_title("Probability Density")
ax.legend()
plt.show()

In [None]:
# Plot the 25 features with the largest importance summed over the folds.
model.visualize_importance(top_num=25)

In [None]:
# Inspect the fold-averaged test predictions.
predict_prob

In [None]:
# spaceship_tiatanic.competition_submit(
#     submit.assign(Transported=np.where(predict_prob >= 0.5, True, False)),
#     message="cv; StratifiedKfold(5) features; add CabinCount DestinationFromDepature AgeByHomePlanetMean AgeByHomePlanetStd LuxuryBilledAmount LuxuryBilledCount, without VIP from Baseline",
#     file_name="7th_sub",
#     path="submission"
# )

In [None]:
# Threshold the averaged test predictions at 0.5 to get the boolean
# Transported label, then write the submission file.
is_transported = predict_prob >= 0.5
sub = submit.assign(Transported=is_transported)

sub.to_csv("spaceship_StratifiedKFold_5-fold_CV_Yuiki's_Model_Service_fullNaN.csv", index=False)

# 課題
1. trainとtestをdfでまとめて特徴量を作る
2. for文を減らして実行速度を上げる
3. 