In [44]:
import numpy 
import sklearn
import pandas as pd
import xgboost as xgb


In [45]:
Data_full = pd.read_csv("train.csv", index_col = "PassengerId")

y = Data_full.Transported
X = Data_full.drop("Transported", axis=1)




cleaning

In [46]:
# column manipulation
X[['Deck', 'CabinNumber', 'PortSide']] = X['Cabin'].str.split('/', expand=True)

X["PortSide"]= X["PortSide"].map({"p":True, "S":False, })
DeckMap = {'G':7, 'F':6, 'E':5, 'D':4, 'C':3, 'B':2, 'A':1, 'T':0}
X["Deck"] = X["Deck"].map(DeckMap)

features = ["Age", "CryoSleep", "VIP", 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', "HomePlanet","Destination", 'Deck', 'CabinNumber', 'PortSide']
X = X[features]
X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X, y)

X["Deck"].value_counts()


Deck
6.0    2794
7.0    2559
5.0     876
2.0     779
3.0     747
4.0     478
1.0     256
0.0       5
Name: count, dtype: int64

In [47]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

spending_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
numericall_cols = ["Age", "CabinNumber"]
bool_cols = ["CryoSleep", "VIP", "PortSide"]           # True/False mode imputation only
catgorial_cols = ["Destination", "HomePlanet"]
Median_cols = ["Deck"]

spending_transformer = Pipeline([   # ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

numericall_transformer = Pipeline([        # ["Age", "CabinNumber"]
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

boolean_transformer = Pipeline([    #["CryoSleep", "VIP", "PortSide"]     
    ('imputer', SimpleImputer(strategy='most_frequent'))    # no one_hot_encode (try one hot encode later)
])

categorical_transformer = Pipeline([    # ["Destination", "HomePlanet"]
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

median_transformer = Pipeline([    # ["Deck"]
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('spending', spending_transformer, spending_cols),
    ('age', numericall_transformer, numericall_cols),
    ('bool', boolean_transformer, bool_cols),
    ('cat', categorical_transformer, catgorial_cols),
    ('median', median_transformer, Median_cols)
])



In [None]:
# train model
from sklearn.pipeline import Pipeline

Model = Pipeline([
    ('scaler', sklearn.preprocessing.StandardScaler()),
    ('model', xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
        use_label_encoder=False))
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Model)
])

pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_valid)
print("MAE:")
print(sklearn.metrics.mean_absolute_error(y_valid, y_preds) * 100)

# runs 
# 1 : 31.4543
# 2: 28.408
# 3: 27.7449
# 4: 27.9839
# 5: 21.3891
# 6: 20.5151


MAE:
0.20515179392824287


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
