# Machine Learning Project - Spaceship Titanic Kaggle

In [11]:
import numpy as np
import pandas as pd
import math
from hyperopt import tpe
from hyperopt import STATUS_OK
from hyperopt import Trials
from hyperopt import hp
from hyperopt import fmin
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [13]:
# Load the train and test tables and discard the 'Name' column before any
# feature engineering takes place.
df_train = pd.read_csv('train.csv').drop(columns=['Name'])
df_test = pd.read_csv('test.csv').drop(columns=['Name'])

### TRAITEMENT DES DONNEES ET CREATION DE NOUVELLES VARIABLES

In [14]:
def _one_hot_keep_na(values, prefix):
    """One-hot encode ``values`` while keeping missing entries missing.

    ``pd.get_dummies`` turns a missing category into an all-zero row, which a
    later imputation step cannot tell apart from "none of these categories".
    Rows whose source value is missing are therefore set to NaN in every
    generated column.

    Args:
        values: Series of category labels, possibly containing NaN.
        prefix: Prefix of the generated column names (``f"{prefix}_{label}"``).

    Returns:
        Float DataFrame (1.0 / 0.0 / NaN) sharing the index of ``values``.
    """
    dummies = pd.get_dummies(values, prefix=prefix, dtype=float)
    missing = values.isna().to_numpy()
    if missing.any() and dummies.shape[1]:
        dummies.loc[missing, :] = np.nan
    return dummies


def clean_df(df, test=False):
    """Encode the categorical columns of a raw passenger table.

    * ``VIP``, ``CryoSleep`` (and ``Transported`` unless ``test`` is true)
      have False/True replaced by 0/1.
    * ``HomePlanet``, ``Destination`` and the deck letter (first character of
      ``Cabin``) are one-hot encoded; rows where the source value is missing
      get NaN in the corresponding dummy columns.
    * The cabin side (last character of ``Cabin``) becomes ``Side`` with
      'P' -> 0 and 'S' -> 1; any other value becomes NaN.
    * ``Cabin``, ``HomePlanet`` and ``Destination`` are removed.

    The input frame is not modified.

    Args:
        df: DataFrame containing at least ``HomePlanet``, ``Destination``,
            ``VIP``, ``CryoSleep`` and ``Cabin`` (plus ``Transported`` when
            ``test`` is false).
        test: When true, the ``Transported`` column is not touched.

    Returns:
        A new DataFrame with the encoded columns appended after the remaining
        original columns (HomePlanet_*, Destination_*, Side, Deck_*).
    """
    res = df.copy()

    bool_cols = ['VIP', 'CryoSleep']
    if not test:
        bool_cols.append('Transported')
    for col in bool_cols:
        res[col] = res[col].replace([False, True], [0, 1])

    # Get the deck and the side from the cabin string; non-string cabins
    # (i.e. missing values) stay missing.
    cabin = res['Cabin']
    deck = pd.Series(
        [c[:1] if isinstance(c, str) else np.nan for c in cabin],
        index=cabin.index, dtype=object)
    side = pd.Series(
        [c[-1:] if isinstance(c, str) else np.nan for c in cabin],
        index=cabin.index, dtype=object)

    home_dummies = _one_hot_keep_na(res['HomePlanet'], 'HomePlanet')
    dest_dummies = _one_hot_keep_na(res['Destination'], 'Destination')
    deck_dummies = _one_hot_keep_na(deck, 'Deck')

    res = res.drop(columns=['HomePlanet', 'Destination', 'Cabin'])
    res = pd.concat([res, home_dummies, dest_dummies], axis=1)
    res['Side'] = side.map({'P': 0, 'S': 1})
    res = pd.concat([res, deck_dummies], axis=1)

    return res
# Encode both splits; test=True skips the 'Transported' conversion.
df_train = clean_df(df_train)
df_test = clean_df(df_test, test=True)

### GESTION DES VALEURS MANQUANTES (NA)

#### 1 - Remplir les valeurs manquantes avec la moyenne

In [5]:
def handleNasMean(df):
    """Impute missing values with mean/mode and add spending features.

    * ``Age`` and the five spending columns (``RoomService``, ``FoodCourt``,
      ``ShoppingMall``, ``Spa``, ``VRDeck``) are filled with their column mean.
    * Every other column is filled with its most frequent value; a column
      that is entirely missing is left as is.
    * ``Transported`` is dropped when present.
    * New columns: ``tot_bill`` (sum of the spending columns), ``avg_bill``
      (``tot_bill / 5``), ``max_bill`` (largest spending value) and one
      ``type_max_bill_<column>`` 0/1 flag per spending column marking where
      the largest value was found (first column wins on ties).

    The input frame is not modified.

    Args:
        df: Encoded passenger table containing ``Age`` and the five spending
            columns.

    Returns:
        A new DataFrame with imputed values and the extra feature columns.

    Raises:
        KeyError: If ``Age`` or one of the spending columns is missing.
    """
    bill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    numericals = ['Age'] + bill_cols

    # Work on a copy: without it a frame lacking 'Transported' would be
    # modified in place and later imputation strategies would receive data
    # that is already filled.
    imput = df.drop(columns=['Transported'], errors='ignore').copy()

    # For categorical / one-hot columns, impute with the most frequent value.
    categoricals = [col for col in imput.columns if col not in numericals]
    for col in categoricals:
        modes = imput[col].mode()
        if not modes.empty:
            imput[col] = imput[col].fillna(modes.iloc[0])

    # For numerical columns, impute with the mean.
    for col in numericals:
        imput[col] = imput[col].fillna(imput[col].mean())

    # Adding new features (computed column-wise instead of row by row).
    bills = imput[bill_cols].astype(float)
    imput['tot_bill'] = bills.sum(axis=1, skipna=False)
    imput['avg_bill'] = imput['tot_bill'] / len(bill_cols)
    imput['max_bill'] = bills.max(axis=1, skipna=False)

    # argmax returns the first position of the row maximum; rows whose
    # maximum is NaN get no flag at all.
    has_max = imput['max_bill'].notna().to_numpy()
    top = np.argmax(bills.to_numpy(), axis=1)
    for position, col in enumerate(bill_cols):
        imput[f'type_max_bill_{col}'] = ((top == position) & has_max).astype(int)

    return imput
# Build the mean/mode-imputed versions of the train and test tables.
df_train_na_mean = handleNasMean(df_train)
df_test_na_mean = handleNasMean(df_test)

#### 2 - Remplir les valeurs manquantes avec un algorithme KNN

In [15]:
def handleNasKnn(df):
    """Impute missing values with a KNN imputer and add spending features.

    * ``Transported`` is dropped when present.
    * ``PassengerId`` (when present) is set aside and re-attached untouched:
      it is an identifier, not a feature, and must not enter the distance
      computation.
    * Every remaining column is imputed with ``KNNImputer(n_neighbors=25)``.
    * Binary / one-hot columns (``CryoSleep``, ``VIP``, ``Side`` and every
      ``HomePlanet_*``, ``Destination_*``, ``Deck_*`` column) are snapped back
      to 0/1 with a 0.5 threshold, since the imputer produces neighbour
      averages.
    * New columns: ``tot_bill``, ``avg_bill``, ``max_bill`` and one
      ``type_max_bill_<column>`` 0/1 flag per spending column marking where
      the largest value was found (first column wins on ties).

    NOTE: the spending columns are not scaled before imputation.

    Args:
        df: Encoded passenger table; all columns other than ``PassengerId``
            and ``Transported`` must be numeric.

    Returns:
        A new DataFrame (same index and column order as the input, minus
        ``Transported``) with imputed values and the extra feature columns.
    """
    bill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    binary_names = ('CryoSleep', 'VIP', 'Side')
    one_hot_prefixes = ('HomePlanet_', 'Destination_', 'Deck_')

    source = df.drop(columns=['Transported'], errors='ignore')
    id_cols = [col for col in ('PassengerId',) if col in source.columns]
    features = source.drop(columns=id_cols)

    # Replace the NaNs; categorical columns are re-categorised further down.
    imputer = KNNImputer(n_neighbors=25)
    filled = pd.DataFrame(imputer.fit_transform(features),
                          columns=features.columns, index=features.index)
    imput = pd.concat([source[id_cols], filled], axis=1)[list(source.columns)]

    # For categorical data, re-categorise the neighbour averages to 0/1.
    # Columns are selected by name pattern so that no category is forgotten.
    for col in imput.columns:
        if col in binary_names or str(col).startswith(one_hot_prefixes):
            imput[col] = (imput[col] >= 0.5).astype(float)

    # Adding new features (computed column-wise instead of row by row).
    bills = imput[bill_cols].astype(float)
    imput['tot_bill'] = bills.sum(axis=1, skipna=False)
    imput['avg_bill'] = imput['tot_bill'] / len(bill_cols)
    imput['max_bill'] = bills.max(axis=1, skipna=False)

    # argmax returns the first position of the row maximum; rows whose
    # maximum is NaN get no flag at all.
    has_max = imput['max_bill'].notna().to_numpy()
    top = np.argmax(bills.to_numpy(), axis=1)
    for position, col in enumerate(bill_cols):
        imput[f'type_max_bill_{col}'] = ((top == position) & has_max).astype(int)

    return imput

# Build the KNN-imputed versions of the test and train tables.
df_test_na_knn = handleNasKnn(df_test)
df_train_na_knn = handleNasKnn(df_train)

#### 3 - Remplir les valeurs manquantes avec un algorithme RandomForest

In [None]:
def handleNasForest(df, test=False):
    """Impute missing values with a ``MissForest`` imputer.

    The ``Transported`` column is removed first when present; all remaining
    columns are passed to ``MissForest().fit_transform`` and the result is
    returned as a new DataFrame with the same column names.

    NOTE(review): ``MissForest`` is not imported in the visible part of this
    file - confirm where it is supposed to come from. The ``test`` argument
    is accepted but never read.
    """
    forest_imputer = MissForest()
    if 'Transported' in df.columns:
        features = df.drop(columns=['Transported'])
    else:
        features = df
    filled = forest_imputer.fit_transform(features)
    return pd.DataFrame(filled, columns=features.columns)

### C - MODELE DE MACHINE LEARNING

In [6]:
# Feature matrices: the imputed tables without the 'PassengerId' identifier.
# The target is always the 'Transported' column of the cleaned training frame.
X_train_na_mean = df_train_na_mean.drop(columns=['PassengerId'])
Y_train_na_mean = df_train['Transported']
X_test_na_mean = df_test_na_mean.drop(['PassengerId'], axis=1)

X_train_na_knn = df_train_na_knn.drop(columns=['PassengerId'])
Y_train_na_knn = df_train['Transported']
X_test_na_knn = df_test_na_knn.drop(['PassengerId'], axis=1)

# Import from CSV files: the random-forest-imputed tables are read from disk
# rather than recomputed in this notebook.
X_train_na_rf = pd.read_csv('df_train_na_rf_no_index.csv').drop(columns=['PassengerId'])
Y_train_na_rf = df_train['Transported']
X_test_na_rf = pd.read_csv('df_test_na_rf_no_index.csv').drop(columns=['PassengerId'])

#### 1 - Régression logistique

In [16]:
# Unpenalised logistic regression; the same estimator object is re-fitted on
# each imputed dataset in turn.
# NOTE(review): recent scikit-learn releases expect penalty=None instead of
# the string 'none' - confirm against the installed version.
modele_logit = LogisticRegression(penalty='none',solver='newton-cg')

# Fit on the mean-imputed data, keep the coefficients (intercept first) and
# write the test predictions as a submission file.
modele_logit.fit(X_train_na_mean,Y_train_na_mean)
df_coeff_na_mean=pd.DataFrame(np.concatenate([modele_logit.intercept_.reshape(-1,1),
                             modele_logit.coef_],axis=1),
             index = ["coef"],
             columns = ["constante"]+list(X_train_na_mean.columns)).T
res_logit_na_mean = modele_logit.predict(X_test_na_mean)
df_res_logit_na_mean=pd.DataFrame(df_test['PassengerId'])
df_res_logit_na_mean['Transported']=res_logit_na_mean
# Convert the 0/1 predictions back to booleans before export.
df_res_logit_na_mean['Transported']=df_res_logit_na_mean['Transported'].replace([0,1],[False, True])
df_res_logit_na_mean.set_index('PassengerId').to_csv('test_prediction_logit_na_mean.csv')

# Same steps on the KNN-imputed data.
modele_logit.fit(X_train_na_knn,Y_train_na_knn)
df_coeff_na_knn=pd.DataFrame(np.concatenate([modele_logit.intercept_.reshape(-1,1),
                             modele_logit.coef_],axis=1),
             index = ["coef"],
             columns = ["constante"]+list(X_train_na_knn.columns)).T
res_logit_na_knn = modele_logit.predict(X_test_na_knn)
df_res_logit_na_knn=pd.DataFrame(df_test['PassengerId'])
df_res_logit_na_knn['Transported']=res_logit_na_knn
df_res_logit_na_knn['Transported']=df_res_logit_na_knn['Transported'].replace([0,1],[False, True])
df_res_logit_na_knn.set_index('PassengerId').to_csv('test_prediction_logit_na_knn.csv')

# Same steps on the random-forest-imputed data.
modele_logit.fit(X_train_na_rf,Y_train_na_rf)
df_coeff_na_rf=pd.DataFrame(np.concatenate([modele_logit.intercept_.reshape(-1,1),
                             modele_logit.coef_],axis=1),
             index = ["coef"],
             columns = ["constante"]+list(X_train_na_rf.columns)).T
res_logit_na_rf = modele_logit.predict(X_test_na_rf)
df_res_logit_na_rf=pd.DataFrame(df_test['PassengerId'])
df_res_logit_na_rf['Transported']=res_logit_na_rf
df_res_logit_na_rf['Transported']=df_res_logit_na_rf['Transported'].replace([0,1],[False, True])
df_res_logit_na_rf.set_index('PassengerId').to_csv('test_prediction_logit_na_rf.csv')

#### 2 - Modèle KNN

In [8]:
# One default-parameter KNN classifier per imputed dataset.
model_knn_na_mean = KNeighborsClassifier()
model_knn_na_knn = KNeighborsClassifier()
model_knn_na_rf = KNeighborsClassifier()

# Fit on the mean-imputed data and export the test predictions
# (0/1 converted back to booleans) as a submission file.
model_knn_na_mean.fit(X_train_na_mean,Y_train_na_mean)
res_knn_na_mean = model_knn_na_mean.predict(X_test_na_mean)
df_res_knn_na_mean=pd.DataFrame(df_test['PassengerId'])
df_res_knn_na_mean['Transported']=res_knn_na_mean
df_res_knn_na_mean['Transported']=df_res_knn_na_mean['Transported'].replace([0,1],[False, True])
df_res_knn_na_mean.set_index('PassengerId').to_csv('test_prediction_knn_na_mean.csv')

# Same steps on the KNN-imputed data.
model_knn_na_knn.fit(X_train_na_knn,Y_train_na_knn)
res_knn_na_knn = model_knn_na_knn.predict(X_test_na_knn)
df_res_knn_na_knn=pd.DataFrame(df_test['PassengerId'])
df_res_knn_na_knn['Transported']=res_knn_na_knn
df_res_knn_na_knn['Transported']=df_res_knn_na_knn['Transported'].replace([0,1],[False, True])
df_res_knn_na_knn.set_index('PassengerId').to_csv('test_prediction_knn_na_knn.csv')

# Same steps on the random-forest-imputed data.
model_knn_na_rf.fit(X_train_na_rf,Y_train_na_rf)
res_knn_na_rf = model_knn_na_rf.predict(X_test_na_rf)
df_res_knn_na_rf=pd.DataFrame(df_test['PassengerId'])
df_res_knn_na_rf['Transported']=res_knn_na_rf
df_res_knn_na_rf['Transported']=df_res_knn_na_rf['Transported'].replace([0,1],[False, True])
df_res_knn_na_rf.set_index('PassengerId').to_csv('test_prediction_knn_na_rf.csv')

#### 3 - Gradient Boosting XGBoost Classifier

In [9]:
# One default-parameter XGBoost classifier per imputed dataset.
model_xgb_na_mean = XGBClassifier()
model_xgb_na_knn = XGBClassifier()
model_xgb_na_rf = XGBClassifier()


# Fit on the mean-imputed data and export the test predictions
# (0/1 converted back to booleans) as a submission file.
model_xgb_na_mean.fit(X_train_na_mean,Y_train_na_mean)
res_xgb_na_mean = model_xgb_na_mean.predict(X_test_na_mean)
df_res_xgb_na_mean=pd.DataFrame(df_test['PassengerId'])
df_res_xgb_na_mean['Transported']=res_xgb_na_mean
df_res_xgb_na_mean['Transported']=df_res_xgb_na_mean['Transported'].replace([0,1],[False, True])
df_res_xgb_na_mean.set_index('PassengerId').to_csv('test_prediction_xgb_na_mean.csv')

# Same steps on the KNN-imputed data.
model_xgb_na_knn.fit(X_train_na_knn,Y_train_na_knn)
res_xgb_na_knn = model_xgb_na_knn.predict(X_test_na_knn)
df_res_xgb_na_knn=pd.DataFrame(df_test['PassengerId'])
df_res_xgb_na_knn['Transported']=res_xgb_na_knn
df_res_xgb_na_knn['Transported']=df_res_xgb_na_knn['Transported'].replace([0,1],[False, True])
df_res_xgb_na_knn.set_index('PassengerId').to_csv('test_prediction_xgb_na_knn.csv')

# Same steps on the random-forest-imputed data.
model_xgb_na_rf.fit(X_train_na_rf,Y_train_na_rf)
res_xgb_na_rf = model_xgb_na_rf.predict(X_test_na_rf)
df_res_xgb_na_rf=pd.DataFrame(df_test['PassengerId'])
df_res_xgb_na_rf['Transported']=res_xgb_na_rf
df_res_xgb_na_rf['Transported']=df_res_xgb_na_rf['Transported'].replace([0,1],[False, True])
df_res_xgb_na_rf.set_index('PassengerId').to_csv('test_prediction_xgb_na_rf.csv')

In [None]:
# Feature importances of the three fitted XGBoost models, one table each.
# NOTE(review): the sort_values() results are not assigned, so the stored
# tables stay unsorted; presumably only the last sorted table is shown as
# the notebook cell output - confirm that this is intended.
ft_imp_na_mean=pd.DataFrame()
ft_imp_na_mean['Column_name']=X_train_na_mean.columns
ft_imp_na_mean['value']=model_xgb_na_mean.feature_importances_
ft_imp_na_mean.sort_values(by='value', ascending=False)

ft_imp_na_knn=pd.DataFrame()
ft_imp_na_knn['Column_name']=X_train_na_knn.columns
ft_imp_na_knn['value']=model_xgb_na_knn.feature_importances_
ft_imp_na_knn.sort_values(by='value', ascending=False)

ft_imp_na_rf=pd.DataFrame()
ft_imp_na_rf['Column_name']=X_train_na_rf.columns
ft_imp_na_rf['value']=model_xgb_na_rf.feature_importances_
ft_imp_na_rf.sort_values(by='value', ascending=False)

#### 4 - Modèle CatBoost Classifier

In [10]:
# One CatBoost classifier (training log silenced) per imputed dataset.
model_cbc_na_mean = CatBoostClassifier(verbose=False)
model_cbc_na_knn = CatBoostClassifier(verbose=False)
model_cbc_na_rf = CatBoostClassifier(verbose=False)


# Fit on the mean-imputed data and export the test predictions
# (0/1 converted back to booleans) as a submission file.
model_cbc_na_mean.fit(X_train_na_mean,Y_train_na_mean)
res_cbc_na_mean = model_cbc_na_mean.predict(X_test_na_mean)
df_res_cbc_na_mean=pd.DataFrame(df_test['PassengerId'])
df_res_cbc_na_mean['Transported']=res_cbc_na_mean
df_res_cbc_na_mean['Transported']=df_res_cbc_na_mean['Transported'].replace([0,1],[False, True])
df_res_cbc_na_mean.set_index('PassengerId').to_csv('test_prediction_cbc_na_mean.csv')

# Same steps on the KNN-imputed data.
model_cbc_na_knn.fit(X_train_na_knn,Y_train_na_knn)
res_cbc_na_knn = model_cbc_na_knn.predict(X_test_na_knn)
df_res_cbc_na_knn=pd.DataFrame(df_test['PassengerId'])
df_res_cbc_na_knn['Transported']=res_cbc_na_knn
df_res_cbc_na_knn['Transported']=df_res_cbc_na_knn['Transported'].replace([0,1],[False, True])
df_res_cbc_na_knn.set_index('PassengerId').to_csv('test_prediction_cbc_na_knn.csv')

# Same steps on the random-forest-imputed data.
model_cbc_na_rf.fit(X_train_na_rf,Y_train_na_rf)
res_cbc_na_rf = model_cbc_na_rf.predict(X_test_na_rf)
df_res_cbc_na_rf=pd.DataFrame(df_test['PassengerId'])
df_res_cbc_na_rf['Transported']=res_cbc_na_rf
df_res_cbc_na_rf['Transported']=df_res_cbc_na_rf['Transported'].replace([0,1],[False, True])
df_res_cbc_na_rf.set_index('PassengerId').to_csv('test_prediction_cbc_na_rf.csv')

In [None]:
# Feature importances of the three fitted CatBoost models, one table each.
# These assignments reuse (and overwrite) the ft_imp_* names.
# NOTE(review): the sort_values() results are not assigned, so the stored
# tables stay unsorted; presumably only the last sorted table is shown as
# the notebook cell output - confirm that this is intended.
ft_imp_na_mean=pd.DataFrame()
ft_imp_na_mean['Column_name']=X_train_na_mean.columns
ft_imp_na_mean['value']=model_cbc_na_mean.feature_importances_
ft_imp_na_mean.sort_values(by='value', ascending=False)

ft_imp_na_knn=pd.DataFrame()
ft_imp_na_knn['Column_name']=X_train_na_knn.columns
ft_imp_na_knn['value']=model_cbc_na_knn.feature_importances_
ft_imp_na_knn.sort_values(by='value', ascending=False)

ft_imp_na_rf=pd.DataFrame()
ft_imp_na_rf['Column_name']=X_train_na_rf.columns
ft_imp_na_rf['value']=model_cbc_na_rf.feature_importances_
ft_imp_na_rf.sort_values(by='value', ascending=False)

## D - OPTIMISATION DES HYPERPARAMETRES

#### 1 - Bayesian Search - essai non concluant

In [103]:
# Module-level evaluation budget.
# NOTE(review): nothing visible in this cell reads this constant (the search
# functions take their own argument) - confirm whether it is still needed.
MAX_EVALS = 100
# Training data read as globals by objective(): the random-forest-imputed set.
df_X = X_train_na_rf
df_Y = Y_train_na_rf

def objective(params):
    """Return the hold-out error rate of a CatBoost model built from ``params``.

    The global ``df_X`` / ``df_Y`` data is split 70/30 (a fresh, unseeded
    split on every call), a ``CatBoostClassifier`` is trained on the larger
    part with the ``depth``, ``iterations``, ``learning_rate`` and
    ``l2_leaf_reg`` entries of ``params``, and ``1 - accuracy`` on the
    held-out part is returned as the loss to minimise.
    """
    X_fit, X_val, y_fit, y_val = train_test_split(df_X, df_Y, test_size=0.3)

    classifier = CatBoostClassifier(
        depth=int(params['depth']),
        iterations=int(params['iterations']),
        learning_rate=params['learning_rate'],
        l2_leaf_reg=int(params['l2_leaf_reg']),
        verbose=False,
    )
    classifier.fit(X_fit, y_fit)

    predicted = np.array(classifier.predict(X_val))
    expected = np.array(y_val)
    accuracy = accuracy_score(expected, predicted)
    return 1 - accuracy

# Hyperopt search space for the CatBoost hyper-parameters.
space_param = {
    # integer depths 5..11
    "depth": hp.choice("depth", np.arange(5,12, dtype=int)),
    # continuous learning rate drawn uniformly between 0 and 1
    "learning_rate": hp.uniform("learning_rate", 0,1),
    # 1000..4900 in steps of 100
    "iterations": hp.choice("iterations", np.arange(1000,5000,100)),
    # 10..910 in steps of 100
    "l2_leaf_reg": hp.choice("l2_leaf_reg", np.arange(10,1000,100)),
}



def bayesian_opti(objective, space_param, MAX_EVALS=100):
    """Run one TPE search with hyperopt and report the best trial.

    Args:
        objective: Callable evaluated by ``fmin``; it returns the loss to
            minimise.
        space_param: Hyperopt search space passed to ``fmin``.
        MAX_EVALS: Number of evaluations given to ``fmin``.

    Returns:
        A tuple ``(df_best, loss)``: ``df_best`` is a DataFrame with one row
        per hyper-parameter holding the best values (``return_argmin=False``
        makes ``fmin`` return values rather than choice indices), ``loss`` is
        the loss of the best trial. The loss is also printed.
    """
    # The Trials object records every evaluation so the best one can be read back.
    history = Trials()
    best_values = fmin(
        fn=objective,
        space=space_param,
        algo=tpe.suggest,
        max_evals=MAX_EVALS,
        trials=history,
        return_argmin=False,
    )
    best_frame = pd.DataFrame.from_dict(best_values, orient='index')
    best_loss = history.best_trial['result']['loss']
    print(best_loss)
    return best_frame, history.best_trial['result']['loss']

def multi_bay_opti(objective, space_param, nb=10, max_evals=20):
    """Repeat the Bayesian search several times and collect each best result.

    Each round calls ``bayesian_opti`` and stores the best hyper-parameters
    together with the loss of the best trial. The table is written to
    'res_bayes_multisearch_hyperparam.csv' and returned.

    Args:
        objective: Objective function forwarded to ``bayesian_opti``.
        space_param: Hyperopt search space forwarded to ``bayesian_opti``.
        nb: Number of independent search rounds.
        max_evals: Evaluations per round (previously fixed at 20).

    Returns:
        DataFrame with one row per round and the columns ``depth``,
        ``iterations``, ``learning_rate``, ``l2_leaf_reg`` and ``SCORE``.
        ``SCORE`` holds the loss returned by ``bayesian_opti``, so lower is
        better.
    """
    param_names = ['depth', 'iterations', 'learning_rate', 'l2_leaf_reg']
    rows = []
    for step in range(nb):
        best_params, loss = bayesian_opti(objective, space_param, max_evals)
        # best_params has one row per hyper-parameter and a single value column.
        row = {name: best_params.loc[name].iloc[0] for name in param_names}
        row['SCORE'] = loss
        rows.append(row)
        print('Etape {} : SCORE de {}'.format(step + 1, loss))

    # Build the frame once from the collected rows: element-wise chained
    # assignment (final[col].iloc[i] = ...) is not guaranteed to write into
    # the frame and is a no-op under pandas copy-on-write.
    final = pd.DataFrame(rows, columns=param_names + ['SCORE'], index=range(nb))
    final.to_csv('res_bayes_multisearch_hyperparam.csv')
    return final

# Run a single search round (nb=1) and keep the resulting summary table.
final = multi_bay_opti(objective, space_param, 1)

100%|██████████| 20/20 [14:45<00:00, 44.29s/trial, best loss: 0.1875]            
0.1875
Etape 1 : SCORE de 0.1875


#### 2 - Grid Search

In [2]:
# Exhaustive grid search over CatBoost depth / learning rate / iterations
# (4 x 3 x 3 = 36 combinations, each evaluated with 3-fold cross-validation)
# on the random-forest-imputed training data.
model=CatBoostClassifier(verbose=False)
grid = {"depth": [6,8,10,11],
    "learning_rate": [0.001,0.01,0.1],
    "iterations": [1600,2500,4000]}

grid_srch_cat = GridSearchCV(estimator=model, param_grid=grid, cv=3)
grid_srch_cat.fit(X_train_na_rf, Y_train_na_rf)
# Keep the winning parameter combination for later inspection.
params = grid_srch_cat.best_params_
print("The best estimator across ALL searched params:",grid_srch_cat.best_estimator_)
print("The best score across ALL searched params:",grid_srch_cat.best_score_)
print("The best parameters across ALL searched params:",grid_srch_cat.best_params_)

## DERNIERE PREDICTION

In [105]:
# Final model: CatBoost with hard-coded hyper-parameters, trained on the
# random-forest-imputed data.
# NOTE(review): the variable is named model_cbc_na_mean although it is fitted
# on the *_na_rf data, and it overwrites any earlier model of that name.
model_cbc_na_mean = CatBoostClassifier(depth= 8,
        iterations= 3300,
        learning_rate= 0.1,
        l2_leaf_reg= 800,
        verbose=False)
model_cbc_na_mean.fit(X_train_na_rf, Y_train_na_rf)
res_cbc_na_rf = model_cbc_na_mean.predict(X_test_na_rf)
# Assemble the submission: PassengerId plus boolean 'Transported' predictions.
df_res_cbc_na_rf=pd.DataFrame(df_test['PassengerId'])
df_res_cbc_na_rf['Transported']=res_cbc_na_rf
df_res_cbc_na_rf['Transported']=df_res_cbc_na_rf['Transported'].replace([0,1],[False, True])
df_res_cbc_na_rf.set_index('PassengerId').to_csv('LAST_prediction_cbc_na_rf.csv')