## Imports

In [2]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold

## Import the data

In [26]:
train = pd.read_pickle('processed_data/train.pkl')

In [27]:
# check for inf values in top500
print(np.any(np.isinf(train)))
# in which column is -inf
print(np.where(np.isinf(train)))



True
(array([  619,  1017,  1298,  2681,  4169,  6308,  6345,  6907,  6916,
        7072,  7489,  8773, 11560, 11633, 12537, 14202, 14504, 14613,
       15084, 15235, 19427, 19840, 20240, 22249, 23688, 23729, 24161,
       25830, 25922, 26847, 26856, 27552, 27985, 29198, 31102, 32026,
       32843, 33092, 34047, 34684, 35904, 37740, 37833, 38706, 39159,
       39556, 39647, 39816, 39848, 40052, 40219, 40257, 40450, 40724,
       41893, 42583, 42711, 42719, 44901, 46332, 46829, 48132, 49048,
       50165, 51736, 52083, 52184, 52345, 53749, 54038, 54788, 54983,
       58312, 58323, 60162, 60478, 60775, 61815, 63830, 65047, 65300,
       65522, 66409, 66918, 67611, 68229, 70328, 71117, 71208, 72003,
       72484, 72577, 72830, 77243, 77965, 77990, 78272, 79942, 80565,
       82011, 82698, 84535, 84826, 85623, 86308]), array([951, 951, 951, 951, 951, 951, 951, 951, 951, 951, 951, 951, 951,
       951, 951, 951, 951, 951, 951, 951, 951, 951, 951, 951, 951, 951,
       951, 951, 951, 951, 95

In [3]:
train_df = pd.read_pickle('processed_data/internal/internal_train.pkl')
test_df = pd.read_pickle('processed_data/internal/internal_test_X.pkl')

In [4]:
X = train_df.loc[
    :, ~train_df.columns.str.startswith("speciesId_")
]
species_columns = ["surveyId"] + list(
    train_df.columns[train_df.columns.str.startswith("speciesId_")]
)
y = train_df.loc[:, species_columns]

In [6]:
print(X.shape, y.shape)

(69917, 988) (69917, 501)


In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=113)

### Random Forest

In [8]:
rfc = RandomForestClassifier(random_state=113)

In [67]:
param_grid = {
    'n_estimators': [100, 150, 200],
    'min_samples_split': [5, 10, 15],
    'max_depth': [8, 9 ,10, 15, 20],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [68]:
# CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))


In [9]:
rfc.fit(X_train, y_train)

ValueError: Input X contains infinity or a value too large for dtype('float32').

In [70]:
print(f'Best params: {CV_rfc.best_params_}')
print(f'Best score: {CV_rfc.best_score_}')

Best params: {'criterion': 'entropy', 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 150}
Best score: 0.834324830099478


In [71]:
submission_rfc = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pipe_final_rfc.predict(X_test)
})

In [72]:
submission_rfc.to_csv('./data/submission_rfc_20240401.csv', index=False)

### Decision Tree

In [73]:
dtc = DecisionTreeClassifier(random_state=113)

In [74]:
param_grid = {
    'min_samples_split': [5, 10, 15],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [75]:
CV_dtc = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [76]:
pipe_final_dtc = make_pipeline(col_transform, CV_dtc)
pipe_final_dtc.fit(X_train, y_train)

In [77]:
print(f'Best params: {CV_dtc.best_params_}')
print(f'Best score: {CV_dtc.best_score_}')

Best params: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 15}
Best score: 0.8160346695557962


In [78]:
submission_dtc = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pipe_final_dtc.predict(X_test)
})

In [79]:
submission_dtc.to_csv('./data/submission_dtc_20240401.csv', index=False)

### K-Nearest Neighbors

In [80]:
knn = KNeighborsClassifier()

In [81]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}

In [82]:
CV_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [83]:
pipe_final_knn = make_pipeline(col_transform, CV_knn)
pipe_final_knn.fit(X_train, y_train)

In [84]:
print(f'Best params: {CV_knn.best_params_}')
print(f'Best score: {CV_knn.best_score_}')

Best params: {'algorithm': 'brute', 'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}
Best score: 0.7907416527134836


In [85]:
submission_knn = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": pipe_final_knn.predict(X_test)
})

In [86]:
submission_knn.to_csv('./data/submission_knn_20240401.csv', index=False)

### C-Support Vector Classifier 

In [87]:
svc = SVC(random_state=113)

In [88]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

In [89]:
CV_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [90]:
pipe_final_svc = make_pipeline(col_transform, CV_svc)
pipe_final_svc.fit(X_train, y_train)

In [91]:
print(f'Best params: {CV_svc.best_params_}')
print(f'Best score: {CV_svc.best_score_}')

Best params: {'C': 10, 'kernel': 'rbf'}
Best score: 0.8287205751994484


In [92]:
submission_svc = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pipe_final_svc.predict(X_test)
})

In [93]:
submission_svc.to_csv('./data/submission_svc_20240401.csv', index=False)

### Logisitic Regression

In [94]:
lr = LogisticRegression(random_state=113)

In [95]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}

In [96]:
CV_lr = GridSearchCV(estimator=lr, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [97]:
pipe_final_lr = make_pipeline(col_transform, CV_lr)
pipe_final_lr.fit(X_train, y_train)

In [98]:
print(f'Best params: {CV_lr.best_params_}')
print(f'Best score: {CV_lr.best_score_}')

Best params: {'C': 0.1}
Best score: 0.8104599625726386


In [99]:
submission_lr = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pipe_final_lr.predict(X_test)
})

In [100]:
submission_lr.to_csv('./data/submission_lr_20240401.csv', index=False)

### Gaussian Naive Bayes

In [101]:
gnb = GaussianNB()

In [102]:
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

In [103]:
CV_gnb = GridSearchCV(estimator=gnb, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [104]:
pipe_final_gnb = make_pipeline(col_transform, CV_gnb)
pipe_final_gnb.fit(X_train, y_train)

In [105]:
print(f'Best params: {CV_gnb.best_params_}')
print(f'Best score: {CV_gnb.best_score_}')

Best params: {'var_smoothing': 1e-09}
Best score: 0.7935782527331824


In [106]:
submission_gnb = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pipe_final_gnb.predict(X_test)
})

In [107]:
submission_gnb.to_csv('./data/submission_gnb_20240401.csv', index=False)

### XGBoost

In [108]:
xgb = XGBClassifier(random_state=113)


In [109]:
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.5, 0.7, 1],
    'colsample_bytree': [0.5, 0.7, 1],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 1, 5],
    'reg_lambda': [0, 1, 5]
}

In [110]:
CV_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [111]:
pipe_final_xgb = make_pipeline(col_transform, CV_xgb)
pipe_final_xgb.fit(X_train, y_train)

In [112]:
print(f'Best params: {CV_xgb.best_params_}')
print(f'Best score: {CV_xgb.best_score_}')

Best params: {'colsample_bytree': 0.5, 'gamma': 5, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.7}
Best score: 0.8371220328966806


In [113]:
submission_xgb = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pipe_final_xgb.predict(X_test)
})

In [114]:
submission_xgb.to_csv('./submissions/submission_xgb_20240401.csv', index=False)

### Ensemble

In [116]:
submission_ensemble = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': (pipe_final_rfc.predict(X_test) + pipe_final_dtc.predict(X_test) + pipe_final_knn.predict(X_test) + pipe_final_svc.predict(X_test) + pipe_final_lr.predict(X_test) + pipe_final_gnb.predict(X_test) + pipe_final_xgb.predict(X_test)) // 7
})

In [117]:
submission_ensemble.to_csv('./submissions/submission_ensemble_20240401.csv', index=False)

### AdaBoost

In [118]:
abc = AdaBoostClassifier()

In [119]:
dtc_2 = DecisionTreeClassifier(criterion = 'entropy', max_depth=10,min_samples_leaf=4, min_samples_split=10)
svc_2 = SVC(probability=True, C=10, kernel='rbf')
lr_2 = LogisticRegression(C=0.1)
lr_3 = LogisticRegression(C=0.2)
lr_4 = LogisticRegression(C=0.05)

In [120]:
param_grid = {
    'estimator': [dtc_2, svc_2, lr_2],
    'n_estimators':  [5, 10, 25, 50, 100],
    'algorithm': ['SAMME', 'SAMME.R'],
    'learning_rate': [(0.97 + x / 100) for x in range(1, 7)]
}

In [121]:
CV_abc = GridSearchCV(estimator=abc, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [122]:
pipe_final_abc= make_pipeline(col_transform, CV_abc)
pipe_final_abc.fit(X_train, y_train)



In [123]:
print(CV_abc.best_params_)
print(CV_abc.best_score_)

{'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=4,
                       min_samples_split=10), 'learning_rate': 1.0, 'n_estimators': 10}
0.803378311829016


In [124]:
submission_abc = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pipe_final_abc.predict(X_test)
})

In [125]:
submission_abc.to_csv('./submissions/submission_abc_20240402.csv', index=False)

### Extra Trees Classifier

In [126]:
etc = ExtraTreesClassifier()

In [127]:
param_grid = {
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "n_estimators" :[100,300],
}

In [128]:
CV_etc = GridSearchCV(estimator=etc, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [129]:
pipe_final_etc= make_pipeline(col_transform, CV_etc)
pipe_final_etc.fit(X_train, y_train)

In [130]:
print(CV_etc.best_params_)
print(CV_etc.best_score_)

{'max_features': 1, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 100}
0.8301191765980498


In [131]:
submission_etc = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pipe_final_etc.predict(X_test)
})

In [132]:
submission_etc.to_csv('./submissions/submission_etc_20240402.csv', index=False)

### Gradient Boosting Classifier

In [133]:
gbc = GradientBoostingClassifier()

In [134]:
param_grid = {
    'n_estimators' : [300, 400, 500],
    'learning_rate': [ 0.1, 0.3, 0.6, 1.0],
    'max_depth': [8, 10, 12],
    'min_samples_leaf': [50, 100, 120, 150],
    'max_features': [0.1, 0.3, 0.5]
}

In [135]:
CV_gbc = GridSearchCV(estimator=gbc, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [136]:
pipe_final_gbc= make_pipeline(col_transform, CV_gbc)
pipe_final_gbc.fit(X_train, y_train)

In [137]:
print(CV_gbc.best_params_)
print(CV_gbc.best_score_)

{'learning_rate': 0.1, 'max_depth': 12, 'max_features': 0.5, 'min_samples_leaf': 100, 'n_estimators': 500}
0.8258938244853737


In [138]:
submission_gbc = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pipe_final_gbc.predict(X_test)
})

In [139]:
submission_gbc.to_csv('./submissions/submission_gbc_20240402.csv', index=False)

### Voting Classifier

- Ada Boost Classifier weight 1
- Extra Trees Classifier weight 2
- Gaussian Naive Bayes weight 3

In [140]:
vc = VotingClassifier([('gbc', CV_gbc.best_estimator_),
                        ('etc', CV_etc.best_estimator_),
                        ('nb', CV_gnb.best_estimator_)
                        ], voting='hard', weights=[1,2,3] )

In [141]:
pipe_final_vc = make_pipeline(col_transform, vc)
pipe_final_vc.fit(X_train, y_train)

In [142]:
print(f'Best score: {cross_val_score(pipe_final_vc, X_train, y_train, cv=StratifiedKFold(n_splits=5)).mean()}')

Best score: 0.8258839751797499


In [143]:
submission_vc = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pipe_final_vc.predict(X_test)
})

In [144]:
submission_vc.to_csv('./submissions/submission_vc_20240402.csv', index=False)