## Load Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv('/content/train_clean.csv')
dt = pd.read_csv('/content/test_clean.csv')
test_fe = pd.read_csv('/content/test_fe.csv')

In [3]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Cat,Fare_Cat,Title,Family,Family_Cat
0,0,3,0,22.0,1,0,7.25,0,0,0,0,2,0
1,1,1,1,38.0,1,0,71.2833,1,0,1,1,2,0
2,1,3,1,26.0,0,0,7.925,0,0,0,2,1,1
3,1,1,1,35.0,1,0,53.1,0,0,1,1,2,0
4,0,3,0,35.0,0,0,8.05,0,0,0,0,1,1


In [4]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Survived'])
y = df['Survived']
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.2, random_state=42)

## Ensemble Methods

In [5]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Model dasar untuk ensemble methods
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
adaboost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, random_state=42)
logistic_regression = LogisticRegression(random_state=42)

# Model stacking
estimators = [('random_forest', random_forest), ('adaboost', adaboost), ('logistic_regression', logistic_regression)]
stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

stacking.fit(X_train, y_train)
y_pred = stacking.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [6]:
accuracy = np.mean(y_pred == y_test)
print(f'Akurasi: {accuracy:.4f}')

Akurasi: 0.7921


In [7]:
y_test_pred = stacking.predict(dt)
dt_result = pd.DataFrame(test_fe['PassengerId'])
dt_result['Survived'] = y_test_pred
dt_result.to_csv('ensemble_fe.csv', index=False)

## Boosting

In [8]:
!pip install catboost
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [9]:
models = [
    ('AdaBoost', AdaBoostClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('XGBoost', XGBClassifier()),
    ('LightGBM', LGBMClassifier()),
    ('CatBoost', CatBoostClassifier(silent=True)) ]

results = []
for name, model in models:
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  results.append((name, accuracy, precision, recall, f1))

# Create comparison table
comparison_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

[LightGBM] [Info] Number of positive: 271, number of negative: 440
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 228
[LightGBM] [Info] Number of data points in the train set: 711, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381153 -> initscore=-0.484656
[LightGBM] [Info] Start training from score -0.484656


In [10]:
comparison_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,AdaBoost,0.808989,0.727273,0.811594,0.767123
1,Gradient Boosting,0.797753,0.726027,0.768116,0.746479
2,XGBoost,0.792135,0.735294,0.724638,0.729927
3,LightGBM,0.814607,0.764706,0.753623,0.759124
4,CatBoost,0.764045,0.684932,0.724638,0.704225


In [15]:
# Training with LightGBM
lgbm = LGBMClassifier()

lgbm.fit(X_train, y_train)
y_test_pred = lgbm.predict(dt)
dt_result = pd.DataFrame(test_fe['PassengerId'])
dt_result['Survived'] = y_test_pred
dt_result.to_csv('lgbm_prediction.csv', index=False)

[LightGBM] [Info] Number of positive: 271, number of negative: 440
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 228
[LightGBM] [Info] Number of data points in the train set: 711, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381153 -> initscore=-0.484656
[LightGBM] [Info] Start training from score -0.484656


## Another Comparison

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [12]:
models = [
    ('Logistic Regression', LogisticRegression()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Support Vector Machines', SVC()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Naive Bayes', GaussianNB()),
     ('Neural Network', MLPClassifier())
]

# Fungsi untuk evaluasi model
def evaluate_model(model, X_test, y_test):
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)

  return accuracy, precision, recall, f1

# Melatih dan mengevaluasi model
results = []
for name, model in models:
  model.fit(X_train, y_train)
  accuracy, precision, recall, f1 = evaluate_model(model, X_test, y_test)
  results.append([name, accuracy, precision, recall, f1])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
display(results_df)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.803371,0.72973,0.782609,0.755245
1,K-Nearest Neighbors,0.696629,0.61194,0.594203,0.602941
2,Support Vector Machines,0.679775,0.6875,0.318841,0.435644
3,Decision Tree,0.730337,0.62963,0.73913,0.68
4,Random Forest,0.747191,0.653846,0.73913,0.693878
5,AdaBoost,0.808989,0.727273,0.811594,0.767123
6,Gradient Boosting,0.797753,0.726027,0.768116,0.746479
7,Naive Bayes,0.775281,0.679012,0.797101,0.733333
8,Neural Network,0.814607,0.736842,0.811594,0.772414


In [16]:
# Training with LightGBM
mlp = MLPClassifier()

mlp.fit(X_train, y_train)
y_test_pred = mlp.predict(dt)
dt_result = pd.DataFrame(test_fe['PassengerId'])
dt_result['Survived'] = y_test_pred
dt_result.to_csv('mlp_prediction.csv', index=False)

