# Import Library and Dataset

In [27]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [28]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Preprocessing

In [29]:
df_train.dropna(subset=['Embarked'], inplace=True)

In [30]:
df_train['Age'].fillna(df_train['Age'].median(), inplace=True)
df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Age'].fillna(df_train['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace=True)


In [31]:
df_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [32]:
df_train['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [33]:
from sklearn.preprocessing import LabelEncoder
sex_encoder = LabelEncoder()
df_train['Sex'] = sex_encoder.fit_transform(df_train['Sex'])
df_train['Sex']

encoded_df = pd.get_dummies(df_train['Embarked'], prefix='Embarked')

df_train = pd.concat([df_train, encoded_df], axis=1)
df_train = df_train.drop(columns='Embarked')

In [34]:
embarked_encoder = LabelEncoder()
df_train['Embarked_C'] = embarked_encoder.fit_transform(df_train['Embarked_C'])
df_train['Embarked_Q'] = embarked_encoder.transform(df_train['Embarked_Q'])
df_train['Embarked_S'] = embarked_encoder.transform(df_train['Embarked_S'])

In [35]:
df_train.to_csv('train_cleaned.csv', index=False)

In [36]:
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,0,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,0,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,28.0,1,2,W./C. 6607,23.4500,,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,1,0,0


In [37]:
df_train['Age'].isna().sum()

0

In [38]:
# X = df_train[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
# y = df_train['Survived']
X = df_train[['Pclass', 'Sex', 'Fare','Age']]
y = df_train['Survived']

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,Pclass,Sex,Fare,Age
708,1,0,151.5500,22.0
240,3,0,14.4542,28.0
382,3,1,7.9250,32.0
792,3,0,69.5500,28.0
683,3,1,46.9000,14.0
...,...,...,...,...
107,3,1,7.7750,28.0
271,3,1,0.0000,25.0
862,1,0,25.9292,48.0
436,3,0,34.3750,21.0


In [40]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Decision Tree

In [41]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
tree.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = tree.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[83 26]
 [16 53]]


0.7640449438202247

## svc

In [74]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Define parameter grid
from sklearn.model_selection import GridSearchCV 

# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
			'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
			'kernel': ['rbf', 'sigmoid']} 

# Create SVC classifier
svc = SVC()

# Grid search cross-validation
grid_search = GridSearchCV(estimator=svc,
                        param_grid=param_grid, 
                        cv=3, 
                        scoring='accuracy', 
                        verbose=2)
grid_search.fit(X_train, y_train)



Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.

In [75]:
# Print best parameters
print("Best parameters:", grid_search.best_params_)


Best parameters: {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}


In [76]:
from sklearn.svm import SVC
SVM = SVC(C=100, gamma=0.1, kernel = 'rbf')
SVM2 = SVC(C=1000, gamma=0.0001,kernel='rbf')
SVM.fit(X_train, y_train)
SVM2.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
print("Model 1 :")
y_pred = SVM.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))
accuracy_score(y_test, y_pred)
print("Model 2 :")
y_pred2 = SVM2.predict(X_test)
cm2 = confusion_matrix(y_test, y_pred2)
print(cm2)
print(accuracy_score(y_test, y_pred2))

Model 1 :
[[94 15]
 [15 54]]
0.8314606741573034
Model 2 :
[[89 20]
 [16 53]]
0.797752808988764


## Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[87 22]
 [15 54]]


0.7921348314606742

## Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)



[[87 22]
 [14 55]]


0.797752808988764

In [47]:
# feature_importances = classifier.feature_importances_
# data = pd.DataFrame(X_train, columns=['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S'])
# importances_df = pd.DataFrame({'Feature': data.columns, 'Importance': feature_importances})
# importances_df = importances_df.sort_values(by='Importance', ascending=False)

# plt.figure(figsize=(10, 6) ,)
# plt.barh(importances_df['Feature'], importances_df['Importance'])
# plt.xlabel('Importance')
# plt.ylabel('Feature')
# plt.title('Feature Importance')
# plt.show()

## Gradient Boosting

In [48]:
import xgboost as xgb
model = xgb.XGBClassifier()

# Definisikan daftar parameter yang akan diuji
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'objective': ['binary:logistic']
}

# Inisialisasi GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=3)

# Lakukan pencarian grid
grid_search.fit(X_train, y_train)

# Tampilkan parameter terbaik dan skor terbaik
print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV 1/3] END learning_rate=0.1, max_depth=3, n_estimators=100, objective=binary:logistic;, score=0.793 total time=   0.0s
[CV 2/3] END learning_rate=0.1, max_depth=3, n_estimators=100, objective=binary:logistic;, score=0.840 total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, n_estimators=100, objective=binary:logistic;, score=0.823 total time=   0.0s
[CV 1/3] END learning_rate=0.1, max_depth=3, n_estimators=200, objective=binary:logistic;, score=0.797 total time=   0.0s


[CV 2/3] END learning_rate=0.1, max_depth=3, n_estimators=200, objective=binary:logistic;, score=0.840 total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, n_estimators=200, objective=binary:logistic;, score=0.823 total time=   0.0s
[CV 1/3] END learning_rate=0.1, max_depth=3, n_estimators=300, objective=binary:logistic;, score=0.781 total time=   0.0s
[CV 2/3] END learning_rate=0.1, max_depth=3, n_estimators=300, objective=binary:logistic;, score=0.852 total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, n_estimators=300, objective=binary:logistic;, score=0.814 total time=   0.0s
[CV 1/3] END learning_rate=0.1, max_depth=5, n_estimators=100, objective=binary:logistic;, score=0.793 total time=   0.0s
[CV 2/3] END learning_rate=0.1, max_depth=5, n_estimators=100, objective=binary:logistic;, score=0.840 total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=5, n_estimators=100, objective=binary:logistic;, score=0.840 total time=   0.0s
[CV 1/3] END learning_ra

In [49]:
import xgboost as xgb

boost = xgb.XGBClassifier()

params = {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 300, 'objective': 'binary:logistic'}
boost = xgb.XGBClassifier(learning_rate=0.01, max_depth=7, n_estimators=300, objective='binary:logistic')

boost.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = boost.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)



[[94 15]
 [18 51]]


0.8146067415730337

## Naive Bayes


In [80]:

from sklearn.naive_bayes import GaussianNB
# Definisikan parameter grid
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5],  # Hanya parameter ini yang umum untuk GaussianNB
    'priors': [None, [0.6, 0.4], [0.5, 0.5], [0.7, 0.3]]
}
model = GaussianNB()

# Inisialisasi GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

# Lakukan pencarian grid
grid_search.fit(X_train, y_train)

# Tampilkan parameter terbaik dan skor terbaik
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
model = GaussianNB(priors=[0.7, 0.3], var_smoothing=1e-5)
model.fit(X_train, y_train)
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)


Best Parameters: {'priors': [0.7, 0.3], 'var_smoothing': 1e-05}
Best Score: 0.7892347089530187
[[90 15]
 [22 52]]


0.7932960893854749

# Visualization

# Modelling