# Titanic

## https://www.kaggle.com/competitions/titanic

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data_train.csv')

# Split Data

In [3]:
from sklearn.model_selection import train_test_split

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [4]:
X = df[['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Fare']]
y = df['Survived']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
X_train_asli = X_train.copy()

In [7]:
numeric_features     = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Imputer

In [8]:
from sklearn.impute import SimpleImputer

# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [9]:
# X_train.isnull().sum()

In [10]:
imputer_numeric = SimpleImputer(strategy='mean')
X_train[numeric_features] = imputer_numeric.fit_transform(X_train[numeric_features])

In [11]:
imputer_categorical = SimpleImputer(strategy='most_frequent')
X_train[categorical_features] = imputer_categorical.fit_transform(X_train[categorical_features])

In [12]:
# X_train.isnull().sum()

In [13]:
import joblib

joblib.dump(imputer_numeric, 'imputer_numeric.joblib')
joblib.dump(imputer_categorical, 'imputer_categorical.joblib')
print('done')

done


In [14]:
# X_train.head()

# Scaler

In [15]:
from sklearn.preprocessing import StandardScaler

# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [16]:
# X_train.describe()

In [17]:
scaler_numeric = StandardScaler()

X_train[numeric_features] = scaler_numeric.fit_transform(X_train[numeric_features])

In [18]:
joblib.dump(scaler_numeric, 'scaler_numeric.joblib')

print('done')

done


In [19]:
# X_train.head()

In [20]:
# X_train.describe()

# One Hot Encoder

In [21]:
from sklearn.preprocessing import OneHotEncoder

# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [22]:
# X_train[categorical_features].head()

In [23]:
encoder_categorical = OneHotEncoder()

X_train_encoded_array = encoder_categorical.fit_transform(X_train[categorical_features]).toarray()
# print(X_train_encoded_array)

In [24]:
joblib.dump(encoder_categorical, 'encoder_categorical.joblib')

print('done')

done


In [25]:
X_encoded_df = pd.DataFrame(X_train_encoded_array, columns=encoder_categorical.get_feature_names_out(categorical_features))
# X_encoded_df.head()

In [26]:
X_encoded_df.reset_index(drop=True, inplace=True)
X_train.reset_index(drop=True, inplace=True)

In [27]:
X_train = pd.concat([X_train, X_encoded_df], axis=1)
# X_train.head()

In [28]:
X_train.drop(['Pclass', 'Sex', 'Embarked'], axis=1, inplace=True)
# X_train.head()

In [29]:
# X_train_asli.head()

# Train Model Using Logistic Regression

In [61]:
from sklearn.ensemble import RandomForestClassifier

# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [62]:
clf_model = RandomForestClassifier(max_depth=2, random_state=0)
clf_model.fit(X_train, y_train)

In [63]:
# logreg_model = LogisticRegression(random_state=42, max_iter=1000, C=0.1)
# logreg_model.fit(X_train, y_train)

In [64]:
joblib.dump(clf_model, 'clf_model.joblib')

print('done')

done


from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split> from sklearn.naive_bayes import GaussianNB

# Simple Test

In [65]:
data_dict = {
    'Pclass': 3,
    'Name': 'Braund, Mr. Owen Harris',
    'Sex': 'male',
    'Age': 22,
    'SibSp': 1,
    'Parch': 0,
    'Ticket': 'A/5 21171',
    'Fare': 7.25,
    'Cabin': None,
    'Embarked': 'S'
}

In [66]:
import joblib

# Load the saved models
imputer_numeric = joblib.load('imputer_numeric.joblib')
imputer_categorical = joblib.load('imputer_categorical.joblib')
scaler_numeric = joblib.load('scaler_numeric.joblib')
encoder_categorical = joblib.load('encoder_categorical.joblib')
logreg_model = joblib.load('logreg_model.joblib')

In [67]:
new_data = pd.DataFrame([data_dict])
new_data.drop(['Name','Cabin','Ticket'], axis=1, inplace=True)

In [68]:
new_data[numeric_features] = imputer_numeric.transform(new_data[numeric_features])
new_data[categorical_features] = imputer_categorical.transform(new_data[categorical_features])

new_data[numeric_features] = scaler_numeric.transform(new_data[numeric_features])

new_data_encoded_array = encoder_categorical.transform(new_data[categorical_features]).toarray()
new_data_encoded_df = pd.DataFrame(new_data_encoded_array, columns=encoder_categorical.get_feature_names_out(categorical_features))

new_data.reset_index(drop=True, inplace=True)
new_data = pd.concat([new_data, new_data_encoded_df], axis=1)
new_data.drop(['Pclass', 'Sex', 'Embarked'], axis=1, inplace=True)

# new_data

In [69]:
predictions = logreg_model.predict(new_data)

print(predictions)

[0]


# X_test dan y_test

In [70]:
# X_test.head()

In [71]:
X_test[numeric_features] = imputer_numeric.transform(X_test[numeric_features])
X_test[categorical_features] = imputer_categorical.transform(X_test[categorical_features])

X_test[numeric_features] = scaler_numeric.transform(X_test[numeric_features])

X_test_encoded_array = encoder_categorical.transform(X_test[categorical_features]).toarray()
X_test_encoded_df = pd.DataFrame(X_test_encoded_array, columns=encoder_categorical.get_feature_names_out(categorical_features))

X_test.reset_index(drop=True, inplace=True)
X_test = pd.concat([X_test, X_test_encoded_df], axis=1)
X_test.drop(['Pclass', 'Sex', 'Embarked'], axis=1, inplace=True)

predictions = logreg_model.predict(X_test)

KeyError: "None of [Index(['Pclass', 'Sex', 'Embarked'], dtype='object')] are in the [columns]"

In [None]:
# predictions

In [72]:
# y_test

In [73]:
result_df = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True), pd.Series(predictions, name='Predictions')], axis=1)
# result_df.head()

In [74]:
from sklearn.metrics import accuracy_score

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

accuracy = accuracy_score(y_test, predictions)

print(f"Accuracy: {accuracy:.2f}")


ValueError: Found input variables with inconsistent numbers of samples: [179, 1]

In [75]:
from sklearn.metrics import recall_score

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html

recall = recall_score(y_test, predictions)

print(f"Recall: {recall:.2f}")


ValueError: Found input variables with inconsistent numbers of samples: [179, 1]

In [76]:
from sklearn.metrics import precision_score

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html

precision = precision_score(y_test, predictions)

print(f"Precision: {precision:.2f}")


ValueError: Found input variables with inconsistent numbers of samples: [179, 1]

In [77]:
from sklearn.metrics import f1_score

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

f1 = f1_score(y_test, predictions)

print(f"F1 Score: {f1:.2f}")


ValueError: Found input variables with inconsistent numbers of samples: [179, 1]

In [78]:
from sklearn.metrics import classification_report

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

report = classification_report(y_test, predictions)

print(report)

ValueError: Found input variables with inconsistent numbers of samples: [179, 1]

# Submissions

In [79]:
df_sub = pd.read_csv('data_test.csv')
df_sub_asli = df_sub.copy()

In [80]:
# df_sub.head()

In [81]:
df_sub.drop(['Name','Cabin','Ticket','PassengerId'], axis=1, inplace=True)

In [82]:
df_sub[numeric_features] = imputer_numeric.transform(df_sub[numeric_features])
df_sub[categorical_features] = imputer_categorical.transform(df_sub[categorical_features])

df_sub[numeric_features] = scaler_numeric.transform(df_sub[numeric_features])

df_sub_encoded_array = encoder_categorical.transform(df_sub[categorical_features]).toarray()
df_sub_encoded_df = pd.DataFrame(df_sub_encoded_array, columns=encoder_categorical.get_feature_names_out(categorical_features))

df_sub.reset_index(drop=True, inplace=True)
df_sub = pd.concat([df_sub, df_sub_encoded_df], axis=1)
df_sub.drop(['Pclass', 'Sex', 'Embarked'], axis=1, inplace=True)

predictions_sub = logreg_model.predict(df_sub)

In [83]:
predictions_sub

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [84]:
df_sub_asli.reset_index(drop=True, inplace=True)
df_sub_asli = pd.concat([df_sub_asli, pd.Series(predictions_sub, name='Survived')], axis=1)
# df_sub_asli.head()

In [85]:
df_sub_asli = df_sub_asli[['PassengerId','Survived']]
# df_sub_asli.head()

In [86]:
df_sub_asli.to_csv('hasil_prediksi.csv', index=False)