# In [None]:
# =====================================================
# 1. Problem Statement
# =====================================================
# Tujuan:
# Memprediksi apakah penumpang Titanic selamat atau tidak berdasarkan data demografi dan tiket.
# Target: kolom "Survived" (0 = Tidak Selamat, 1 = Selamat)

# =====================================================
# 2. Import Library & Data Collection
# =====================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Load the training and test datasets from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Show the first 5 rows of the training data
print(train.head(5))

# =====================================================
# 3. Exploratory Data Analysis (EDA)
# =====================================================
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
train.info()
print(train.describe())

# Distribution of the target column
sns.countplot(x='Survived', data=train)
plt.title("Distribusi Survival")
plt.show()

# Survival counts split by gender
sns.countplot(x='Sex', hue='Survived', data=train)
plt.title("Survival berdasarkan Gender")
plt.show()

# Survival counts split by passenger class
sns.countplot(x='Pclass', hue='Survived', data=train)
plt.title("Survival berdasarkan Kelas Penumpang")
plt.show()

# =====================================================
# 4. Data Cleaning
# =====================================================
# Count missing values per column
print(train.isnull().sum())


def _fill_age_with_group_median(df):
    """Return ``df['Age']`` with gaps filled by the median Age of its (Pclass, Sex) group.

    ``transform('median')`` yields a Series aligned to ``df``'s own index, so
    the result can be assigned straight back to the frame. ``groupby.apply``
    with a lambda may instead return a result indexed by the group keys
    (pandas >= 2.0), which fails or misaligns on assignment.
    A group whose ages are all missing keeps its NaN values.
    """
    group_median = df.groupby(['Pclass', 'Sex'])['Age'].transform('median')
    return df['Age'].fillna(group_median)


# Impute Age with the per-(Pclass, Sex) median, computed within each dataset
train['Age'] = _fill_age_with_group_median(train)
test['Age'] = _fill_age_with_group_median(test)

# Fill missing Embarked in train with its most frequent value,
# and missing Fare in test with the test median
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# =====================================================
# 5. Feature Engineering
# =====================================================
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger)
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

# Extract the title from Name: the word that directly precedes a period.
# Raw string so that "\." reaches the regex engine without triggering an
# invalid-escape-sequence warning in the string literal.
title_pattern = r' ([A-Za-z]+)\.'
train['Title'] = train['Name'].str.extract(title_pattern, expand=False)
test['Title'] = test['Name'].str.extract(title_pattern, expand=False)

# Collapse titles that appear fewer than 10 times in train into "Rare"
title_counts = train['Title'].value_counts()
rare_titles = title_counts[title_counts < 10].index
train['Title'] = train['Title'].replace(rare_titles, 'Rare')
test['Title'] = test['Title'].replace(rare_titles, 'Rare')

# A title that occurs only in test is not in rare_titles and would reach the
# label encoder (fitted on train) as an unseen label, making transform raise.
# Fold every test title that train does not contain (including a missing
# title) into "Rare" as well.
known_titles = set(train['Title'].dropna())
test['Title'] = test['Title'].where(test['Title'].isin(known_titles), 'Rare')

# Encode the categorical columns as integer codes.
# Each column gets its own encoder, kept in `encoders`, so the mapping for
# every column stays available afterwards (e.g. for inverse_transform).
# A single shared encoder would be refit on each column and only remember
# the last one. `le` still ends up bound to the encoder of the last column.
label_cols = ['Sex', 'Embarked', 'Title']
encoders = {}
for col in label_cols:
    le = LabelEncoder()
    # Fit on train only; test is mapped with the same codes
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    encoders[col] = le

# Drop columns that are not used as features.
# `test` itself is left untouched; the reduced copy is stored in `test_data`.
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train = train.drop(columns=drop_cols)
test_data = test.drop(columns=drop_cols)

# =====================================================
# 6. Data Splitting
# =====================================================
# Separate the target from the feature columns
y = train['Survived']
X = train.drop(columns='Survived')

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler learns its statistics from the
# training split only and is then applied to every dataset
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
test_data_scaled = scaler.transform(test_data)

# =====================================================
# 7. Modeling
# =====================================================
# Candidate classifiers, keyed by the label used in the printed report
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["KNN"] = KNeighborsClassifier()

# Train each candidate on the training split and report its hold-out metrics
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(f"=== {name} ===")
    print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
    print(classification_report(y_true=y_test, y_pred=y_pred))

# =====================================================
# 8. Model Evaluation (example confusion matrix)
# =====================================================
# Fit a fresh Random Forest and predict on the hold-out split
best_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Plot the confusion matrix as an annotated heatmap with integer counts
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(data=cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - Random Forest")
plt.show()

# =====================================================
# 9. Hyperparameter Tuning
# =====================================================
# Grid of Random Forest settings to search over
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
)

# Exhaustive search with 5-fold cross-validation on the training split
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
