<a href="https://colab.research.google.com/github/ronggobp/Machine-Learning-Course-2026/blob/main/notebooks/week-02-eda/02_Titanic_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Library, W&B Setup & Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wandb

# Start a Weights & Biases run so the EDA steps below are tracked.
run = wandb.init(
    project="titanic-eda-2026",
    name="comprehensive-eda-hybrid",
)

# Read the Titanic CSV directly from its public GitHub location.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Snapshot the untouched frame as a W&B Table under the "raw_data" key.
raw_table = wandb.Table(dataframe=df)
wandb.log({"raw_data": raw_table})

print("Dataset Loaded Successfully!")
# Last expression: the notebook renders the first rows as a table.
df.head()

Inspeksi Mendalam (Mengenal Masalah)

In [None]:
# Structural overview: column names, dtypes and non-null counts.
print("--- Data Info ---")
df.info()

# Per-column count of missing values, used to decide what needs cleaning.
print("\n--- Missing Values ---")
print(df.isnull().sum())

# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numeric columns. Kept as the last expression so the notebook renders it
# as a table; the original comment announced this step but never ran it.
df.describe()

Data Cleaning (Sanitasi Data)

In [None]:
# 1. Fill missing Age values with the column median.
df['Age'] = df['Age'].fillna(df['Age'].median())

# 2. Fill missing Embarked values with the most frequent value (mode).
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# 3. Drop Cabin (the original note reports it as more than 70% empty).
#    errors='ignore' plus reassignment keeps this cell idempotent: running it
#    again after Cabin is already gone no longer raises KeyError.
df = df.drop(columns=['Cabin'], errors='ignore')

print("Cleaning Selesai. Sisa data kosong:", df.isnull().sum().sum())

Exploratory Data Analysis (Visualisasi Hybrid)

In [None]:
# One row of three panels, each drawn through its own explicit Axes handle.
fig, (ax_survived, ax_age, ax_sex) = plt.subplots(1, 3, figsize=(16, 5))

# Panel 1: number of passengers per Survived value.
sns.countplot(
    data=df, x='Survived', hue='Survived',
    palette='pastel', legend=False, ax=ax_survived,
)
ax_survived.set_title("Distribusi Keselamatan")

# Panel 2: histogram of Age with a KDE curve overlaid.
sns.histplot(df['Age'], kde=True, color='teal', ax=ax_age)
ax_age.set_title("Distribusi Umur Penumpang")

# Panel 3: mean of Survived per Sex. hue mirrors x and the legend is hidden,
# which the original author labelled "Warning-Free" (presumably to avoid
# seaborn's palette-without-hue warning).
sns.barplot(
    data=df, x='Sex', y='Survived', hue='Sex',
    palette='coolwarm', legend=False, ax=ax_sex,
)
ax_sex.set_title("Survival Rate: Pria vs Wanita")

fig.tight_layout()
plt.show()

Feature Engineering (Persiapan ML)

In [None]:
# 1. Build the FamilySize feature.
# Formula: FamilySize = SibSp + Parch + 1
# (the +1 presumably counts the passenger themself).
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# 2. Encoding: turn text columns into numbers.
# Sex: male=0, female=1.
# Guarded so the cell is safe to re-run: once Sex is numeric, mapping it again
# would find no matching keys and silently turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Embarked: integer category codes. Codes follow the sorted order of the
# distinct values, and any value still missing at this point becomes -1.
df['Embarked'] = df['Embarked'].astype('category').cat.codes

print("Feature Engineering Selesai. Data sekarang bersifat numerik.")
# Last expression: the notebook renders the first rows as a table.
df.head()

Analisis Korelasi & Final Logging

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated
# heatmap drawn on an explicit Axes.
corr_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', fmt=".2f", ax=ax)
ax.set_title("Matriks Korelasi Titanic")
plt.show()

# Upload the frame in its current state to W&B as a Table.
clean_table = wandb.Table(dataframe=df)
wandb.log({"final_processed_data": clean_table})

# Store the final column count in the run summary, then close the run.
wandb.run.summary["final_features_count"] = df.shape[1]
wandb.finish()