In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the two input tables. The paths are relative, so both CSV files must be
# in the current working directory. `train` is later read for its 'Survived'
# column (the target); `test` is later read for 'PassengerId'.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

def engineer_features(df):
    """Return a copy of ``df`` with engineered passenger features added.

    The input frame is not modified. It must provide the columns ``Sex``,
    ``Pclass``, ``Age``, ``SibSp``, ``Parch``, ``Fare``, ``Name`` and
    ``Embarked``.

    Columns added or overwritten on the returned copy:

    * ``IsFemale``      - 1 where ``Sex == 'female'``, else 0.
    * ``Pclass``        - cast to int.
    * ``Age``           - missing values filled with the median of ``df``.
    * ``IsChild``       - 1 where ``Age <= 12``.
    * ``IsBaby``        - 1 where ``Age <= 2``.
    * ``AgeGroup``      - 1..5 for the bins [0, 12], (12, 18], (18, 35],
      (35, 60], (60, inf).
    * ``FamilySize``    - ``SibSp + Parch + 1``.
    * ``IsAlone``       - 1 where ``FamilySize == 1``.
    * ``BigFamily``     - 1 where ``FamilySize >= 4``.
    * ``Fare``          - missing values filled with the median of ``df``.
    * ``FarePerPerson`` - ``Fare / FamilySize``.
    * ``Title``         - numeric code of the title parsed from ``Name``
      (Mr=1, Miss=2, Mrs=3, Master=4, Rare=5, anything else=0).
    * ``Embarked``      - missing filled with 'S', then S=0, C=1, Q=2.
    * ``WomanChild``    - 1 where the passenger is female or ``Age <= 12``.
    * ``ClassAge``      - ``Pclass * AgeGroup``.

    Args:
        df: pandas DataFrame holding the raw passenger columns listed above.

    Returns:
        A new DataFrame with the original columns plus the engineered ones.
    """
    df = df.copy()

    df['IsFemale'] = (df['Sex'] == 'female').astype(int)
    df['Pclass'] = df['Pclass'].astype(int)

    # The median is taken over whatever rows were passed in.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['IsChild'] = (df['Age'] <= 12).astype(int)
    df['IsBaby'] = (df['Age'] <= 2).astype(int)
    # include_lowest keeps an age of exactly 0 in the first bin, and the
    # open-ended last edge keeps ages above 100 in the last bin. Without
    # these, such rows fall outside every bin, become NaN, and the int cast
    # below raises.
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0, 12, 18, 35, 60, float('inf')],
        labels=[1, 2, 3, 4, 5],
        include_lowest=True,
    ).astype(int)

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['BigFamily'] = (df['FamilySize'] >= 4).astype(int)

    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # Raw string: a plain ' ...\.' literal contains the invalid escape "\."
    # and makes Python emit an invalid-escape-sequence warning.
    titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    titles = titles.replace(rare_titles, 'Rare')
    titles = titles.replace({'Mlle': 'Miss', 'Mme': 'Mrs'})
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    # Titles missing from the mapping (for example "Ms"), and names with no
    # parsable title, end up as 0.
    df['Title'] = titles.map(title_mapping).fillna(0)

    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    df['WomanChild'] = ((df['IsFemale'] == 1) | (df['Age'] <= 12)).astype(int)
    df['ClassAge'] = df['Pclass'] * df['AgeGroup']

    return df

# Stack train and test so both go through engineer_features in one pass.
# Because the function fills missing Age/Fare with the median of the frame it
# receives, those medians are computed over train and test rows combined.
full = pd.concat([train, test], sort=False)
full = engineer_features(full)

# Split back by position: `train` came first in the concat, so the first
# len(train) rows are the training rows and the remainder are the test rows.
train2 = full[:len(train)].copy()
test2 = full[len(train):].copy()

# Columns fed to the model.
features = ['IsFemale', 'Pclass', 'AgeGroup', 'IsChild', 'IsBaby', 'FamilySize', 'IsAlone',
            'FarePerPerson', 'Title', 'Embarked', 'WomanChild', 'ClassAge', 'BigFamily', 'Fare']

# The target is read from the original `train` frame; its rows are in the same
# order as `train2`.
X = train2[features]
y = train['Survived']

# Hold out 20% of the training rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = RandomForestClassifier(
    n_estimators=500,
    max_depth=7,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)
# Fit on the 80% split only. The model is not refit on all training rows
# before predicting the test set below.
model.fit(X_train, y_train)

# Report accuracy on the held-out validation split.
pred_val = model.predict(X_val)
acc = accuracy_score(y_val, pred_val)
print(f"\nAkurasi Lokal: {acc:.5f} → {acc*100:.2f}%")

pred_test = model.predict(test2[features])

# Build the submission table (PassengerId from the raw test frame plus the
# predicted label) and write it to CSV without the index column.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": pred_test.astype(int)
})
submission.to_csv("submission_titanic_day2.csv", index=False)
print("\nSubmission_titanic_day2.csv Sudah Jadi!")

print(submission.head(10))

  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)



Akurasi Lokal: 0.79330 → 79.33%

Submission_titanic_day2.csv Sudah Jadi!
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         0
5          897         0
6          898         1
7          899         0
8          900         1
9          901         0
