--------
### **Código Inicial** ###

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [56]:
# Read the training and test CSV files into separate DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (r"data/train.csv", r"data/test.csv")
)

In [None]:
# Fill missing ages with each frame's own rounded median age.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a selected column is a chained assignment, which pandas deprecates and
# which does not update the parent DataFrame when Copy-on-Write is enabled.
df_train["Age"] = df_train["Age"].fillna(round(df_train["Age"].median()))
df_test["Age"] = df_test["Age"].fillna(round(df_test["Age"].median()))

# Encode sex as 0 (male) / 1 (female).
# The test column must be derived from df_test itself; building it from
# df_train['Sex'] (already converted to 0/1 on the line above) maps every
# value to NaN and aligns on the wrong frame's index.
df_train['Sex'] = df_train['Sex'].map({'male': 0, 'female': 1})
df_test['Sex'] = df_test['Sex'].map({'male': 0, 'female': 1})

In [None]:
# Columns fed to the baseline model.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
X = df_train.loc[:, features]    # Features (model input)
y = df_train.loc[:, 'Survived']  # Target (what we want to predict)

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: a 100-tree random forest; fit() returns the fitted estimator itself.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print(f'Acuracia: {val_accuracy}')

In [None]:
# Fill missing test fares with the test-set median fare.
# Assign the result back instead of using a chained fillna(..., inplace=True):
# the chained form is deprecated and leaves df_test unchanged under pandas
# Copy-on-Write, so the NaN fares would still reach the model.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())

# Predict on the test frame using the same feature columns the model was fit on.
X_test = df_test[features]
test_predictions = model.predict(X_test)

In [None]:
# Two-column table: passenger id plus the predicted label, written without the index.
submission = df_test[['PassengerId']].assign(Survived=test_predictions)
submission.to_csv('submission.csv', index=False)

--------
### **Código com Melhorias** ###

In [57]:
# Integer code for each title group; anything outside these keys becomes NaN
# after mapping and is filled with the median code.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# Infrequent honorifics that are collapsed into the single "Rare" group.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']


def add_engineered_features(df):
    """Return a copy of ``df`` with cleaned columns and engineered features.

    The same steps are applied to the training and the test frame, so they live
    in one function instead of two copy-pasted blocks.

    Steps:
      * ``Age``: missing values filled with the frame's rounded median age.
      * ``Sex``: encoded as 0 (male) / 1 (female).
      * ``FamilySize``: ``SibSp + Parch``.
      * ``IsAlone``: 1 when ``FamilySize`` is 0, else 0.
      * ``IsChild``: 1 when the (filled) age is below 18, else 0.
      * ``Title``: honorific extracted from ``Name``, rare ones grouped, encoded
        with ``title_mapping``; unmapped/missing titles get the median code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``Age``, ``Sex``, ``SibSp``, ``Parch`` and ``Name``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    out = df.copy()

    # Assign the filled column back rather than calling fillna(inplace=True) on a
    # selected column: the chained form is deprecated and does nothing under
    # pandas Copy-on-Write.
    out['Age'] = out['Age'].fillna(round(out['Age'].median()))

    # Only encode while the column still holds the raw labels. Mapping an
    # already-numeric column (cell re-run, or the baseline section ran first)
    # would turn every value into NaN.
    if not pd.api.types.is_numeric_dtype(out['Sex']):
        out['Sex'] = out['Sex'].map({'male': 0, 'female': 1})

    out['FamilySize'] = out['SibSp'] + out['Parch']
    out['IsAlone'] = (out['FamilySize'] == 0).astype(int)
    out['IsChild'] = (out['Age'] < 18).astype(int)

    # The honorific is the single word between the comma and the first period.
    title = out['Name'].str.extract(r',\s*(\w+)\.', expand=False)
    title = title.replace(rare_titles, 'Rare').map(title_mapping)
    out['Title'] = title.fillna(title.median())

    return out


df_train = add_engineered_features(df_train)
df_test = add_engineered_features(df_test)

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone', 'Title', 'IsChild']
X = df_train[features]
y = df_train['Survived']

In [55]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 2 x 3 x 2 = 12 hyper-parameter combinations to evaluate.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# 5-fold cross-validated search on the training split; fit() returns the
# fitted search object itself.
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5).fit(X_train, y_train)

# Keep the best estimator found by the search.
best_model = grid_search.best_estimator_

# Score the selected model on the held-out split.
y_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print(f'Acuracia: {val_accuracy}')

Acuracia: 0.8268156424581006


In [59]:
# Fill missing test fares with the test-set median fare.
# Assign the result back instead of using a chained fillna(..., inplace=True):
# the chained form is deprecated and leaves df_test unchanged under pandas
# Copy-on-Write, so the NaN fares would still reach the model.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())

# Predict with the grid-searched model on the engineered feature columns.
X_test = df_test[features]
test_predictions = best_model.predict(X_test)

In [60]:
# Two-column table: passenger id plus the predicted label, written without the index.
submission = df_test[['PassengerId']].assign(Survived=test_predictions)
submission.to_csv('submission.csv', index=False)

-------

In [58]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Three base learners, each seeded for repeatability.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Hard voting: the ensemble predicts the majority class label of its members.
base_estimators = [('rf', rf_model), ('xgb', xgb_model), ('lr', lr_model)]
voting_model = VotingClassifier(estimators=base_estimators, voting='hard')
voting_model.fit(X_train, y_train)

# Score the ensemble on the held-out split.
y_pred = voting_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print(f'Acurácia do Voting Classifier: {val_accuracy}')

Acurácia do Voting Classifier: 0.8100558659217877
