In [2]:
import pandas as pd
import numpy as np

# Carregando os dados
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
gender_submission = pd.read_csv('data/gender_submission.csv')

# Visualizando as primeiras linhas do dataset de treino
train_df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Análise Exploratória


In [3]:
# Informações gerais sobre o dataset
train_df.info()

# Estatísticas descritivas
train_df.describe()

# Verificando valores nulos
train_df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Preenchendo valores nulos na coluna 'Age' com a mediana
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)

# Preenchendo valores nulos na coluna 'Embarked' com o valor mais frequente
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)

# Preenchendo valores nulos na coluna 'Fare' no dataset de teste
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

# Convertendo variáveis categóricas em numéricas
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})

train_df['Embarked'] = train_df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
test_df['Embarked'] = test_df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Separando as variáveis preditoras e a variável alvo
X = train_df.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
y = train_df['Survived']

# Dividindo os dados em conjunto de treino e teste
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Treinando o modelo
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Avaliando o modelo
y_pred = model.predict(X_val)
print(f'Acurácia: {accuracy_score(y_val, y_pred):.4f}')


Acurácia: 0.8156


In [10]:
# Exemplo de perfil personalizado
perfil_personalizado = {
    'Pclass': 3,         # Classe da cabine (1, 2, ou 3)
    'Sex': 1,            # 0 para masculino, 1 para feminino
    'Age': 27,           # Idade
    'SibSp': 0,          # Número de irmãos/cônjuges a bordo
    'Parch': 0,          # Número de pais/filhos a bordo
    'Fare': 8.25,        # Tarifa do bilhete
    'Embarked': 2        # Porto de embarque (0 = Cherbourg, 1 = Queenstown, 2 = Southampton)
}

# Transformando o perfil em um DataFrame para usar com o modelo
perfil_df = pd.DataFrame([perfil_personalizado])

# Fazendo a previsão com base no perfil
would_survive = model.predict(perfil_df)[0]
probabilidade_sobreviver = model.predict_proba(perfil_df)[0][1]

print(f'Sobreviveria: {"Sim" if would_survive == 1 else "Não"}')
print(f'Probabilidade de Sobrevivência: {probabilidade_sobreviver:.2f}')


Sobreviveria: Sim
Probabilidade de Sobrevivência: 0.51
