In [18]:
import os
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_iterative_imputer

from imblearn.over_sampling import SMOTE
from category_encoders import BinaryEncoder
from ydata_profiling import ProfileReport
import optuna

# Modelos de Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from xgboost import XGBClassifier

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical


In [19]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [20]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [21]:
#profile = ProfileReport(train_data, explorative=True)

# Exibir relatório no notebook
#profile.to_notebook_iframe()

In [22]:
display(train_data.info())
display(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


None

In [23]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [160]:
features = train_data.drop(columns='Survived')
target = train_data['Survived']

display(features.head())

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [169]:
df = train_data.copy()

df['Ticket'] = df['Ticket'].replace('LINE',0)

df['Ticket'] = df['Ticket'].str.extract(r'\s(\d+)$|^(\d+)$').bfill(axis=1).iloc[:, 0].astype(float)

df['Ticket'] = df['Ticket'].fillna(0)

In [162]:
def replace_values(array, lower_quantile=0., upper_quantile=1.):
  array = array.copy()

  lower_quantile = np.quantile(array, lower_quantile)
  upper_quantile = np.quantile(array, upper_quantile)

  array[array<lower_quantile] = lower_quantile
  array[array>upper_quantile] = upper_quantile

  return array

In [170]:
df['Ticket'] = replace_values(df['Ticket'], upper_quantile=.95)

In [180]:
df.query('Ticket < 100')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,TicketRanges
179,180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,0.0,0.0,Unk,S,"(-393.118, 39311.8]"
271,272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,0.0,0.0,Unk,S,"(-393.118, 39311.8]"
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,0.0,0.0,Unk,S,"(-393.118, 39311.8]"
597,598,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,0.0,0.0,Unk,S,"(-393.118, 39311.8]"
772,773,0,2,"Mack, Mrs. (Mary)",female,57.0,0,0,3.0,10.5,E,S,"(-393.118, 39311.8]"
841,842,0,2,"Mudd, Mr. Thomas Charles",male,16.0,0,0,3.0,10.5,Unk,S,"(-393.118, 39311.8]"


In [181]:
df['TicketRanges'] = pd.cut(df['Ticket'],20)

df.groupby(['TicketRanges'])['Survived'].agg(['mean', 'count'])

  df.groupby(['TicketRanges'])['Survived'].agg(['mean', 'count'])
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0_level_0,mean,count
TicketRanges,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-393.118, 19655.9]",0.456592,311
"(19655.9, 39311.8]",0.504854,103
"(39311.8, 58967.7]",0.0,7
"(58967.7, 78623.6]",0.333333,3
"(78623.6, 98279.5]",,0
"(98279.5, 117935.4]",0.513889,72
"(117935.4, 137591.3]",,0
"(137591.3, 157247.2]",,0
"(157247.2, 176903.1]",,0
"(176903.1, 196559.0]",,0


In [182]:
df['Cabin'] = df['Cabin'].str.extract(r'([A-Za-z]+)').fillna('Unk')

df.groupby(['Cabin'])['Survived'].agg(['mean', 'count'])

Unnamed: 0_level_0,mean,count
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,15
B,0.744681,47
C,0.59322,59
D,0.757576,33
E,0.75,32
F,0.615385,13
G,0.5,4
T,0.0,1
Unk,0.299854,687


In [186]:
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.')

df.groupby(['Title'])['Survived'].agg(['mean', 'count'])


Unnamed: 0_level_0,mean,count
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0.0,1
Col,0.5,2
Countess,1.0,1
Don,0.0,1
Dr,0.428571,7
Jonkheer,0.0,1
Lady,1.0,1
Major,0.5,2
Master,0.575,40
Miss,0.697802,182


In [188]:
df['AgeRanges'] = pd.cut(features['Age'],10)

df.groupby('AgeRanges')['Survived'].agg(['mean', 'count'])

  df.groupby('AgeRanges')['Survived'].agg(['mean', 'count'])


Unnamed: 0_level_0,mean,count
AgeRanges,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0.34, 8.378]",0.666667,54
"(8.378, 16.336]",0.413043,46
"(16.336, 24.294]",0.355932,177
"(24.294, 32.252]",0.384615,169
"(32.252, 40.21]",0.440678,118
"(40.21, 48.168]",0.342857,70
"(48.168, 56.126]",0.466667,45
"(56.126, 64.084]",0.375,24
"(64.084, 72.042]",0.0,9
"(72.042, 80.0]",0.5,2


In [209]:
df['AgeRanges'] = df['AgeRanges'].astype(str)

unique_ranges = df['AgeRanges'].dropna().unique()

range_mapping = {val: i for i, val in enumerate(sorted(unique_ranges), start=1)}

df['AgeRanges'] = df['AgeRanges'].map(range_mapping)

df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,TicketRanges,Title,AgeRanges,AgeRangeCat,FareRanges
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,21171.0,7.2500,Unk,S,"(19655.9, 39311.8]",Mr,2,2,"(-0.512, 51.233]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,17599.0,71.2833,C,C,"(-393.118, 19655.9]",Mrs,4,3,"(51.233, 102.466]"
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,393118.0,7.9250,Unk,S,"(373462.1, 393118.0]",Miss,3,2,"(-0.512, 51.233]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803.0,53.1000,C,S,"(98279.5, 117935.4]",Mrs,4,3,"(51.233, 102.466]"
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450.0,8.0500,Unk,S,"(353806.2, 373462.1]",Mr,4,3,"(-0.512, 51.233]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536.0,13.0000,Unk,S,"(196559.0, 216214.9]",Rev,3,2,"(-0.512, 51.233]"
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053.0,30.0000,B,S,"(98279.5, 117935.4]",Miss,2,2,"(-0.512, 51.233]"
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,6607.0,23.4500,Unk,S,"(-393.118, 19655.9]",Miss,11,,"(-0.512, 51.233]"
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369.0,30.0000,C,C,"(98279.5, 117935.4]",Mr,3,2,"(-0.512, 51.233]"


In [191]:
bins = [0.34, 16.336, 32.252, 48.168, 64.084, 80.0]

labels = [1, 2, 3, 4, 5]

df['AgeRangeCat'] = pd.cut(df['Age'], bins=bins, labels=labels, include_lowest=True)

df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,TicketRanges,Title,AgeRanges,AgeRangeCat,FareRanges
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,21171.0,7.2500,Unk,S,"(19655.9, 39311.8]",Mr,"(16.336, 24.294]",2,"(-0.512, 102.466]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,17599.0,71.2833,C,C,"(-393.118, 19655.9]",Mrs,"(32.252, 40.21]",3,"(-0.512, 102.466]"
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,393118.0,7.9250,Unk,S,"(373462.1, 393118.0]",Miss,"(24.294, 32.252]",2,"(-0.512, 102.466]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803.0,53.1000,C,S,"(98279.5, 117935.4]",Mrs,"(32.252, 40.21]",3,"(-0.512, 102.466]"
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450.0,8.0500,Unk,S,"(353806.2, 373462.1]",Mr,"(32.252, 40.21]",3,"(-0.512, 102.466]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536.0,13.0000,Unk,S,"(196559.0, 216214.9]",Rev,"(24.294, 32.252]",2,"(-0.512, 102.466]"
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053.0,30.0000,B,S,"(98279.5, 117935.4]",Miss,"(16.336, 24.294]",2,"(-0.512, 102.466]"
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,6607.0,23.4500,Unk,S,"(-393.118, 19655.9]",Miss,,,"(-0.512, 102.466]"
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369.0,30.0000,C,C,"(98279.5, 117935.4]",Mr,"(24.294, 32.252]",2,"(-0.512, 102.466]"


In [199]:
df['FareRanges'] = pd.cut(features['Fare'],10)

df.groupby('FareRanges')['Survived'].agg(['mean','sum'])

  df.groupby('FareRanges')['Survived'].agg(['mean','sum'])
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0_level_0,mean,sum
FareRanges,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-0.512, 51.233]",0.318306,233
"(51.233, 102.466]",0.660377,70
"(102.466, 153.699]",0.741935,23
"(153.699, 204.932]",1.0,2
"(204.932, 256.165]",0.636364,7
"(256.165, 307.398]",0.666667,4
"(307.398, 358.63]",,0
"(358.63, 409.863]",,0
"(409.863, 461.096]",,0
"(461.096, 512.329]",1.0,3


In [27]:
class TransformData(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.imputer = IterativeImputer(random_state=42)
        
    def fit(self, df, y=None):
        # Ajusta o imputer para as colunas numéricas desejadas
        self.imputer.fit(df[['Age','Fare']])
        return self

    def transform(self, df, y=None):
        df_transformed = df.copy()

        # Imputa os valores em 'Age' e 'Fare'
        df_transformed[['Age','Fare']] = self.imputer.transform(df_transformed[['Age','Fare']])

        # Extrai o título a partir da coluna 'Name'
        df_transformed['Title'] = df_transformed['Name'].str.extract(r'([A-Za-z]+)\.')

        df_transformed['Title'] = df_transformed['Title'].replace(['Dr', 'Rev', 'Col', 'Major', 'Lady', 'Jonkheer', 'Don', 'Capt', 'Countess', 'Sir', 'Dona'], 'Rare')

        df_transformed['Title'] = df_transformed['Title'].replace(['Mlle','Ms'], 'Miss')

        df_transformed['Title'] = df_transformed['Title'].replace('Mme', 'Mrs')
        
        # Imputa valores ausentes em 'Age' com a média
        df_transformed['Age'] = df_transformed['Age'].fillna(df_transformed['Age'].mean())
        
        # Cria a coluna 'Adult Male'
        df_transformed['Adult Male'] = 0 
        df_transformed.loc[(df_transformed['Age'] >= 18) & (df_transformed['Sex'] == 'male'), 'Adult Male'] = 1
        
        # Imputa valores ausentes em 'Embarked' por grupo
        df_transformed['Embarked'] = df_transformed.groupby(['Sex', 'Fare'])['Embarked'] \
            .transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'S'))
        
        # Cria coluna 'Family Size'
        df_transformed['Family Size'] = df_transformed['SibSp'] + df_transformed['Parch'] + 1

        # Cria coluna 'Alone'
        df_transformed['Alone'] = np.where((df_transformed['SibSp'] > 0) | (df_transformed['Parch'] > 0), 'No', 'Yes')

        # Log da coluna 'Fare'
        df_transformed['Fare'] = df_transformed['Fare'].map(lambda x: np.log(x) if x > 0 else 0)
        
        # Remove colunas indesejadas
        df_transformed.drop(columns=['PassengerId','Name','Ticket','Cabin', 'Alone', 'SibSp', 'Parch'], inplace=True)

        return df_transformed

In [28]:
Preprocessing = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), selector(dtype_include="number")),
        ("cat", OneHotEncoder(), selector(dtype_exclude="number"))
    ]
)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.30, random_state=24, stratify=target)

print('Data Splitted')

Data Splitted


In [30]:
# models = {
#     'Logistic Regression': LogisticRegression(max_iter=200),
#     'Decision Tree': DecisionTreeClassifier(),
#     'SVM': SVC(),
#     'Random Forest': RandomForestClassifier(),
#     'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
#     'K-Nearest Neighbors': KNeighborsClassifier(),
#     'Gaussian Naive Bayes': GaussianNB(),
#     'Gradient Boosting': GradientBoostingClassifier(),
#     'AdaBoost': AdaBoostClassifier(),
#     'Extra Trees': ExtraTreesClassifier(),
#     'MLP': MLPClassifier(max_iter=1000)
# }

# cv = RepeatedKFold(n_splits=3, n_repeats=10, random_state=None)

# results = {}
# for name, model in models.items():

#     pipeline = Pipeline([
#     ("Organizing", TransformData()),
#     ("Preprocessing", Preprocessing),
#     ("Balancing", SMOTE(sampling_strategy='auto', random_state=42)),
#     ("Model", model)
#     ])
      
#     scores = cross_val_score(pipeline, features, target, cv=cv)
    
#     results[name] = scores.mean()
    
#     print(f"{name} CV Accuracy: {results[name]:.4f}")


# best_model_name = max(results, key=results.get)
# print(f"\nBest model based on CV: {best_model_name}")

# # Train the best model on the full training set and evaluate it on the test set
# best_model = models[best_model_name]

# pipeline = Pipeline([
#     ("Organizing", TransformData()),
#     ("Preprocessing", Preprocessing),
#     ("Model", best_model)
#     ])

# pipeline.fit(X_train, y_train)
# test_accuracy = pipeline.score(X_test, y_test)
# print(f"Test set Accuracy: {test_accuracy:.4f}")


In [31]:
# def objective(trial):
#     # Sugestões para os hiperparâmetros do RandomForest
#     n_estimators = trial.suggest_int("n_estimators", 10, 400)
#     max_depth = trial.suggest_int("max_depth", 2, 64)
    
#     # Montagem do pipeline com os hiperparâmetros sugeridos
#     model = Pipeline([
#         ("Organizing", TransformData()),
#         ("Preprocessing", Preprocessing),
#         ("Balancing", SMOTE(sampling_strategy='auto', random_state=42)),
#         ("Model", RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42))
#     ])
    
#     # cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#     cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=None)
    
#     accuracy = cross_val_score(model, features, target, cv=cv, scoring='accuracy').mean()
    
#     return accuracy

# # Criação do estudo com o objetivo de maximizar a acurácia
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)

# print("Melhores parâmetros:", study.best_params)
# print("Melhor acurácia:", study.best_value)


In [32]:
# def objective(trial):
#     # Suggest hyperparameters for SVC
#     C = trial.suggest_float("C", 1e-3, 1e3, log=True)
#     kernel = trial.suggest_categorical("kernel", ["rbf", "linear", "poly", "sigmoid"])
    
#     params = {"C": C, "kernel": kernel}
    
#     if kernel == "linear":
#         # gamma is not used with linear kernel
#         params["gamma"] = "auto"
#     elif kernel == "poly":
#         gamma = trial.suggest_float("gamma", 1e-4, 1e1, log=True)
#         degree = trial.suggest_int("degree", 2, 5)
#         params["gamma"] = gamma
#         params["degree"] = degree
#     else:
#         gamma = trial.suggest_float("gamma", 1e-4, 1e1, log=True)
#         params["gamma"] = gamma

#     # Build the pipeline with custom preprocessing steps and the SVC model
#     model = Pipeline([
#         ("Organizing", TransformData()),
#         ("Preprocessing", Preprocessing),
#         ("Balancing", SMOTE(sampling_strategy='auto', random_state=42)),
#         ("Model", SVC(**params, probability=True))
#     ])

#     # Using RepeatedStratifiedKFold for robust cross-validation
#     cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=None)
    
#     accuracy = cross_val_score(model, features, target, cv=cv, scoring='accuracy').mean()
    
#     return accuracy

# # Create and run the study to maximize accuracy
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)

# print("Melhores parâmetros:", study.best_params)
# print("Melhor acurácia:", study.best_value)


In [33]:
# Monta o pipeline com as etapas de organização, pré-processamento e o modelo RandomForest
model = Pipeline([
    ("Organizing", TransformData()),
    ("Preprocessing", Preprocessing),
    ("Balancing", SMOTE(sampling_strategy='auto', random_state=42)),
    ("Model", SVC(C = 0.956, kernel='linear', probability=True))
])

# Treina o pipeline completo
model.fit(X_train, y_train)

# Faz as previsões
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")


Train Accuracy: 0.83
Test Accuracy: 0.81


In [34]:
unk_predictions = model.predict(test_data)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': unk_predictions})

display(output)

output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


Your submission was successfully saved!
