In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

#Model_Selection
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")


import matplotlib.pyplot as plt
import seaborn as sns

from Pipeline import get_drop_categorical_features,get_drop_columns_with_null_valuse,get_colums_names


In [3]:
#nice setup for graphics cause why not: pretty blue and another pretty blue
colors = ['#06344d', '#00b2ff']
sns.set(palette = colors, font = 'Serif', style = 'white', 
        rc = {'axes.facecolor':'#f1f1f1', 'figure.facecolor':'#f1f1f1'})

# Load Data

In [21]:
df = pd.read_csv("Dados/train.csv",index_col=0)

# Exploratory Data Analysis

In [4]:
df.head(2)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## Understanding the columns

- `PassangerId` - numerical ID, works as an Index. Doesn't have value for the model we're going to built.
- `Survived` - boolean var meaning (0 = did not survived, 1 = survived)
- `Pclass` - travel class, being 1st class the most luxurious and expensive and 3rd class the most cheap
- `Name` - passanger's names and names their relatives in parenthesis
- `SibSp` - how many siblings the passanger had in titanic
- `Parch` -  how many parents/childs the passanger had on board
- `Ticket` - ticket's code, doesn't have value for the model we're going to built.
- `Fare` - how much the ticket cost
- `Embarked` - Port of embarkation: C = Cherbourg, Q = Queenstown, S = Southampton 

## Dealing with NaN and object type columns

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Survived       891 non-null    int64  
 1   Pclass         891 non-null    int64  
 2   Name           891 non-null    object 
 3   Sex            891 non-null    object 
 4   Age            714 non-null    float64
 5   SibSp          891 non-null    int64  
 6   Parch          891 non-null    int64  
 7   Ticket         891 non-null    object 
 8   Fare           891 non-null    float64
 9   Cabin          204 non-null    object 
 10  Embarked       889 non-null    object 
 11  Gender_binary  0 non-null      object 
dtypes: float64(2), int64(4), object(6)
memory usage: 122.8+ KB


In [14]:
# we can see how much (%) of each column is missing 
faltantes_percentual = (df.isnull().sum() / len(df.iloc[:,0])*100)
faltantes_percentual

Survived     0.000000
Pclass       0.000000
Name         0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.000000
Cabin       77.104377
Embarked     0.224467
dtype: float64

In [28]:
# function to change object data to categorical
def sex_to_binary(n):
    if n == 'male':
        return 1
    elif n == 'female':
        return 0
    
def Pclass_onehot(df):
    df_Pclass_enc = pd.get_dummies(df['Pclass']) 
    return df_Pclass_enc
    
def transform_dtype(df):
    df['Gender_binary'] = df['Sex'].map(sex_to_binary)
    Pclass_dummies = Pclass_onehot(df)
    df = df.join(Pclass_dummies)
    df.drop(columns=['Name','Ticket','Cabin','Embarked','Sex','Pclass'], inplace=True)
    return df
    
def dealing_null_values(df):
    df = df['Age'].fillna(-1)
    return df

In [29]:
df1 = transform_dtype(df)

In [30]:
df1.head(2)

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Gender_binary,1,2,3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,22.0,1,0,7.25,1,0,0,1
2,1,38.0,1,0,71.2833,0,1,0,0


## Split data

In [5]:
x = df.drop("Survived",axis=1).copy()
y = df.Survived

In [7]:
x.head(2)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [8]:
y.head(2)

PassengerId
1    0
2    1
Name: Survived, dtype: int64

In [None]:
x_train, x_val, y_train, y_val = train_test_split(x,y,
                                                    test_size = 0.3,
                                                    random_state = 0)

## Feature Selection

In [None]:
def drop_