In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

#Model_Selection
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")


import matplotlib.pyplot as plt
import seaborn as sns

from Pipeline import get_drop_categorical_features,get_drop_columns_with_null_valuse,get_colums_names


In [3]:
#nice setup for graphics cause why not: pretty blue and another pretty blue
colors = ['#06344d', '#00b2ff']
sns.set(palette = colors, font = 'Serif', style = 'white', 
        rc = {'axes.facecolor':'#f1f1f1', 'figure.facecolor':'#f1f1f1'})

# Load Data

In [21]:
df = pd.read_csv("Dados/train.csv",index_col=0)

# Exploratory Data Analysis

In [4]:
df.head(2)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## Understanding the columns

- `PassangerId` - numerical ID, works as an Index. Doesn't have value for the model we're going to built.
- `Survived` - boolean var meaning (0 = did not survived, 1 = survived)
- `Pclass` - travel class, being 1st class the most luxurious and expensive and 3rd class the most cheap
- `Name` - passanger's names and names their relatives in parenthesis
- `SibSp` - how many siblings the passanger had in titanic
- `Parch` -  how many parents/childs the passanger had on board
- `Ticket` - ticket's code, doesn't have value for the model we're going to built.
- `Fare` - how much the ticket cost
- `Embarked` - Port of embarkation: C = Cherbourg, Q = Queenstown, S = Southampton 

## Dealing with NaN and object type columns

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Survived       891 non-null    int64  
 1   Pclass         891 non-null    int64  
 2   Name           891 non-null    object 
 3   Sex            891 non-null    object 
 4   Age            714 non-null    float64
 5   SibSp          891 non-null    int64  
 6   Parch          891 non-null    int64  
 7   Ticket         891 non-null    object 
 8   Fare           891 non-null    float64
 9   Cabin          204 non-null    object 
 10  Embarked       889 non-null    object 
 11  Gender_binary  0 non-null      object 
dtypes: float64(2), int64(4), object(6)
memory usage: 122.8+ KB


In [14]:
# we can see how much (%) of each column is missing 
faltantes_percentual = (df.isnull().sum() / len(df.iloc[:,0])*100)
faltantes_percentual

Survived     0.000000
Pclass       0.000000
Name         0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.000000
Cabin       77.104377
Embarked     0.224467
dtype: float64

In [28]:
# function to change object data to categorical
def sex_to_binary(n):
    if n == 'male':
        return 1
    elif n == 'female':
        return 0
    
def Pclass_onehot(df):
    df_Pclass_enc = pd.get_dummies(df['Pclass']) 
    return df_Pclass_enc
    
def transform_dtype(df):
    df['Gender_binary'] = df['Sex'].map(sex_to_binary)
    Pclass_dummies = Pclass_onehot(df)
    df = df.join(Pclass_dummies)
    df.drop(columns=['Name','Ticket','Cabin','Embarked','Sex','Pclass'], inplace=True)
    return df
    
def dealing_null_values(df):
    df = df['Age'].fillna(-1)
    return df

In [29]:
df1 = transform_dtype(df)

In [30]:
df1.head(2)

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Gender_binary,1,2,3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,22.0,1,0,7.25,1,0,0,1
2,1,38.0,1,0,71.2833,0,1,0,0


## Split data

In [5]:
x = df.drop("Survived",axis=1).copy()
y = df.Survived

In [7]:
x.head(2)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [8]:
y.head(2)

PassengerId
1    0
2    1
Name: Survived, dtype: int64

In [None]:
x_train, x_val, y_train, y_val = train_test_split(x,y,
                                                    test_size = 0.3,
                                                    random_state = 0)

## Feature Selection

In [None]:
def drop_

In [42]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

#Model_Selection
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")


import matplotlib.pyplot as plt
import seaborn as sns

from Pipeline import get_drop_categorical_features,get_drop_columns_with_null_valuse,get_colums_names


import pandas as pd
from sklearn.preprocessing import FunctionTransformer

# function to be applied in each row of the "Sex" column
# and change object data to categorical (1 for male, 0 for female"
def sex_to_binary(n):
    if n == 'male':
        return 1
    elif n == 'female':
        return 0

#transform Pclass int stype to categorical through get_dummies
def Pclass_onehot(df):
    df_Pclass_enc = pd.get_dummies(df['Pclass'])
    return df_Pclass_enc

# transform  all columns necessary
def transform_dtype(df):
    df['Gender_binary'] = df['Sex'].map(sex_to_binary)
    Pclass_dummies = Pclass_onehot(df)
    df = df.join(Pclass_dummies)
    df.drop(columns=['Name' ,'Ticket' ,'Cabin' ,'Embarked' ,'Sex' ,'Pclass'], inplace=True)
    return df

# dealing with missing numbers
def dealing_null_values(df):
    df = df['Age'].fillna(-1)
    return df

def saving_columns(df):
    global colunas
    colunas= df.columns
    return df

get_dealing_null_values = FunctionTransformer(dealing_null_values,validate=False)
get_transform_dtype = FunctionTransformer(transform_dtype,validate=False)
get_colums_names = FunctionTransformer(saving_columns,validate=False)

#-----------------------------------------

def drop_columns_with_null_valuse(df):
    df.dropna(axis=1, inplace=True)
    try:
        df.drop("Fare", axis=1, inplace=True)
    except:
        pass
    return df


def drop_categorical_features(df):
    df = df.select_dtypes(exclude="object")
    return df


def saving_columns(df):
    global colunas
    colunas = df.columns
    return df


get_drop_columns_with_null_valuse = FunctionTransformer(drop_columns_with_null_valuse, validate=False)
get_drop_categorical_features = FunctionTransformer(drop_categorical_features, validate=False)

df = pd.read_csv("Dados/train.csv",index_col=0)
x = df.drop("Survived",axis=1).copy()
y = df.Survived



x_train, x_val, y_train, y_val = train_test_split(x,y,
                                                    test_size = 0.3,
                                                    random_state = 0)


pipe1 = Pipeline(memory=None,
                 steps = [
                     ("Feature_Selection",get_drop_categorical_features),
                     ("Null_Validate",get_drop_columns_with_null_valuse),
                     ("Final_Columns",get_colums_names),
                     ("RandomForest", RandomForestClassifier() )
                 ],
                verbose=False
                )

pipe2 = Pipeline(memory=None,
                 steps = [
                     ("Feature_Selection",get_drop_categorical_features),
                     ("Null_Validate",get_drop_columns_with_null_valuse),
                     ("Final_Columns",get_colums_names),
                     ("RandomForest", LogisticRegression() )
                 ],
                verbose=False
                )


#Salvando Scores
modelos_testados = {"Modelos":["RandomForestClassifier","LogisticRegression"],
                    "Pipeline":[pipe1,pipe2],
                    "Score":[],
                    "Steps":[]
                    }
n = len(modelos_testados["Modelos"])
with open("metrics.txt", 'w') as outfile:
    for ref in range(n):
        modelos_testados["Pipeline"][ref].fit(x_train,y_train)
        test_score = modelos_testados["Pipeline"][ref].score(x_val,y_val)
        nome_modelo = modelos_testados["Modelos"][ref]
        steps = modelos_testados["Pipeline"][ref].named_steps.keys()
        outfile.write(f"{nome_modelo}- Test Score: {test_score} - Steps: {steps}")
        modelos_testados["Score"].append(test_score)
        lista_steps = [step for step in steps]
        modelos_testados["Steps"].append(lista_steps)





df_modelos = pd.DataFrame({"Model":modelos_testados["Modelos"], "Score":modelos_testados["Score"], "Steps":modelos_testados["Steps"]})
df_modelos.to_markdown("Modelos.md",index=False)




#Salvando submissão
#Aqui devemos escolher nosso Pipe que iremos utilizar para nosso modelo
x_test = pd.read_csv("Dados/test.csv",index_col = 0)
predict_array = pipe1.predict(x_test)
predict_submission = pd.DataFrame({"PassengerId":x_test.index,"Survived":predict_array})
predict_submission.to_csv("Predições/Predict1.csv",index=False)


#Plotando grafico
importances = pipe1.named_steps['RandomForest'].feature_importances_
colunas = ['Pclass', 'SibSp', 'Parch']
# Colunas vai vir da var global em get_colum_names
feature_df = pd.DataFrame(list(zip(colunas, importances)), columns = ["feature","importance"])
feature_df = feature_df.sort_values(by='importance', ascending=False,)


axis_fs = 18 #fontsize
title_fs = 22 #fontsize
sns.set(style="whitegrid")

ax = sns.barplot(x="importance", y="feature", data=feature_df)
ax.set_xlabel('Importance',fontsize = axis_fs)
ax.set_ylabel('Feature', fontsize = axis_fs)#ylabel
ax.set_title('Random forest\nfeature importance', fontsize = title_fs)

plt.tight_layout()
plt.savefig("feature_importance.png",dpi=120)
plt.close()

print("FINALIZADO!!!")


#cml-publish feature_importance.png --md >> report.md
#          cml-send-comment report.md

FINALIZADO!!!


In [43]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

#Model_Selection
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")


import matplotlib.pyplot as plt
import seaborn as sns

from Pipeline import get_drop_categorical_features,get_drop_columns_with_null_valuse,get_colums_names



df = pd.read_csv("Dados/train.csv",index_col=0)
x = df.drop("Survived",axis=1).copy()
y = df.Survived



x_train, x_val, y_train, y_val = train_test_split(x,y,
                                                    test_size = 0.3,
                                                    random_state = 0)


pipe1 = Pipeline(memory=None,
                 steps = [
                     ("Feature_Selection",get_drop_categorical_features),
                     ("Null_Validate",get_drop_columns_with_null_valuse),
                     ("Final_Columns",get_colums_names),
                     ("RandomForest", RandomForestClassifier() )
                 ],
                verbose=False
                )

pipe2 = Pipeline(memory=None,
                 steps = [
                     ("Feature_Selection",get_drop_categorical_features),
                     ("Null_Validate",get_drop_columns_with_null_valuse),
                     ("Final_Columns",get_colums_names),
                     ("RandomForest", LogisticRegression() )
                 ],
                verbose=False
                )

pipe3 = Pipeline(memory=None,
                 steps = [
                     ("Feature_Selection",get_transform_dtype),
                     ("Null_Validate",get_dealing_null_values),
                     ("Final_Columns",get_colums_names),
                     ("RandomForest", RandomForestClassifier() )
                 ],
                verbose=False
                )

pipe4 = Pipeline(memory=None,
                 steps = [
                     ("Feature_Selection",get_transform_dtype),
                     ("Null_Validate",get_dealing_null_values),
                     ("Final_Columns",get_colums_names),
                     ("RandomForest", LogisticRegression() )
                 ],
                verbose=False
                )
#Salvando Scores
modelos_testados = {"Modelos":["RandomForestClassifier","LogisticRegression","RandomForestClassifier_v2","LogisticRegression"],
                    "Pipeline":[pipe1,pipe2,pipe3, pipe4],
                    "Score":[],
                    "Steps":[]
                    }
n = len(modelos_testados["Modelos"])
with open("metrics.txt", 'w') as outfile:
    for ref in range(n):
        modelos_testados["Pipeline"][ref].fit(x_train,y_train)
        test_score = modelos_testados["Pipeline"][ref].score(x_val,y_val)
        nome_modelo = modelos_testados["Modelos"][ref]
        steps = modelos_testados["Pipeline"][ref].named_steps.keys()
        outfile.write(f"{nome_modelo}- Test Score: {test_score} - Steps: {steps}")
        modelos_testados["Score"].append(test_score)
        lista_steps = [step for step in steps]
        modelos_testados["Steps"].append(lista_steps)





df_modelos = pd.DataFrame({"Model":modelos_testados["Modelos"], "Score":modelos_testados["Score"], "Steps":modelos_testados["Steps"]})
df_modelos.to_markdown("Modelos.md",index=False)




#Salvando submissão
#Aqui devemos escolher nosso Pipe que iremos utilizar para nosso modelo
x_test = pd.read_csv("Dados/test.csv",index_col = 0)
predict_array = pipe1.predict(x_test)
predict_submission = pd.DataFrame({"PassengerId":x_test.index,"Survived":predict_array})
predict_submission.to_csv("Predições/Predict1.csv",index=False)


#Plotando grafico
importances = pipe1.named_steps['RandomForest'].feature_importances_
colunas = ['Pclass', 'SibSp', 'Parch']
# Colunas vai vir da var global em get_colum_names
feature_df = pd.DataFrame(list(zip(colunas, importances)), columns = ["feature","importance"])
feature_df = feature_df.sort_values(by='importance', ascending=False,)


axis_fs = 18 #fontsize
title_fs = 22 #fontsize
sns.set(style="whitegrid")

ax = sns.barplot(x="importance", y="feature", data=feature_df)
ax.set_xlabel('Importance',fontsize = axis_fs)
ax.set_ylabel('Feature', fontsize = axis_fs)#ylabel
ax.set_title('Random forest\nfeature importance', fontsize = title_fs)

plt.tight_layout()
plt.savefig("feature_importance.png",dpi=120)
plt.close()

print("FINALIZADO!!!")


#cml-publish feature_importance.png --md >> report.md
#          cml-send-comment report.md

AttributeError: 'Series' object has no attribute 'columns'

In [44]:
#Feature_Selection",get_transform_dtype),
#                     ("Null_Validate",get_dealing_null_values),
##                     ("Final_Columns",get_colums_names),
 #                    ("RandomForest", RandomForestClassifier()
 

df2 = dealing_null_values(df)
#get_dealing_null_values = FunctionTransformer(dealing_null_values,validate=False)
#get_transform_dtype = FunctionTransformer(transform_dtype,validate=False)
#get_colums_names = FunctionTransformer(saving_columns,validate=False)
                      

In [45]:
df2.head()

PassengerId
1    22.0
2    38.0
3    26.0
4    35.0
5    35.0
Name: Age, dtype: float64

In [46]:
import pandas as pd
from sklearn.preprocessing import FunctionTransformer

# function to be applied in each row of the "Sex" column
# and change object data to categorical (1 for male, 0 for female"
def sex_to_binary(n):
    if n == 'male':
        return 1
    elif n == 'female':
        return 0

#transform Pclass int stype to categorical through get_dummies
def Pclass_onehot(df):
    df_Pclass_enc = pd.get_dummies(df['Pclass'])
    return df_Pclass_enc

# transform  all columns necessary
def transform_dtype(df):
    df['Gender_binary'] = df['Sex'].map(sex_to_binary)
    Pclass_dummies = Pclass_onehot(df)
    df = df.join(Pclass_dummies)
    df.drop(['Name' ,'Ticket' ,'Cabin' ,'Embarked' ,'Sex' ,'Pclass'], axis=1, inplace=True)
    return df

# dealing with missing numbers
def dealing_null_values(df):
    df['Age'] = df['Age'].fillna(-1)
    return df

def saving_columns(df):
    global colunas
    colunas= df.columns
    return df

get_dealing_null_values = FunctionTransformer(dealing_null_values,validate=False)
get_transform_dtype = FunctionTransformer(transform_dtype,validate=False)
get_colums_names = FunctionTransformer(saving_columns,validate=False)

#-----------------------------------------

def drop_columns_with_null_valuse(df):
    df.dropna(axis=1, inplace=True)
    try:
        df.drop("Fare", axis=1, inplace=True)
    except:
        pass
    return df


def drop_categorical_features(df):
    df = df.select_dtypes(exclude="object")
    return df


def saving_columns(df):
    global colunas
    colunas = df.columns
    return df


get_drop_columns_with_null_valuse = FunctionTransformer(drop_columns_with_null_valuse, validate=False)
get_drop_categorical_features = FunctionTransformer(drop_categorical_features, validate=False)




In [47]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

#Model_Selection
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")


import matplotlib.pyplot as plt
import seaborn as sns

from Pipeline import get_drop_categorical_features,get_drop_columns_with_null_valuse,get_colums_names



df = pd.read_csv("Dados/train.csv",index_col=0)
x = df.drop("Survived",axis=1).copy()
y = df.Survived



x_train, x_val, y_train, y_val = train_test_split(x,y,
                                                    test_size = 0.3,
                                                    random_state = 0)


pipe1 = Pipeline(memory=None,
                 steps = [
                     ("Feature_Selection",get_drop_categorical_features),
                     ("Null_Validate",get_drop_columns_with_null_valuse),
                     ("Final_Columns",get_colums_names),
                     ("RandomForest", RandomForestClassifier() )
                 ],
                verbose=False
                )

pipe2 = Pipeline(memory=None,
                 steps = [
                     ("Feature_Selection",get_drop_categorical_features),
                     ("Null_Validate",get_drop_columns_with_null_valuse),
                     ("Final_Columns",get_colums_names),
                     ("RandomForest", LogisticRegression() )
                 ],
                verbose=False
                )

pipe3 = Pipeline(memory=None,
                 steps = [
                     ("Feature_Selection",get_transform_dtype),
                     ("Null_Validate",get_dealing_null_values),
                     ("Final_Columns",get_colums_names),
                     ("RandomForest", RandomForestClassifier() )
                 ],
                verbose=False
                )

pipe4 = Pipeline(memory=None,
                 steps = [
                     ("Feature_Selection",get_transform_dtype),
                     ("Null_Validate",get_dealing_null_values),
                     ("Final_Columns",get_colums_names),
                     ("RandomForest", LogisticRegression() )
                 ],
                verbose=False
                )
#Salvando Scores
modelos_testados = {"Modelos":["RandomForestClassifier","LogisticRegression","RandomForestClassifier_v2","LogisticRegression"],
                    "Pipeline":[pipe1,pipe2,pipe3, pipe4],
                    "Score":[],
                    "Steps":[]
                    }
n = len(modelos_testados["Modelos"])
with open("metrics.txt", 'w') as outfile:
    for ref in range(n):
        modelos_testados["Pipeline"][ref].fit(x_train,y_train)
        test_score = modelos_testados["Pipeline"][ref].score(x_val,y_val)
        nome_modelo = modelos_testados["Modelos"][ref]
        steps = modelos_testados["Pipeline"][ref].named_steps.keys()
        outfile.write(f"{nome_modelo}- Test Score: {test_score} - Steps: {steps}")
        modelos_testados["Score"].append(test_score)
        lista_steps = [step for step in steps]
        modelos_testados["Steps"].append(lista_steps)





df_modelos = pd.DataFrame({"Model":modelos_testados["Modelos"], "Score":modelos_testados["Score"], "Steps":modelos_testados["Steps"]})
df_modelos.to_markdown("Modelos.md",index=False)




#Salvando submissão
#Aqui devemos escolher nosso Pipe que iremos utilizar para nosso modelo
x_test = pd.read_csv("Dados/test.csv",index_col = 0)
predict_array = pipe1.predict(x_test)
predict_submission = pd.DataFrame({"PassengerId":x_test.index,"Survived":predict_array})
predict_submission.to_csv("Predições/Predict1.csv",index=False)


#Plotando grafico
importances = pipe1.named_steps['RandomForest'].feature_importances_
colunas = ['Pclass', 'SibSp', 'Parch']
# Colunas vai vir da var global em get_colum_names
feature_df = pd.DataFrame(list(zip(colunas, importances)), columns = ["feature","importance"])
feature_df = feature_df.sort_values(by='importance', ascending=False,)


axis_fs = 18 #fontsize
title_fs = 22 #fontsize
sns.set(style="whitegrid")

ax = sns.barplot(x="importance", y="feature", data=feature_df)
ax.set_xlabel('Importance',fontsize = axis_fs)
ax.set_ylabel('Feature', fontsize = axis_fs)#ylabel
ax.set_title('Random forest\nfeature importance', fontsize = title_fs)

plt.tight_layout()
plt.savefig("feature_importance.png",dpi=120)
plt.close()

print("FINALIZADO!!!")


#cml-publish feature_importance.png --md >> report.md
#          cml-send-comment report.md

FINALIZADO!!!
