In [14]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
df = pd.read_csv('train.csv')

In [16]:
from sklearn.model_selection import StratifiedShuffleSplit

In [17]:
spliter = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.2)

In [18]:
for train_idx, test_idx in spliter.split(df, df['Sex']):
    strat_test = df.iloc[test_idx]
    strat_train = df.iloc[train_idx]

In [19]:
df2 = strat_train.copy()

In [20]:
df2 = df2.set_index('PassengerId')

In [21]:
df2['Sex'] = df2['Sex'] == 'female'

In [22]:
df2['Fator'] = (df2['Sex'] & (df2['Age'] >= 18))*df2['Parch'] + (df2['Age'] < 18)*df2['SibSp']

In [23]:
df2.corr(method='kendall')['Survived'].sort_values(ascending=False)[1:]

Sex       0.550827
Fare      0.257084
Fator     0.164765
Parch     0.123739
SibSp     0.069682
Age      -0.037318
Pclass   -0.302240
Name: Survived, dtype: float64

In [24]:
from sklearn.preprocessing import OrdinalEncoder

In [25]:
df2['Age_cat'] = pd.cut(df2['Age'], bins=[0, 18, 40, 60, np.inf], labels=False, right=False)

In [26]:
df2.corr(method='kendall')['Survived'].sort_values(ascending=False)[1:]

Sex        0.550827
Fare       0.257084
Fator      0.164765
Parch      0.123739
SibSp      0.069682
Age       -0.037318
Age_cat   -0.084679
Pclass    -0.302240
Name: Survived, dtype: float64

In [27]:
from sklearn.impute import SimpleImputer

In [28]:
from sklearn.base import TransformerMixin, BaseEstimator

In [29]:
class Transform_Age(TransformerMixin, BaseEstimator):
    def __init__(self,bins=[0, 18, 40, 60, np.inf]):
        self.bins=bins
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return pd.DataFrame(pd.cut(X, bins=self.bins, right=False, labels=False))

In [30]:
aj = Transform_Age(5)

In [31]:
from sklearn.compose import ColumnTransformer

In [32]:
from sklearn.pipeline import Pipeline

In [33]:
pipi = ColumnTransformer([
    ('Sexo_to_cat',OrdinalEncoder(),['Sex']),
    ('Age_to_cat', Transform_Age(),'Age'),
    ('Pclass_to_cat',OrdinalEncoder(),['Pclass']),
    ('Fator','passthrough',['Parch','SibSp'])
])

In [34]:
class Fator(Transform_Age, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X,y=None):
        #fact = (df2['Sex'] & (df2['Age'] >= 18))*df2['Parch'] + (df2['Age'] < 18)*df2['SibSp']
        fact = (X[:,0]*(X[:,1] > 0))*X[:,3] + (X[:,1] == 0)*X[:,4]
        return fact
        

In [35]:
class Transf(Transform_Age, TransformerMixin):
    def __init__(self, fat_novo=False, fat_antigo=True):
        self.fat_novo=fat_novo
        self.fat_antigo=fat_antigo
    def fit(self, X, y=None):
        return self
    def transform(self, X,y=None):
        pipi = ColumnTransformer([
    ('Sexo_to_cat',OrdinalEncoder(),['Sex']),
    ('Age_to_cat', Transform_Age(),'Age'),
    ('Pclass_to_cat',OrdinalEncoder(),['Pclass']),
    ('Fator','passthrough',['Parch','SibSp'])
])
        T = pipi.fit_transform(X)
        T = SimpleImputer(strategy='most_frequent').fit_transform(T)
        
        R = T.copy()
        if not self.fat_antigo:
            R = T[:,0:3].copy()
        if self.fat_novo:
            fact = Fator().fit_transform(T)
            return np.c_[R,fact]
        return R

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [37]:
ptrain_ant = Transf().fit_transform(strat_train)
ptrain_ant_nov = Transf(fat_novo=True).fit_transform(strat_train)
ptrain_nov = Transf(fat_novo=True, fat_antigo=False).fit_transform(strat_train)

In [38]:
lalb = strat_train['Survived']

In [39]:
mod_log_ant = LogisticRegression().fit(ptrain_ant, lalb)
mod_log_ant_nov = LogisticRegression().fit(ptrain_ant_nov, lalb)
mod_log_nov = LogisticRegression().fit(ptrain_nov, lalb)

In [40]:
print(cross_val_score(mod_log_ant, ptrain_ant, lalb, scoring='accuracy', cv=10).mean())
print(cross_val_score(mod_log_ant_nov, ptrain_ant_nov, lalb, scoring='accuracy', cv=10).mean())
print(cross_val_score(mod_log_nov, ptrain_nov, lalb, scoring='accuracy', cv=10).mean())

0.7977895148669797
0.7991979655712049
0.7893192488262911


In [41]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
mod_tree_ant = DecisionTreeClassifier().fit(ptrain_ant, lalb)
mod_tree_ant_nov = DecisionTreeClassifier().fit(ptrain_ant_nov, lalb)
mod_tree_nov = DecisionTreeClassifier().fit(ptrain_nov, lalb)

In [43]:
print(cross_val_score(mod_tree_ant, ptrain_ant, lalb, scoring='accuracy', cv=10).mean())
print(cross_val_score(mod_tree_ant_nov, ptrain_ant_nov, lalb, scoring='accuracy', cv=10).mean())
print(cross_val_score(mod_tree_nov, ptrain_nov, lalb, scoring='accuracy', cv=10).mean())

0.7878912363067292
0.786482785602504
0.7936032863849766


In [44]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
mod_rf_ant = RandomForestClassifier().fit(ptrain_ant, lalb)
mod_rf_ant_nov = RandomForestClassifier().fit(ptrain_ant_nov, lalb)
mod_rf_nov = RandomForestClassifier().fit(ptrain_nov, lalb)

In [46]:
print(cross_val_score(mod_rf_ant, ptrain_ant, lalb, scoring='accuracy', cv=10).mean())
print(cross_val_score(mod_rf_ant_nov, ptrain_ant_nov, lalb, scoring='accuracy', cv=10).mean())
print(cross_val_score(mod_rf_nov, ptrain_nov, lalb, scoring='accuracy', cv=10).mean())

0.7963223787167449
0.7991392801251955
0.7976917057902975


In [47]:
strat_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
788,789,1,3,"Dean, Master. Bertram Vere",male,1.0,1,2,C.A. 2315,20.5750,,S
347,348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1000,,S
629,630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q
734,735,0,2,"Troupiansky, Mr. Moses Aaron",male,23.0,0,0,233639,13.0000,,S
106,107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0000,B94,S
628,629,0,3,"Bostandyeff, Mr. Guentcho",male,26.0,0,0,349224,7.8958,,S
711,712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.5500,C124,S
453,454,1,1,"Goldenberg, Mr. Samuel L",male,49.0,1,0,17453,89.1042,C92,C


In [59]:
pred_test = mod_log_ant.predict(Transf().fit_transform(strat_train))

In [60]:
from sklearn.metrics import accuracy_score

In [61]:
accuracy_score(strat_train['Survived'],pred_test)

0.7991573033707865

In [62]:
teste = pd.read_csv('test.csv')

In [63]:
teste_arrum = Transf().fit_transform(teste)

In [64]:
prev_test = mod_rf_ant.predict(teste_arrum)

In [65]:
prev_test

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [66]:
df_f = pd.DataFrame({'PassengerId':teste['PassengerId'], 'Survived':prev_test})

In [56]:
df_f.to_csv('second_try.csv', index=False)

In [57]:
import joblib

In [58]:
joblib.dump(mod_rf_ant, 'segundo_random_forest.pkl')

['segundo_random_forest.pkl']