In [24]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from scipy.stats.distributions import expon, poisson, uniform, randint

In [9]:
df = pd.read_csv('../train.csv')

In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
Pipe_Impute = ColumnTransformer([
    ('Pass1', 'passthrough', ['Sex', 'Pclass', 'SibSp', 'Parch']),
    ('Impute_mean', SimpleImputer(strategy='mean'), ['Age', 'Fare']),
    ('Impute_most_freq', SimpleImputer(strategy='most_frequent'), ['Embarked'])
])

In [14]:
class Cate(TransformerMixin, BaseEstimator):
    def __init__(self, q=True, n=4):
        self.q = q
        self.n = n
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        T = X.copy()
        if self.q:
            t = pd.qcut(X[:,0], q=self.n, labels=False)
        else:
            t = pd.cut(X[:,0], bins=self.n, labels=False)
        T[:,0] = t
        return T

In [15]:
class Feature_Selec(TransformerMixin, BaseEstimator):
    def __init__(self, k=9):
        self.k = k
    def fit(self,  X, y=None):
        return self
    def transform(self, X, y=None):
        ft = joblib.load('Feat_Impor.pkl')
        return X[:, ft[:(self.k+1)]]

In [16]:
Pipe_Encode = ColumnTransformer([
    ('Sex_Encode',OrdinalEncoder(),[0]),
    ('Pclass_Encode',OrdinalEncoder(),[1]),
    ('Pass2','passthrough', [2,3]),
    ('Age_Cat',Cate(), [4]),
    ('Fare_Cat',Cate(), [5]),
    ('Embarked_Encode', OneHotEncoder(), [6])
])

In [17]:
Pipe_Completo = Pipeline([
    ('Pipe_Impute', Pipe_Impute),
    ('Pipe_Encode', Pipe_Encode),
    ('Feature_Selec', Feature_Selec()),
    ('RF', RandomForestClassifier(random_state=42))
])

In [213]:
param_grid = {
    'Pipe_Encode__Age_Cat__q':[False],
    'Pipe_Encode__Age_Cat__n':[4,5],
    'Pipe_Encode__Fare_Cat__q':[False,True],
    'Pipe_Encode__Fare_Cat__n':[4,5],
    'Feature_Selec__k':[5],
    'RF__n_estimators':randint(150, 170),
    'RF__min_samples_split':[7],
}

In [49]:
?RandomForestClassifier

In [214]:
grid_search = RandomizedSearchCV(Pipe_Completo, param_grid, n_iter=60, cv=5,
                                scoring='accuracy')

In [215]:
dads = df.drop('Survived', axis=1)
labels = df['Survived']

In [216]:
grid_search.fit(dads, labels)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('Pipe_Impute',
                                              ColumnTransformer(transformers=[('Pass1',
                                                                               'passthrough',
                                                                               ['Sex',
                                                                                'Pclass',
                                                                                'SibSp',
                                                                                'Parch']),
                                                                              ('Impute_mean',
                                                                               SimpleImputer(),
                                                                               ['Age',
                                                                                'Fare']),
        

In [217]:
grid_search.best_score_

0.8384282217061075

In [218]:
grid_search.best_params_

{'Feature_Selec__k': 5,
 'Pipe_Encode__Age_Cat__n': 4,
 'Pipe_Encode__Age_Cat__q': False,
 'Pipe_Encode__Fare_Cat__n': 4,
 'Pipe_Encode__Fare_Cat__q': True,
 'RF__min_samples_split': 7,
 'RF__n_estimators': 163}

In [203]:
model1 = grid_search.best_estimator_

In [207]:
def write_test(df, mod, n=1):
    dft = pd.DataFrame(np.c_[df['PassengerId'],mod.predict(df)], columns=['PassengerId', 'Survived'])
    dft.to_csv('Predictions_Teste{}.csv'.format(n), index=False)

In [205]:
test = pd.read_csv('../test.csv')

In [206]:
write_test(test, model1)