In [207]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [208]:
df = pd.read_csv('../train.csv')

In [209]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [210]:
from sklearn.model_selection import StratifiedShuffleSplit

In [211]:
spliter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=30)

In [212]:
for train_idx, test_idx in spliter.split(df, df['Sex']):
    strat_train = df.iloc[train_idx]
    strat_test = df.iloc[test_idx]

In [213]:
from sklearn.base import TransformerMixin, BaseEstimator

In [214]:
class Transf_Cat(TransformerMixin, BaseEstimator):
    def __init__(self, tamn=5, tp=True):
        self.tamn = tamn
        self.tp=tp
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        if self.tp:
            return pd.qcut(X, q=self.tamn, labels=False)
        else:
            return pd.cut(X, bins=self.tamn, labels=False)   

In [215]:
from sklearn.compose import ColumnTransformer

In [216]:
from sklearn.impute import SimpleImputer

In [217]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [218]:
class Impute_Encode_Emb(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass
    def fit(self, X,y=None):
        self.imp = SimpleImputer(strategy='most_frequent').fit(X)
        return self
    def transform(self, X,y=None):
        nov = self.imp.transform(X)
        return OneHotEncoder().fit_transform(nov)

In [219]:
pipe_impute_encode = ColumnTransformer([
        ('Encode_Sex', OrdinalEncoder(), ['Sex']),
        ('Pcl', 'passthrough', ['Pclass']),
        ('Impute_Age', SimpleImputer(strategy='median'), ['Age']),
        ('Keep', 'passthrough', ['Fare','SibSp','Parch']),
        ('Impute_Embarked', Impute_Encode_Emb(), ['Embarked']),
])

* Sex - 0
* Pclass - 1
* Age - 2
* Fare - 3
* SibSp - 4
* Parch - 5
* Embarked - 6

In [220]:
class Final(TransformerMixin, BaseEstimator):
    def __init__(self, age=True, fare=True, sibsp=True, parch=True, emb=True, q1=True, n1=4, q2=True, n2=5):
        self.age = age
        self.fare = fare
        self.sibsp = sibsp
        self.parch = parch
        self.emb = emb
        self.q1 = q1
        self.n1 = n1
        self.q2 = q2
        self.n2 = n2
    def fit(self, X, y=None):
        self.pipe = pipe_impute_encode.fit(X)
        return self
    def transform(self, X, y=None):
        F = self.pipe.transform(X)
        col = [0, 1]
        if self.age:
            F[:,2] = Transf_Cat(self.n1, self.q1).fit_transform(F[:,2])
            col.append(2)
        if self.fare:
            F[:,3] = Transf_Cat(self.n2, self.q2).fit_transform(F[:,3])
            col.append(3)
        if self.sibsp:
            col.append(4)
        if self.parch:
            col.append(5)
        if self.emb:
            col.extend([6,7,8])
        return F[:,col]

In [221]:
from sklearn.pipeline import Pipeline

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
from sklearn.linear_model import LogisticRegression

In [224]:
transf_pred = Pipeline([
    ('Transform', Final()),
    ('Predict', LogisticRegression(random_state=42, solver='liblinear'))
])

In [15]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [36]:
param_grid = [
    {'Transform__age': [False, True], 'Transform__fare': [False, True], 'Transform__sibsp':[False, True], 
     'Transform__parch':[False, True],
     'Transform__emb':[False, True],
     'Transform__q1':[False],
     'Transform__q2':[False],
     'Transform__n1':[2,3,4],
     'Transform__n2':[2,3,4]
     #'Predict__n_estimators':[30,40,50],
     #'Predict__max_features':[2,4,6],
     #'Predict__bootstrap':[False, True]
    }
]

In [225]:
param_grid_2 = [
    {'Transform__fare':[False],
     'Transform__parch':[False],
     'Transform__q1':[False],
     'Transform__q2':[False],
     'Transform__n1':[2],
     'Transform__n2':[2],
     'Predict__penalty':['l1','l2'],
     'Predict__C':[i/100 for i in range(120, 146, 5)],
     'Predict__solver':['liblinear'],
     'Predict__max_iter':[1000],
     'Predict__class_weight':[None, 'balanced'],
     
    }
]

In [222]:
?LogisticRegression

In [227]:
grid_search = GridSearchCV(transf_pred, param_grid_2, cv=5,
                          scoring='accuracy',
                          return_train_score=True)

In [228]:
grid_search.fit(df, df['Survived'])

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('Transform', Final()),
                                       ('Predict',
                                        LogisticRegression(random_state=42,
                                                           solver='liblinear'))]),
             param_grid=[{'Predict__C': [1.2, 1.25, 1.3, 1.35, 1.4, 1.45],
                          'Predict__class_weight': [None, 'balanced'],
                          'Predict__max_iter': [1000],
                          'Predict__penalty': ['l1', 'l2'],
                          'Predict__solver': ['liblinear'],
                          'Transform__fare': [False], 'Transform__n1': [2],
                          'Transform__n2': [2], 'Transform__parch': [False],
                          'Transform__q1': [False], 'Transform__q2': [False]}],
             return_train_score=True, scoring='accuracy')

In [229]:
grid_search.best_params_

{'Predict__C': 1.4,
 'Predict__class_weight': None,
 'Predict__max_iter': 1000,
 'Predict__penalty': 'l2',
 'Predict__solver': 'liblinear',
 'Transform__fare': False,
 'Transform__n1': 2,
 'Transform__n2': 2,
 'Transform__parch': False,
 'Transform__q1': False,
 'Transform__q2': False}

In [230]:
grid_search.best_score_

0.8058251208335949

In [390]:
model = grid_search.best_estimator_

In [193]:
testes = pd.read_csv('../test.csv')

In [194]:
testes['Survived'] = 0

In [195]:
testes = testes[['PassengerId','Survived','Pclass','Name','Sex', 'Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']]

In [196]:
testes['Fare'] = testes['Fare'].fillna(np.mean(testes['Fare']))

In [197]:
predi = model.predict(testes)

In [198]:
Id = testes['PassengerId']

In [199]:
df_pred = pd.DataFrame({'PassengerId':Id, 'Survived':predi})

In [200]:
df_pred.to_csv('RandomFTuned_SeventhTry.csv', index=False)

In [240]:
from sklearn.model_selection import RandomizedSearchCV

In [18]:
from scipy.stats.distributions import uniform,norm

In [304]:
transf_pred = Pipeline([
    ('Transform', Final()),
    ('Predict', LogisticRegression(random_state=42, solver='liblinear'))
])

param_grid_2 = [
    {'Transform__fare':[False],
     'Transform__parch':[False],
     'Transform__q1':[False],
     'Transform__q2':[False],
     'Transform__n1':[2],
     'Transform__n2':[2],
     'Predict__penalty':['l2'],
     'Predict__C':uniform(0.2,3),
     'Predict__intercept_scaling':uniform(0.2,3),
     'Predict__solver':['liblinear'],
    }
]

In [305]:
grid_search = RandomizedSearchCV(transf_pred, param_grid_2, n_iter=100, cv=5,
                          scoring='accuracy',
                          return_train_score=True)

In [248]:
?RandomizedSearchCV

In [306]:
grid_search.fit(df, df['Survived'])

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('Transform', Final()),
                                             ('Predict',
                                              LogisticRegression(random_state=42,
                                                                 solver='liblinear'))]),
                   n_iter=100,
                   param_distributions=[{'Predict__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000166FA0B8190>,
                                         'Predict__intercept_scaling': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000166F8ED1370>,
                                         'Predict__penalty': ['l2'],
                                         'Predict__solver': ['liblinear'],
                                         'Transform__fare': [False],
                                         'Transform__n1': [2],
                                         'Transform__n2': [2],
                              

In [309]:
grid_search.best_score_

0.8058251208335949

In [322]:
from sklearn.svm import SVC

In [380]:
transf_pred = Pipeline([
    ('Transform', Final(q1=False, n2=3, n1=6)),
    ('Predict', SVC(random_state=42, verbose=True))
])

param_grid_2 = [
    {'Predict__kernel':['linear'],
    'Predict__C':uniform(0.1,3)},
    {'Predict__kernel':['rbf','poly','sigmoid'],
    'Predict__C':uniform(0.1,3),
    'Predict__gamma':uniform(0,1)}
]

In [381]:
grid_search = RandomizedSearchCV(transf_pred, param_grid_2, n_iter=80, cv=5,
                          scoring='accuracy',
                          return_train_score=True)

In [382]:
grid_search.fit(df, df['Survived'])

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('Transform',
                                              Final(n1=6, n2=3, q1=False)),
                                             ('Predict',
                                              SVC(random_state=42,
                                                  verbose=True))]),
                   n_iter=80,
                   param_distributions=[{'Predict__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000166F96EA940>,
                                         'Predict__kernel': ['linear']},
                                        {'Predict__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000166F9429DC0>,
                                         'Predict__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000166F9716310>,
                                         'Predict__kernel': ['rbf', 'poly',
                                                             'sigmoid']}

In [400]:
grid_search.best_score_

0.8249011361496453

In [388]:
grid_search.best_params_

{'Predict__C': 0.1933263869684543,
 'Predict__gamma': 0.2690682391492566,
 'Predict__kernel': 'poly'}

In [401]:
model

Pipeline(steps=[('Transform', Final(n1=6, n2=3, q1=False)),
                ('Predict',
                 SVC(C=0.1933263869684543, gamma=0.2690682391492566,
                     kernel='poly', random_state=42, verbose=True))])

{'Transform__q2': True,
 'Transform__q1': False,
 'Transform__parch': True,
 'Transform__n2': 3,
 'Transform__n1': 6,
 'Transform__fare': True,
 'Transform__age': True}

In [63]:
from scipy.stats.distributions import uniform,norm, randint

In [223]:
transf_pred = Pipeline([
    ('Transform', Final(q1=False, n1=4, n2=5, emb=False, sibsp=False)),
    ('Predict', RandomForestClassifier(random_state=42))
])

param_grid = [
    {
        'Predict__n_estimators':randint(140, 190),
        'Predict__min_samples_split':[3,4,5,6,7,8],
        'Predict__min_samples_leaf':[1,2,3,4],
        'Predict__max_depth':[None, 2,3,4,5,6,7]
    }
]

In [222]:
?RandomForestClassifier

In [224]:
grid_search = RandomizedSearchCV(transf_pred, param_grid, n_iter=60, cv=3)

In [225]:
grid_search.fit(df, df['Survived'])

RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('Transform',
                                              Final(emb=False, q1=False,
                                                    sibsp=False)),
                                             ('Predict',
                                              RandomForestClassifier(random_state=42))]),
                   n_iter=60,
                   param_distributions=[{'Predict__max_depth': [None, 2, 3, 4,
                                                                5, 6, 7],
                                         'Predict__min_samples_leaf': [1, 2, 3,
                                                                       4],
                                         'Predict__min_samples_split': [3, 4, 5,
                                                                        6, 7,
                                                                        8],
                                         'Predict__n_est

In [226]:
grid_search.best_params_

{'Predict__max_depth': None,
 'Predict__min_samples_leaf': 1,
 'Predict__min_samples_split': 7,
 'Predict__n_estimators': 151}

In [227]:
grid_search.best_score_

0.8282828282828283

In [192]:
model = grid_search.best_estimator_

In [203]:
model[1].feature_importances_

array([0.48326218, 0.19050194, 0.08765954, 0.15881015, 0.07976619])

In [205]:
import joblib

In [206]:
joblib.dump(model, 'RandomForestTuned.pkl')

['RandomForestTuned.pkl']

{'Predict__min_samples_leaf': 2,
 'Predict__min_samples_split': 4,
 'Predict__n_estimators': 142}

{'Predict__criterion': 'gini',
 'Predict__min_samples_split': 4,
 'Predict__n_estimators': 158} 0.8288

{'Predict__max_features': 'auto',
 'Predict__min_samples_leaf': 1,
 'Predict__min_samples_split': 5,
 'Predict__n_estimators': 168} 0.8271

{'Predict__min_samples_leaf': 1,
 'Predict__min_samples_split': 6,
 'Predict__n_estimators': 147}0.830527497194164