In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, classification_report


# classificators
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier

## Preparing Data

In [2]:
titanic = pd.read_csv('./titanic.csv')
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home.dest
0,3,0,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,75500,,S,
1,3,0,"Abbott, Master. Eugene Joseph",male,13,0,2,C.A. 2673,202500,,S,"East Providence, RI"
2,3,0,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,202500,,S,"East Providence, RI"
3,3,1,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,202500,,S,"East Providence, RI"
4,3,1,"Abelseth, Miss. Karen Marie",female,16,0,0,348125,76500,,S,"Norway Los Angeles, CA"


In [3]:
titanic.shape

(1046, 12)

In [4]:
refactor_1 = titanic.drop(['name','ticket'], axis=1) # drop NAME and TICKET NUMBER

In [5]:
refactor_1.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,home.dest
0,3,0,male,42,0,0,75500,,S,
1,3,0,male,13,0,2,202500,,S,"East Providence, RI"
2,3,0,male,16,1,1,202500,,S,"East Providence, RI"
3,3,1,female,35,1,1,202500,,S,"East Providence, RI"
4,3,1,female,16,0,0,76500,,S,"Norway Los Angeles, CA"


In [6]:
refactor_2 = refactor_1
refactor_2['sex'] = refactor_1.sex.map({"male":1,"female":0}) # male - 1, female - 0

In [7]:
refactor_2.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,home.dest
0,3,0,1,42,0,0,75500,,S,
1,3,0,1,13,0,2,202500,,S,"East Providence, RI"
2,3,0,1,16,1,1,202500,,S,"East Providence, RI"
3,3,1,0,35,1,1,202500,,S,"East Providence, RI"
4,3,1,0,16,0,0,76500,,S,"Norway Los Angeles, CA"


In [8]:
null_counts = refactor_2.isnull().sum()
print("Number of null values in each columns:\n{}".format(null_counts))

Number of null values in each columns:
pclass         0
survived       0
sex            0
age            0
sibsp          0
parch          0
fare           1
cabin        774
embarked       2
home.dest    361
dtype: int64


In [9]:
refactor_3 = refactor_2.drop(['home.dest','cabin'], axis=1) # drop CABIN and HOME.DEST number
refactor_3.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,3,0,1,42,0,0,75500,S
1,3,0,1,13,0,2,202500,S
2,3,0,1,16,1,1,202500,S
3,3,1,0,35,1,1,202500,S
4,3,1,0,16,0,0,76500,S


In [10]:
refactor_3.embarked.value_counts()

S    782
C    212
Q     50
Name: embarked, dtype: int64

In [11]:
refactor_4 = refactor_3
refactor_4.embarked = refactor_1.embarked.map({"S":0,"C":1,"Q":2}) # S(Southampton) - 0, C(Cherbourg) - 1, Q(Queenstown) - 2

In [12]:
refactor_4.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,3,0,1,42,0,0,75500,0.0
1,3,0,1,13,0,2,202500,0.0
2,3,0,1,16,1,1,202500,0.0
3,3,1,0,35,1,1,202500,0.0
4,3,1,0,16,0,0,76500,0.0


In [13]:
refactor_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046 entries, 0 to 1045
Data columns (total 8 columns):
pclass      1046 non-null int64
survived    1046 non-null int64
sex         1046 non-null int64
age         1046 non-null object
sibsp       1046 non-null int64
parch       1046 non-null int64
fare        1045 non-null object
embarked    1044 non-null float64
dtypes: float64(1), int64(5), object(2)
memory usage: 65.4+ KB


In [14]:
refactor_5 = refactor_4
refactor_5.age = pd.to_numeric(pd.Series(refactor_4.age).str.replace(',','.'))
refactor_5.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,3,0,1,42.0,0,0,75500,0.0
1,3,0,1,13.0,0,2,202500,0.0
2,3,0,1,16.0,1,1,202500,0.0
3,3,1,0,35.0,1,1,202500,0.0
4,3,1,0,16.0,0,0,76500,0.0


In [15]:
refactor_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046 entries, 0 to 1045
Data columns (total 8 columns):
pclass      1046 non-null int64
survived    1046 non-null int64
sex         1046 non-null int64
age         1046 non-null float64
sibsp       1046 non-null int64
parch       1046 non-null int64
fare        1045 non-null object
embarked    1044 non-null float64
dtypes: float64(2), int64(5), object(1)
memory usage: 65.4+ KB


In [16]:
refactor_6 = refactor_5
refactor_6.fare = pd.to_numeric(pd.Series(refactor_5.fare).str.replace(',','.'))
refactor_6.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,3,0,1,42.0,0,0,7.55,0.0
1,3,0,1,13.0,0,2,20.25,0.0
2,3,0,1,16.0,1,1,20.25,0.0
3,3,1,0,35.0,1,1,20.25,0.0
4,3,1,0,16.0,0,0,7.65,0.0


In [17]:
refactor_6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046 entries, 0 to 1045
Data columns (total 8 columns):
pclass      1046 non-null int64
survived    1046 non-null int64
sex         1046 non-null int64
age         1046 non-null float64
sibsp       1046 non-null int64
parch       1046 non-null int64
fare        1045 non-null float64
embarked    1044 non-null float64
dtypes: float64(3), int64(5)
memory usage: 65.4 KB


In [18]:
null_counts = refactor_6.isna().sum()
print("Number of NaN values in each columns:\n{}".format(null_counts))

Number of NaN values in each columns:
pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        1
embarked    2
dtype: int64


In [19]:
refactor_6.shape

(1046, 8)

In [20]:
refactor_7 = refactor_6
refactor_7.dropna(inplace=True)

print(refactor_7.shape)
null_counts = refactor_7.isna().sum()
print("Number of NaN values in each columns:\n{}".format(null_counts))

(1043, 8)
Number of NaN values in each columns:
pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64


In [21]:
titanic_prepared = refactor_7

In [22]:
titanic_prepared.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,3,0,1,42.0,0,0,7.55,0.0
1,3,0,1,13.0,0,2,20.25,0.0
2,3,0,1,16.0,1,1,20.25,0.0
3,3,1,0,35.0,1,1,20.25,0.0
4,3,1,0,16.0,0,0,7.65,0.0


## Modeling Data / Model Comparison

In [23]:
X = titanic_prepared.drop('survived', axis=1).values
y = titanic_prepared.survived.values

In [24]:
X

array([[ 3.   ,  1.   , 42.   , ...,  0.   ,  7.55 ,  0.   ],
       [ 3.   ,  1.   , 13.   , ...,  2.   , 20.25 ,  0.   ],
       [ 3.   ,  1.   , 16.   , ...,  1.   , 20.25 ,  0.   ],
       ...,
       [ 3.   ,  1.   , 26.5  , ...,  0.   ,  7.225,  1.   ],
       [ 3.   ,  1.   , 27.   , ...,  0.   ,  7.225,  1.   ],
       [ 3.   ,  1.   , 29.   , ...,  0.   ,  7.875,  0.   ]])

In [25]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [26]:
X_test, X_train, y_test,y_train = train_test_split(X,y, test_size=500, random_state=123)

In [35]:
models = {
#     "logreg":{
#         "pipe":Pipeline([
#             ("standarization",StandardScaler()),
#             ("logreg",LogisticRegression())
#         ]),
#         "param_grid":{
#             "standarization__with_mean":[False,True],
#             "logreg__penalty":["l1","l2"],
#             "logreg__C":[10,1,0.1,0.01]
#         }
#     },
#     "tree":{
#          "pipe":Pipeline([
#             ("tree",DecisionTreeClassifier())
#          ]),
#         "param_grid":{
#             "tree__max_depth":[3,5,7,10,20],
#             "tree__min_samples_leaf":[2,3,5,10,15]
#         }
#     },
#     "bayes":{
#         "pipe": Pipeline([
#             ("bayes",MultinomialNB())
#         ]),
#         "param_grid":{}
#     },
    "svc":{
        "pipe":Pipeline([
            ("standarization",StandardScaler()),
            ("svc",SVC())
        ]),
        "param_grid": [
#             {
#                 "standarization__with_mean":[False,True],
#                 "svc__kernel":['linear'],
#                 "svc__C": [1, 10, 100, 1000]
#             },
#             {
#                 "standarization__with_mean":[False,True],
#                 "svc__kernel":['rbf'],
#                 "svc__C": [1, 10, 100, 1000],
#                 "svc__gamma": [0.001, 0.0001]
#             },
#             {
#                 "standarization__with_mean":[False,True],
#                 "svc__kernel":['poly'],
#                 "svc__C": [1, 10, 100, 1000],
#                 "svc__degree": [1, 2,3,4,5]
#             },
            { 
                "standarization__with_mean":[False,True],
                "svc__kernel":['sigmoid'],
                "svc__C": [1, 10, 100]}
        ]
    },
    "random": {
        "pipe": Pipeline([
            ('random',RandomForestClassifier())]),
        "param_grid":{
            'random__n_estimators':[10,20,50]
        }
    },
    
    "bagging":{
        "pipe":Pipeline([
            ("bagging",BaggingClassifier())])
        ,
        "param_grid":{
            'bagging__n_estimators':[10,20,50,75],
            'bagging__max_samples':[1.0,0.5,0.3],
            'bagging__max_features':[1.0,0.5,0.3]
        } 
    }
}

In [36]:
def compareModels(models):
    results = {}
    for key in models:
        
        pipe = models[key]['pipe']
        param_grid = models[key]['param_grid']
    
        gs = GridSearchCV(pipe, param_grid, cv = 5, n_jobs=8)
        gs.fit(X_train,y_train)
        
        print(key)
    
        accuracy = accuracy_score(gs.best_estimator_.predict(X_test),y_test)
        best_params = str(gs.best_estimator_.get_params())
        f1 =  f1_score(gs.best_estimator_.predict(X_test),y_test)
    
        results[key] = {"accuracy":accuracy,"best_params":best_params, "f1":f1}
        
    return results

In [37]:
results = compareModels(models)

bagging
random
svc
logreg


In [34]:
import json
print(json.dumps(results, indent=4))

{
    "bagging": {
        "best_params": "{'bagging': BaggingClassifier(base_estimator=None, bootstrap=True,\n         bootstrap_features=False, max_features=1.0, max_samples=0.3,\n         n_estimators=50, n_jobs=1, oob_score=False, random_state=None,\n         verbose=0, warm_start=False), 'bagging__warm_start': False, 'bagging__random_state': None, 'bagging__bootstrap_features': False, 'bagging__max_samples': 0.3, 'bagging__verbose': 0, 'bagging__n_jobs': 1, 'bagging__n_estimators': 50, 'steps': [('bagging', BaggingClassifier(base_estimator=None, bootstrap=True,\n         bootstrap_features=False, max_features=1.0, max_samples=0.3,\n         n_estimators=50, n_jobs=1, oob_score=False, random_state=None,\n         verbose=0, warm_start=False))], 'bagging__base_estimator': None, 'memory': None, 'bagging__oob_score': False, 'bagging__max_features': 1.0, 'bagging__bootstrap': True}", 
        "f1": 0.7355163727959697, 
        "accuracy": 0.8066298342541437
    }, 
    "random": {
    