In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


train = pd.read_csv(r"E:\datasets\titanic\train.csv")
test = pd.read_csv(r"E:\datasets\titanic\test.csv")
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [2]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
drop_feat = ["Cabin","Ticket","Name","PassengerId"]
numeric_feat = ["Age","SibSp","Parch","Fare"]
categorical_feat = ["Pclass","Sex","Embarked"]

In [4]:
train.Embarked.fillna("S",inplace = True)
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

train_set,val_set= train_test_split(train,test_size=0.2,random_state =42)

train_y = pd.DataFrame(train_set["Survived"])
train_X = train_set.drop("Survived",axis =1)

val_y = pd.DataFrame(val_set["Survived"])
val_X = val_set.drop("Survived",axis =1)

y = pd.DataFrame(train["Survived"])

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_transformer = Pipeline(steps = [
    ('medimputer',SimpleImputer(strategy="median")),
    ("stdscaler",StandardScaler())
])

cat_transformer = Pipeline([
    ("1_hot",OneHotEncoder(handle_unknown="ignore"))
])

col_transformer = ColumnTransformer(transformers = [
    ("drop_col","drop",drop_feat),
    ("num",num_transformer,numeric_feat),
    ("cat",cat_transformer,categorical_feat)
])

data_pipeline = Pipeline([
    ("transform_col",col_transformer)
])

In [7]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#svc_clf  = SVC(kernel="rbf",C=10)
rnd_clf = RandomForestClassifier()
params = {'n_estimators': [300, 350, 400,100],
          'max_depth': [3,4,5,7],
          'criterion':['gini'],
          'min_samples_leaf' : [1, 3, 5],
          'max_features':['auto'],
          'min_samples_split': [5, 10, 15,20],
          'max_leaf_nodes':[3,5,6,7,],
          }
clf = GridSearchCV(rnd_clf ,param_grid=params, cv=10, n_jobs=-1)

In [8]:
data_pipeline.fit(train_X)
train_X_ = data_pipeline.transform(train_X)

clf.fit(train_X_,train_y)


  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [3, 4, 5, 7],
                         'max_features': ['auto'],
                         'max_leaf_nodes': [3, 5, 6, 7],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [5, 10, 15, 20],
                         'n_estimators': [300, 350, 400, 100]})

In [9]:
print(clf.best_estimator_)
print(clf.best_score_)

RandomForestClassifier(max_depth=7, max_leaf_nodes=7, min_samples_split=5,
                       n_estimators=350)
0.820226917057903


In [16]:
data_pipeline.fit(val_X)
val_X_ = data_pipeline.transform(val_X)
y_pred = clf.predict(val_X_)

y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1], dtype=int64)

In [17]:
import sklearn.metrics as m

m.confusion_matrix(val_y,y_pred)


array([[95, 10],
       [26, 48]], dtype=int64)

In [18]:
m.precision_score(val_y,y_pred)

0.8275862068965517

In [19]:
m.accuracy_score(val_y,y_pred)

0.7988826815642458

def get_models():
    models = dict()
    models['LGBMClassifier'] = LGBMClassifier()
    models['LogisticRegression'] = LogisticRegression()
    models['DecisionTree'] = DecisionTreeClassifier(max_depth=8) #Tuned
    models['RandomForest'] = RandomForestClassifier(max_depth=32) #Tuned
    models['GradientBoosting'] = GradientBoostingClassifier(max_depth=5) #Tuned
    models['svc'] = SVC(C=100, gamma=0.001, kernel='sigmoid') #Tuned
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores