In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Linear models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor  

# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor 

#Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Support Vector Machines
from sklearn.svm import SVC  # classifier
from sklearn.svm import SVR  # regression

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor  

# Naive Bayes
from sklearn.naive_bayes import GaussianNB, MultinomialNB  

#Encoding
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

#Scaling

from sklearn.preprocessing import StandardScaler #(Standarization)
from sklearn.preprocessing import MinMaxScaler #Normalization 

#Train_Test_Split
from sklearn.model_selection import train_test_split

#Metrices 
from sklearn.metrics import recall_score,precision_score,accuracy_score,f1_score,confusion_matrix,classification_report

#Tuning Model
from sklearn.model_selection import GridSearchCV #Automatically find the best combination of hyperparameters for a model.

In [3]:
df=sns.load_dataset("titanic")

In [4]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [7]:
df.drop(['deck','embark_town','alive','class','who','adult_male'],axis=1,inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   alone     891 non-null    bool   
dtypes: bool(1), float64(2), int64(4), object(2)
memory usage: 56.7+ KB


In [9]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,0,3,male,22.0,1,0,7.25,S,False
1,1,1,female,38.0,1,0,71.2833,C,False
2,1,3,female,26.0,0,0,7.925,S,True
3,1,1,female,35.0,1,0,53.1,S,False
4,0,3,male,35.0,0,0,8.05,S,True


In [10]:
df['age'].fillna(df['age'].mean(),inplace=True)

In [11]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,0,3,male,22.0,1,0,7.25,S,False
1,1,1,female,38.0,1,0,71.2833,C,False
2,1,3,female,26.0,0,0,7.925,S,True
3,1,1,female,35.0,1,0,53.1,S,False
4,0,3,male,35.0,0,0,8.05,S,True


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       891 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   alone     891 non-null    bool   
dtypes: bool(1), float64(2), int64(4), object(2)
memory usage: 56.7+ KB


In [13]:
df.dropna(subset=['embarked'],inplace=True) # remvoving null valued row,we could also fill with mean but we just removed it

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  889 non-null    int64  
 1   pclass    889 non-null    int64  
 2   sex       889 non-null    object 
 3   age       889 non-null    float64
 4   sibsp     889 non-null    int64  
 5   parch     889 non-null    int64  
 6   fare      889 non-null    float64
 7   embarked  889 non-null    object 
 8   alone     889 non-null    bool   
dtypes: bool(1), float64(2), int64(4), object(2)
memory usage: 63.4+ KB


In [15]:
#Encoding
# df=pd.get_dummies(df,columns=['embarked'],drop_first=True)


In [16]:
# df['sex']=df['sex'].map({'male':0,'female':1})

In [17]:
#Encoding
le=LabelEncoder() # ordinal 
# oe=OneHotEncoder() # no-ordinal(no natural order)
df['embarked']=le.fit_transform(df['embarked'])
df['sex']=le.fit_transform(df['sex'])


In [18]:
df=df.astype(int)

In [19]:
#Standard Scaling
s_scaler=StandardScaler()
s_scaler.fit_transform(df)

array([[-0.78696114,  0.82520863,  0.73534203, ..., -0.49673282,
         0.58683958, -1.22934919],
       [ 1.27071078, -1.57221121, -1.35991138, ...,  0.79153916,
        -1.93955453, -1.22934919],
       [ 1.27071078,  0.82520863, -1.35991138, ..., -0.49673282,
         0.58683958,  0.81343853],
       ...,
       [-0.78696114,  0.82520863, -1.35991138, ..., -0.17466483,
         0.58683958, -1.22934919],
       [ 1.27071078, -1.57221121,  0.73534203, ..., -0.03376008,
        -1.93955453,  0.81343853],
       [-0.78696114,  0.82520863,  0.73534203, ..., -0.49673282,
        -0.67635748,  0.81343853]], shape=(889, 9))

In [20]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,0,3,1,22,1,0,7,2,0
1,1,1,0,38,1,0,71,0,0
2,1,3,0,26,0,0,7,2,1
3,1,1,0,35,1,0,53,2,0
4,0,3,1,35,0,0,8,2,1


In [21]:
#Normalization
n_scaler=MinMaxScaler()
n_scaler.fit_transform(df)

array([[0.        , 1.        , 1.        , ..., 0.01367188, 1.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.13867188, 0.        ,
        0.        ],
       [1.        , 1.        , 0.        , ..., 0.01367188, 1.        ,
        1.        ],
       ...,
       [0.        , 1.        , 0.        , ..., 0.04492188, 1.        ,
        0.        ],
       [1.        , 0.        , 1.        , ..., 0.05859375, 0.        ,
        1.        ],
       [0.        , 1.        , 1.        , ..., 0.01367188, 0.5       ,
        1.        ]], shape=(889, 9))

In [22]:
X=df.drop("survived",axis=1)
y=df['survived']

In [23]:
#Train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [24]:
#Model Selection

model_logistic=LogisticRegression()
model_logistic.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [25]:
y_predict=model_logistic.predict(X_test)
print(y_predict)

[0 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1
 1 0 0 0 1 0 0 0 0 1 0 0 1 1 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 1 1 0
 1 1 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0
 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 0 0 1
 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 0 0 1]


In [26]:
df_compare=pd.DataFrame({
    'y':y_test.values,
    'y_predict':y_predict
})
print(df_compare)

     y  y_predict
0    0          0
1    1          1
2    1          1
3    0          0
4    1          1
..  ..        ...
173  0          0
174  0          1
175  1          0
176  1          0
177  1          1

[178 rows x 2 columns]


In [27]:
#Metrices
print('recall:',recall_score(y_test,y_predict))
print('precision:',precision_score(y_test,y_predict))
print('accuracy:',accuracy_score(y_test,y_predict))
print('f1_score:',f1_score(y_test,y_predict))
print()
print('confusion_matrix:\n',confusion_matrix(y_test,y_predict))
print()
print('classification_report:\n',classification_report(y_test,y_predict))


recall: 0.7681159420289855


precision: 0.7361111111111112
accuracy: 0.8033707865168539
f1_score: 0.75177304964539

confusion_matrix:
 [[90 19]
 [16 53]]

classification_report:
               precision    recall  f1-score   support

           0       0.85      0.83      0.84       109
           1       0.74      0.77      0.75        69

    accuracy                           0.80       178
   macro avg       0.79      0.80      0.79       178
weighted avg       0.81      0.80      0.80       178



In [28]:
#Train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [29]:
model_tree=DecisionTreeClassifier(
    criterion='gini',         # or 'entropy'
    max_depth=5,              # maximum depth of tree
    min_samples_split=4,      # min samples to split a node
    min_samples_leaf=2,       # min samples at a leaf
    max_features=None,        # number of features to consider at each split
    random_state=42
)
model_tree.fit(X_train,y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,4
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [30]:
y_predict_tree=model_tree.predict(X_test)
print(y_predict_tree)

[0 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1
 0 0 0 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 1 1 0
 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0
 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 0 0 1
 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 1 0 0 1]


In [31]:
#Metrices
print("DECISION TREE METRICES\n")
print('recall:',recall_score(y_test,y_predict_tree))
print('precision:',precision_score(y_test,y_predict_tree))
print('accuracy:',accuracy_score(y_test,y_predict_tree))
print('f1_score:',f1_score(y_test,y_predict_tree))
print()
print('confusion_matrix:\n',confusion_matrix(y_test,y_predict_tree))
print()
print('classification_report:\n',classification_report(y_test,y_predict_tree))

DECISION TREE METRICES

recall: 0.7536231884057971
precision: 0.7222222222222222
accuracy: 0.7921348314606742
f1_score: 0.7375886524822695

confusion_matrix:
 [[89 20]
 [17 52]]

classification_report:
               precision    recall  f1-score   support

           0       0.84      0.82      0.83       109
           1       0.72      0.75      0.74        69

    accuracy                           0.79       178
   macro avg       0.78      0.79      0.78       178
weighted avg       0.79      0.79      0.79       178



In [32]:
model_forest=RandomForestClassifier(
    n_estimators=200,        # Number of trees
    max_depth=5,             # Maximum depth of each tree
    min_samples_split=4,     # Minimum samples required to split a node
    min_samples_leaf=2,      # Minimum samples required at a leaf node
    max_features='sqrt',     # Number of features to consider at each split
    bootstrap=True,          # Use bootstrap samples
    random_state=42)
model_forest.fit(X_train,y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,5
,min_samples_split,4
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
y_predict_forest=model_forest.predict(X_test)
print(y_predict_forest)

[0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 1 1 0
 0 1 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0
 0 0 1 0 0 0 0 1 0 1 1 0 0 1 1 0 1 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 0 1 0 0 1
 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 1 0 0 1]


In [34]:
#Metrices
print("Random Forest METRICES\n")
print('recall:',recall_score(y_test,y_predict_forest))
print('precision:',precision_score(y_test,y_predict_forest))
print('accuracy:',accuracy_score(y_test,y_predict_forest))
print('f1_score:',f1_score(y_test,y_predict_forest))
print()
print('confusion_matrix:\n',confusion_matrix(y_test,y_predict_forest))
print()
print('classification_report:\n',classification_report(y_test,y_predict_forest))

Random Forest METRICES

recall: 0.7101449275362319
precision: 0.7903225806451613
accuracy: 0.8146067415730337
f1_score: 0.7480916030534351

confusion_matrix:
 [[96 13]
 [20 49]]

classification_report:
               precision    recall  f1-score   support

           0       0.83      0.88      0.85       109
           1       0.79      0.71      0.75        69

    accuracy                           0.81       178
   macro avg       0.81      0.80      0.80       178
weighted avg       0.81      0.81      0.81       178



In [35]:
model_gradient=GradientBoostingClassifier(
    n_estimators=200,       # number of trees
    learning_rate=0.1,      # shrinkage factor
    max_depth=3,            # max depth of each tree
    min_samples_split=2,    # min samples to split
    min_samples_leaf=1,     # min samples at a leaf
    subsample=0.8,          # fraction of samples for each tree
    max_features='sqrt',    # number of features per split
    random_state=42
)
model_gradient.fit(X_train,y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,200
,subsample,0.8
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [36]:
y_predict_gradient=model_gradient.predict(X_test)
print(y_predict_gradient)

[0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1
 1 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 1 0
 0 1 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0
 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 0 1
 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 0 0 1]


In [37]:
#Metrices
print("Gradient Boosting METRICES\n")
print('recall:',recall_score(y_test,y_predict_gradient))
print('precision:',precision_score(y_test,y_predict_gradient))
print('accuracy:',accuracy_score(y_test,y_predict_gradient))
print('f1_score:',f1_score(y_test,y_predict_gradient))
print()
print('confusion_matrix:\n',confusion_matrix(y_test,y_predict_gradient))
print()
print('classification_report:\n',classification_report(y_test,y_predict_gradient))

Gradient Boosting METRICES

recall: 0.7536231884057971
precision: 0.7536231884057971
accuracy: 0.8089887640449438
f1_score: 0.7536231884057971

confusion_matrix:
 [[92 17]
 [17 52]]

classification_report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84       109
           1       0.75      0.75      0.75        69

    accuracy                           0.81       178
   macro avg       0.80      0.80      0.80       178
weighted avg       0.81      0.81      0.81       178



In [38]:
param_grid_tree = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': [None, 'sqrt', 'log2']
}

param_grid_forest = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

param_grid_gradient = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [2, 3, 4, 5],
    'subsample': [0.8, 1.0]
}

# GridSearchCV_model_tree
grid_search_tree = GridSearchCV(
    estimator=model_tree,
    param_grid=param_grid_tree,
    cv=5,           # 5-fold cross-validation
    scoring='accuracy',  # metric to optimize
    n_jobs=-1       # use all CPU cores
)

# GridSearchCV
grid_search_forest = GridSearchCV(
    estimator=model_forest,
    param_grid=param_grid_forest,
    cv=5,           # 5-fold cross-validation
    scoring='accuracy',  # metric to optimize
    n_jobs=-1       # use all CPU cores
)

# GridSearchCV
grid_search_gradient = GridSearchCV(
    estimator=model_gradient,
    param_grid=param_grid_gradient,
    cv=5,           # 5-fold cross-validation
    scoring='accuracy',  # metric to optimize
    n_jobs=-1       # use all CPU cores
)

# Fit on training data
grid_search_tree.fit(X_train, y_train)
grid_search_forest.fit(X_train, y_train)
grid_search_gradient.fit(X_train, y_train)

# Best hyperparameters
print("Decision Tree Best Hyperparameters:", grid_search_tree.best_params_)
print("Random Forest Best Hyperparameters:", grid_search_forest.best_params_)
print("Gradient Boosting Best Hyperparameters:", grid_search_gradient.best_params_)

# Best accuracy
print("Decision Tree Best Accuracy:", grid_search_tree.best_score_)
print("Random Forest Best Accuracy:", grid_search_forest.best_score_)
print("Gradient Boosting Best Accuracy:", grid_search_gradient.best_score_)


Decision Tree Best Hyperparameters: {'criterion': 'gini', 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 15}
Random Forest Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Gradient Boosting Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 150, 'subsample': 1.0}
Decision Tree Best Accuracy: 0.8367871565054663
Random Forest Best Accuracy: 0.8438491086378409
Gradient Boosting Best Accuracy: 0.832640598837782


In [39]:
y_pred_tree=model_tree.predict(X_test)
print(y_pred_tree)

[0 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1
 0 0 0 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 1 1 0
 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0
 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 0 0 1
 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 1 0 0 1]


In [40]:
print(f"Accuracy        : {accuracy_score(y_test,y_pred_tree):.4f}")
print(f"Precision       : {precision_score(y_test,y_pred_tree):.4f}")
print(f"Recall          : {recall_score(y_test,y_pred_tree):.4f}")
print(f"F1 Score        : {f1_score(y_test,y_pred_tree):.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test,y_pred_tree))

print("\nClassification Report:")
print(classification_report(y_test,y_pred_tree))


Accuracy        : 0.7921
Precision       : 0.7222
Recall          : 0.7536
F1 Score        : 0.7376

Confusion Matrix:
[[89 20]
 [17 52]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       109
           1       0.72      0.75      0.74        69

    accuracy                           0.79       178
   macro avg       0.78      0.79      0.78       178
weighted avg       0.79      0.79      0.79       178

