In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler,RobustScaler,StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer,IterativeImputer
from sklearn.feature_selection import SelectKBest,chi2,mutual_info_classif

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier

sns.set_style('whitegrid')
from sklearn.metrics import accuracy_score

In [97]:
train=pd.read_csv('train.csv')
test=pd.read_csv('datasets/test.csv')

In [100]:
submit=pd.DataFrame(test['PassengerId'])
submit.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [101]:
model=[]#7 models
model.append(('Logistic Regression',LogisticRegression(max_iter=1000)))
model.append(('LDA',LinearDiscriminantAnalysis()))
model.append(('SVC',SVC(kernel='rbf')))
model.append(('DTC',DecisionTreeClassifier()))
model.append(('GBC',GradientBoostingClassifier()))
model.append(('RFC',RandomForestClassifier()))
model.append(('Kneig',KNeighborsClassifier()))


x=train.drop('Survived',axis=1)   
y=train['Survived']
xtrain,xvalid,ytrain,yvalid=train_test_split(x,y,test_size=0.3)

In [102]:
scores=[]

for name,models in model:
    pipeline = Pipeline( steps=[( 'scale', MinMaxScaler()), ('model', models)])
    cv=StratifiedKFold(n_splits=10, random_state=21, shuffle=True)
    score=cross_val_score( pipeline, x, y, cv=cv, scoring='accuracy', n_jobs=-1)
    scores.append((name, np.mean(score)))
   
    
scores

[('Logistic Regression', 0.8272159800249688),
 ('LDA', 0.82832709113608),
 ('SVC', 0.8238077403245943),
 ('DTC', 0.7789513108614232),
 ('GBC', 0.8361423220973784),
 ('RFC', 0.8092883895131087),
 ('Kneig', 0.8159300873907617)]

**Classification report**

In [103]:
from sklearn.metrics import classification_report

model=LogisticRegression(max_iter = 3000)
model.fit(xtrain, ytrain)
ypred=model.predict(xvalid)
print(classification_report(yvalid, ypred))

              precision    recall  f1-score   support

         0.0       0.85      0.88      0.86       168
         1.0       0.78      0.74      0.76       100

    accuracy                           0.82       268
   macro avg       0.81      0.81      0.81       268
weighted avg       0.82      0.82      0.82       268



In [104]:
model=RandomForestClassifier()
model.fit(xtrain, ytrain)
ypred=model.predict(xvalid)
print(classification_report(yvalid, ypred))

              precision    recall  f1-score   support

         0.0       0.84      0.85      0.85       168
         1.0       0.74      0.73      0.74       100

    accuracy                           0.81       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.81      0.81      0.81       268



# 7.Model classification using voting Classifier

In [105]:
estimator = []
estimator.append(('LR', GradientBoostingClassifier()))
estimator.append(('SVC', RandomForestClassifier()))
estimator.append(('kd', LogisticRegression(max_iter=3000)))


  
# Voting Classifier with hard voting
vot_hard = VotingClassifier(estimators = estimator, voting ='hard')
vot_hard.fit(xtrain, ytrain)
ypred=vot_hard.predict(xvalid)
print(classification_report(yvalid,ypred))

              precision    recall  f1-score   support

         0.0       0.87      0.88      0.87       168
         1.0       0.79      0.77      0.78       100

    accuracy                           0.84       268
   macro avg       0.83      0.83      0.83       268
weighted avg       0.84      0.84      0.84       268



In [106]:
pipeline=Pipeline(steps=[('scale',MinMaxScaler()),('jj',SelectKBest(score_func=mutual_info_classif,k=6)),('model',VotingClassifier(estimators = estimator, voting ='soft'))])
cv=StratifiedKFold(n_splits=10,random_state=21,shuffle=True)
pipeline.fit(x,y)
ypred=pipeline.predict(xvalid)
print(classification_report(yvalid,ypred))

              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95       168
         1.0       0.98      0.84      0.90       100

    accuracy                           0.93       268
   macro avg       0.94      0.91      0.93       268
weighted avg       0.94      0.93      0.93       268



# 8. Hyperparameter tuning 


**5.1. KNeighborsClassifier**

In [107]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint


param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, x.shape[1]),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "n_estimators": sp_randint(100, 500)}

random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(x,y)
print(random_search.best_params_)

{'bootstrap': True, 'max_depth': None, 'max_features': 13, 'min_samples_split': 9, 'n_estimators': 288}


In [108]:
gb_grid_params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6, 8],
              'min_samples_leaf': [20, 50,100,150],
              'max_features': [1.0, 0.3, 0.1] 
              }
print(gb_grid_params)

gb_gs = GradientBoostingClassifier(n_estimators = 600)

clf =RandomizedSearchCV(gb_gs,
                               gb_grid_params,
                               cv=2,
                               scoring='accuracy', 
                               n_jobs=10);
clf.fit(x,y)
print(clf.best_params_)

{'learning_rate': [0.1, 0.05, 0.02, 0.01], 'max_depth': [4, 6, 8], 'min_samples_leaf': [20, 50, 100, 150], 'max_features': [1.0, 0.3, 0.1]}
{'min_samples_leaf': 20, 'max_features': 1.0, 'max_depth': 8, 'learning_rate': 0.01}


In [109]:
estimator = []
estimator.append(('LR', clf))
estimator.append(('SVC', random_search))


  
# Voting Classifier with hard voting
vot_hard = VotingClassifier(estimators = estimator, voting ='hard')
vot_hard.fit(xtrain, ytrain)
ypred=vot_hard.predict(xvalid)
print(classification_report(yvalid,ypred))

              precision    recall  f1-score   support

         0.0       0.85      0.95      0.90       168
         1.0       0.90      0.71      0.79       100

    accuracy                           0.86       268
   macro avg       0.87      0.83      0.84       268
weighted avg       0.87      0.86      0.86       268



# 9.Testing

In [110]:

test['Sex'].replace({'male':0,'female':1},inplace=True)
test['Embarked'].replace({'S':1,'C':2,'Q':3},inplace=True)
test['title']=0
test['Fare']=np.sqrt(test['Fare'])
for i in range(0,len(test)):
    test.loc[i,'title']=test['Name'].iloc[i].split(',')[1].split('.')[0][1:]
test['title'].replace({'Mr':1,'Miss':2,'Mrs':2,'Master':3,'Dr':4,'Rev':5},inplace=True)
test['title'].replace(['Major','Mlle','Col','Don','the Countess','Sir','Capt','Mme','Lady','Jonkheer','Ms','Dona'],7,inplace=True)
test['family']=test['SibSp']+test['Parch']+1
test['family']=test['family'].map(family)

for i in range(len(test)):
    if not(pd.isnull(test['Cabin'].iloc[i])):
        test.loc[i,'Cabin']=test['Cabin'].loc[i][0]
test.drop(['Name','SibSp','Parch','Ticket','PassengerId','Cabin'],axis=1,inplace=True)

test=pd.get_dummies(test,columns=['Pclass','Embarked','title','family'],drop_first=True)

test=pd.DataFrame(impute.fit_transform(test),columns=test.columns)

In [112]:
submit['Survived']=pipeline.predict(test).astype(int)
submit.to_csv('ver.csv',index=False)
submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
