In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

titanic_train=pd.read_csv("train.csv")
titanic_test=pd.read_csv("test.csv")

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2)
for train_indices,test_indices in split.split(titanic_train,titanic_train[["Survived","Pclass","Sex"]]):
    strat_train_set=titanic_train.loc[train_indices]
    strat_test_set=titanic_train.loc[test_indices]

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class AgeImputer (BaseEstimator, TransformerMixin):
    def fit (self,X,y=None):
        return self
    def transform (self,X):
        imputer=SimpleImputer(strategy="mean")
        X["Age"]=imputer.fit_transform(X[["Age"]])
        return X

In [4]:
from sklearn.preprocessing import OneHotEncoder
class FeatureEncoder (BaseEstimator, TransformerMixin):
    def fit (self,X,y=None):
        return self
    def transform (self,X):
        encoder=OneHotEncoder()
        matrix=encoder.fit_transform(X[["Sex"]]).toarray()
        colum_names=["Female","Male"]
        for i in range(len(matrix.T)):
            X[colum_names[i]]=matrix.T[i]
        return X

In [5]:
class FeatureEncoder2 (BaseEstimator, TransformerMixin):
    def fit (self,X,y=None):
        return self
    def transform (self,X):
        encoder=OneHotEncoder()
        matrix=encoder.fit_transform(X[["Pclass"]]).toarray()
        colum_names=["Pclass1","Pclass2","Pclass3"]
        for i in range(len(matrix.T)):
            X[colum_names[i]]=matrix.T[i]
        return X

In [6]:
class FeatureEncoder3 (BaseEstimator, TransformerMixin):
    def fit (self,X,y=None):
        return self
    def transform (self,X):
        X["Cabin"]=X["Cabin"].fillna("b")
        X["Cabin"]=X["Cabin"].str[0]
        encoder=OneHotEncoder()
        matrix=encoder.fit_transform(X[["Cabin"]]).toarray()
        colum_names=X["Cabin"].unique()
        for i in range(len(matrix.T)):
            X[colum_names[i]]=matrix.T[i]
        return X

In [7]:
class FeatureDropper (BaseEstimator, TransformerMixin):
    def fit (self,X,y=None):
        return self
    def transform (self,X):
        return X.drop(["Sex","Embarked","Ticket","Name","PassengerId","Cabin"],axis=1,errors="ignore")

In [8]:
from sklearn.pipeline import Pipeline

pipeline=Pipeline([("featureencoder",FeatureEncoder()),
                  ("ageimputer",AgeImputer()),
                  ("featureencoder2",FeatureEncoder2()),
                  ("featureencoder3",FeatureEncoder3()),
                  ("featuredropper",FeatureDropper())])

In [9]:
strat_train_set=pipeline.fit_transform(strat_train_set)
strat_train_set

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Female,Male,Pclass1,Pclass2,Pclass3,B,D,b,C,F,E,A,G,T
802,1,1,11.0000,1,2,120.0000,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52,1,1,49.0000,1,0,76.7292,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
605,0,3,36.0000,1,0,15.5500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
879,1,1,56.0000,0,1,83.1583,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17,1,2,29.5932,0,0,13.0000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,1,3,29.5932,0,0,7.7500,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
414,1,3,44.0000,0,0,7.9250,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
838,1,3,32.0000,0,0,56.4958,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
78,1,2,0.8300,0,2,29.0000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
strat_train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 802 to 675
Data columns (total 20 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Age       712 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Female    712 non-null    float64
 7   Male      712 non-null    float64
 8   Pclass1   712 non-null    float64
 9   Pclass2   712 non-null    float64
 10  Pclass3   712 non-null    float64
 11  B         712 non-null    float64
 12  D         712 non-null    float64
 13  b         712 non-null    float64
 14  C         712 non-null    float64
 15  F         712 non-null    float64
 16  E         712 non-null    float64
 17  A         712 non-null    float64
 18  G         712 non-null    float64
 19  T         712 non-null    float64
dtypes: float64(16), int64(4)
memor

In [11]:
from sklearn.preprocessing import StandardScaler

X=strat_train_set.drop(["Survived"],axis=1)
y=strat_train_set["Survived"]

scaler=StandardScaler()
X_data=scaler.fit_transform(X)
y_data=y.to_numpy()

In [12]:
X_data

array([[-1.56828591, -1.43125208,  0.41418554, ..., -0.0751646 ,
        -0.03750293, -1.8649363 ],
       [-1.56828591,  1.49388072,  0.41418554, ..., -0.0751646 ,
        -0.03750293, -1.8649363 ],
       [ 0.82789342,  0.49317739,  0.41418554, ..., -0.0751646 ,
        -0.03750293,  0.53621134],
       ...,
       [ 0.82789342,  0.18526867, -0.4687489 , ..., -0.0751646 ,
        -0.03750293,  0.53621134],
       [-0.37019624, -2.21411   , -0.4687489 , ..., -0.0751646 ,
        -0.03750293,  0.53621134],
       [ 0.82789342, -0.89241183, -0.4687489 , ..., -0.0751646 ,
        -0.03750293,  0.53621134]])

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf=RandomForestClassifier()

param_dist=[{"n_estimators": [10,100,200,500],"max_depth": [None,5,10],"min_samples_split": [2,3,4]}]
grid_search=GridSearchCV(clf,param_dist,cv=3,scoring="accuracy",return_train_score=True)
grid_search.fit(X_data,y_data)

In [14]:
final_clf=grid_search.best_estimator_

In [15]:
strat_test_set=pipeline.fit_transform(strat_test_set)
strat_test_set

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Female,Male,Pclass1,Pclass2,Pclass3,E,b,C,B,A,F,D
462,0,1,47.000000,0,0,38.5000,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
746,0,3,16.000000,1,1,20.2500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
577,1,1,39.000000,1,0,55.9000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
601,0,3,30.137266,0,0,7.8958,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
297,0,1,2.000000,1,2,151.5500,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
719,0,3,33.000000,0,0,7.7750,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60,0,3,22.000000,0,0,7.2292,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
94,0,3,59.000000,0,0,7.2500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
859,0,3,30.137266,0,0,7.2292,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
strat_test_set["G"]=0
strat_test_set["T"]=0
strat_test_set

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Female,Male,Pclass1,Pclass2,Pclass3,E,b,C,B,A,F,D,Cabin8,Cabin9
462,0,1,47.000000,0,0,38.5000,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
746,0,3,16.000000,1,1,20.2500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
577,1,1,39.000000,1,0,55.9000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
601,0,3,30.137266,0,0,7.8958,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
297,0,1,2.000000,1,2,151.5500,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
719,0,3,33.000000,0,0,7.7750,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
60,0,3,22.000000,0,0,7.2292,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
94,0,3,59.000000,0,0,7.2500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
859,0,3,30.137266,0,0,7.2292,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0


In [17]:
X_test=strat_test_set.drop(["Survived"],axis=1)
y_test=strat_test_set["Survived"]

scaler=StandardScaler()
X_data_test=scaler.fit_transform(X_test)
y_data_test=y_test.to_numpy()

In [18]:
final_clf.score(X_data_test,y_data_test)

0.8324022346368715

In [19]:
titanic_train=pipeline.fit_transform(titanic_train)
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 20 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Female    891 non-null    float64
 7   Male      891 non-null    float64
 8   Pclass1   891 non-null    float64
 9   Pclass2   891 non-null    float64
 10  Pclass3   891 non-null    float64
 11  b         891 non-null    float64
 12  C         891 non-null    float64
 13  E         891 non-null    float64
 14  G         891 non-null    float64
 15  D         891 non-null    float64
 16  A         891 non-null    float64
 17  B         891 non-null    float64
 18  F         891 non-null    float64
 19  T         891 non-null    float64
dtypes: float64(16), int64(4)
memory 

In [20]:
X_titanic_train=titanic_train.drop(["Survived"],axis=1)
y_titanic_train=titanic_train["Survived"]

scaler=StandardScaler()
X_data_titanic=scaler.fit_transform(X_titanic_train)
y_data_titanic=y_titanic_train.to_numpy()

In [21]:
clf2=RandomForestClassifier()

param_dist=[{"n_estimators": [10,100,200,500],"max_depth": [None,5,10],"min_samples_split": [2,3,4]}]
grid_search=GridSearchCV(clf2,param_dist,cv=3,scoring="accuracy",return_train_score=True)
grid_search.fit(X_data_titanic,y_data_titanic)

In [22]:
final_clf2=grid_search.best_estimator_

In [23]:
final_test_data=pipeline.fit_transform(titanic_test)
final_test_data

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Female,Male,Pclass1,Pclass2,Pclass3,b,B,E,A,C,D,F,G
0,3,34.50000,0,0,7.8292,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,47.00000,1,0,7.0000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,62.00000,0,0,9.6875,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,27.00000,0,0,8.6625,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3,22.00000,1,1,12.2875,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,30.27259,0,0,8.0500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
414,1,39.00000,0,0,108.9000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
415,3,38.50000,0,0,7.2500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
416,3,30.27259,0,0,8.0500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [25]:
final_test_data["T"]=0
final_test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 19 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Pclass   418 non-null    int64  
 1   Age      418 non-null    float64
 2   SibSp    418 non-null    int64  
 3   Parch    418 non-null    int64  
 4   Fare     417 non-null    float64
 5   Female   418 non-null    float64
 6   Male     418 non-null    float64
 7   Pclass1  418 non-null    float64
 8   Pclass2  418 non-null    float64
 9   Pclass3  418 non-null    float64
 10  b        418 non-null    float64
 11  B        418 non-null    float64
 12  E        418 non-null    float64
 13  A        418 non-null    float64
 14  C        418 non-null    float64
 15  D        418 non-null    float64
 16  F        418 non-null    float64
 17  G        418 non-null    float64
 18  T        418 non-null    int64  
dtypes: float64(15), int64(4)
memory usage: 62.2 KB


In [28]:
X_final_test=final_test_data
X_final_test=X_final_test.fillna(method="ffill")
scaler=StandardScaler()
X_data_final_test=scaler.fit_transform(X_final_test)

In [31]:
prediction=final_clf2.predict(X_data_final_test)
prediction

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [33]:
final_df=pd.DataFrame(titanic_test["PassengerId"])
final_df["Survived"]=prediction
final_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [37]:
final_df.to_csv("result.csv",index=False)