In [239]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(context='notebook', style='darkgrid',palette='dark', font_scale=1.2)
%matplotlib inline

In [240]:
# Load the labelled training data and preview a handful of random rows.
train = pd.read_csv("train.csv")
train.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
114,115,0,3,"Attalah, Miss. Malake",female,17.0,0,0,2627,14.4583,,C
678,679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43.0,1,6,CA 2144,46.9,,S
559,560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36.0,1,0,345572,17.4,,S
144,145,0,2,"Andrew, Mr. Edgardo Samuel",male,18.0,0,0,231945,11.5,,S
244,245,0,3,"Attalah, Mr. Sleiman",male,30.0,0,0,2694,7.225,,C


In [241]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer,StandardScaler,OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn import set_config
# Ask scikit-learn to display estimators (e.g. the pipeline shown at the end of a
# later cell) using its "diagram" representation.
set_config(display="diagram")

In [242]:
# Separate the target column from the feature columns.
y = train["Survived"]
X = train.drop(columns=["Survived"])

In [243]:
# Spot-check a few random target values.
y.sample(n=5)

765    1
88     1
761    0
412    1
406    0
Name: Survived, dtype: int64

In [244]:
# Spot-check a few random feature rows (the target column is no longer present).
X.sample(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
18,19,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,,S
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
617,618,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26.0,1,0,A/5. 3336,16.1,,S
154,155,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S
235,236,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S


--Model Building--

In [245]:
# Drop the columns that are not used as model features. PassengerId, Name,
# Ticket, SibSp and Parch are dropped because, according to the author's
# analysis, they don't affect survival; Cabin is dropped as well.
# Columns are selected by name instead of by position, so this step keeps
# dropping the right columns even if the CSV column order changes. It therefore
# expects a pandas DataFrame as input.
# The remaining columns pass through in their original order:
# Pclass, Sex, Age, Fare, Embarked.
ct1=ColumnTransformer([
    ("drop-unwanted-cols","drop",["PassengerId","Name","SibSp","Parch","Ticket","Cabin"])
],remainder="passthrough")

In [246]:
# Fill in missing values.
# Positions refer to the array produced by the column-dropping step, which
# (given the train.csv column layout previewed above) is
# [Pclass, Sex, Age, Fare, Embarked]: 2 and 3 are Age and Fare, -1 is Embarked.
embarkImputer = SimpleImputer(strategy="most_frequent")  # most common value
ageAndFareImputer = SimpleImputer(strategy="median")     # column median

ct2 = ColumnTransformer(
    transformers=[
        ("imputing-age-and-fare", ageAndFareImputer, [2, 3]),
        ("embark-imputation", embarkImputer, [-1]),
    ],
    remainder="passthrough",
)

In [247]:
# One-hot encode the categorical columns.
# ColumnTransformer emits the transformed columns first and the passthrough
# columns after them, so the imputation step outputs
# [Age, Fare, Embarked, Pclass, Sex]: positions 2 and 4 are Embarked and Sex.
# drop="first" removes the first category of each feature;
# sparse_output=False makes the encoder return a dense array.
ohe = OneHotEncoder(drop="first", sparse_output=False)

ct3 = ColumnTransformer(
    transformers=[("encode-categories", ohe, [2, 4])],
    remainder="passthrough",
)

In [248]:
# Scaling features: slice(0, None) selects every column, so all of them
# (including the one-hot columns) are standardised. The step name's spelling
# is kept as-is because it is part of the fitted object's parameter names.
scaler = StandardScaler()

ct4 = ColumnTransformer(
    transformers=[("scalling", scaler, slice(0, None))],
    remainder="passthrough",
)

In [249]:
# ----- Modelling -----
#
# Classifiers that were tried. The trailing numbers are the scores noted by the
# author (presumably mean cross-validation accuracy in percent; the SVC figure
# matches the 10-fold mean printed later in this notebook).
#
# clf = LogisticRegression()
# clf = RandomForestClassifier()   # 81.82
# clf = DecisionTreeClassifier()   # 79.8
clf = SVC()  # 82.37

In [250]:
# Chain the four preprocessing stages and the classifier into one estimator,
# so that a single fit/predict call runs them in order.
pipe = Pipeline(
    steps=[
        ("drop-unwanted-cols", ct1),
        ("imputing-missing-vals", ct2),
        ("encode-categories", ct3),
        ("scaling", ct4),
        ("classifier", clf),
    ]
)

# Display the assembled pipeline.
pipe

In [251]:
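# To inspect the preprocessed feature matrix, run only the transformer steps:
# the final classifier step has no `transform`, so `pipe.fit_transform(X)` is
# not available on the full pipeline.
# x_trans=pipe[:-1].fit_transform(X)
# df=pd.DataFrame(x_trans)
# df.sample(5)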

In [252]:
# Fit the pipeline on the full training set, then predict those same rows.
# These are in-sample predictions, so any metric computed from y_pred is an
# optimistic (training-set) figure.
y_pred = pipe.fit(X, y).predict(X)

y_pred

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,

In [253]:
# Accuracy of the in-sample predictions: y_pred was produced on the same rows
# the pipeline was fitted on, so this is training accuracy rather than an
# estimate of performance on unseen data.
# (accuracy_score is imported with the other scikit-learn imports at the top.)
model_accuracy = accuracy_score(y,y_pred)
print(f"Models-accuracy:{model_accuracy}")

Models-accuracy:0.8282828282828283


In [254]:
# 10-fold cross-validation of the whole pipeline (preprocessing + classifier).
# Because the pipeline itself is the estimator, each fold's imputation,
# encoding and scaling are fitted as part of that fold's fit call.
# (cross_val_score is imported with the other scikit-learn imports at the top.)
scores=cross_val_score(pipe,X,y,cv=10)
print(f"Scores:{scores}")
print(f"Scores-Mean:{scores.mean()}")

Scores:[0.83333333 0.80898876 0.78651685 0.87640449 0.87640449 0.80898876
 0.80898876 0.7752809  0.85393258 0.80898876]
Scores-Mean:0.8237827715355805


In [255]:
# Per-class precision / recall / F1 for the in-sample predictions in y_pred.
my_report = classification_report(y_true=y, y_pred=y_pred)
print(f"My-Report:\n{my_report}")

My-Report:
              precision    recall  f1-score   support

           0       0.80      0.95      0.87       549
           1       0.89      0.63      0.74       342

    accuracy                           0.83       891
   macro avg       0.85      0.79      0.80       891
weighted avg       0.84      0.83      0.82       891



In [256]:
# Load the test set, append the fitted pipeline's predictions as a new
# "Survived" column, and preview a few random rows.
test = pd.read_csv("test.csv")
test = test.assign(Survived=pipe.predict(test))
test.sample(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
236,1128,1,"Warren, Mr. Frank Manley",male,64.0,1,0,110813,75.25,D37,C,0
122,1014,1,"Schabert, Mrs. Paul (Emma Mock)",female,35.0,1,0,13236,57.75,C28,C,1
56,948,3,"Cor, Mr. Bartol",male,35.0,0,0,349230,7.8958,,S,0
88,980,3,"O'Donoghue, Ms. Bridget",female,,0,0,364856,7.75,,Q,1
77,969,1,"Cornell, Mrs. Robert Clifford (Malvina Helen L...",female,55.0,2,0,11770,25.7,C101,S,1


## Creating Result for `test.csv`

In [259]:
# Write only the passenger id and the predicted label, without the row index.
submission = test[["PassengerId", "Survived"]]
submission.to_csv("Myres.csv", index=False)