In [76]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [77]:
df = sns.load_dataset('titanic')

In [78]:
df.shape

(891, 15)

In [79]:
df = df.drop(columns=['deck', 'alive'])

In [80]:
df.isnull().sum()[df.isnull().sum() > 0]

age            177
embarked         2
embark_town      2
dtype: int64

In [81]:
df[['age', 'embarked', 'embark_town']].head()

Unnamed: 0,age,embarked,embark_town
0,22.0,S,Southampton
1,38.0,C,Cherbourg
2,26.0,S,Southampton
3,35.0,S,Southampton
4,35.0,S,Southampton


In [82]:
df.sample()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alone
551,0,2,male,27.0,0,0,26.0,S,Second,man,True,Southampton,True


In [83]:
x = df.drop(columns=['survived'])
y = df['survived']

In [84]:
x.shape, y.shape

((891, 12), (891,))

In [85]:
imputer = SimpleImputer(strategy='most_frequent')

pipe1 = Pipeline(steps=[
    ('numeric', SimpleImputer(strategy='mean'))
])

pipe2 = Pipeline(steps=[
    ('category', SimpleImputer(strategy='most_frequent'))
])


numerical_features = ['age']
categorical_features = ['embarked','embark_town']

transformer1 = ColumnTransformer(transformers=[
    ('numerical', pipe1, numerical_features),
    ('categorical', pipe2, categorical_features)
])

In [86]:
pipe3 = Pipeline(steps=[
    ('encoding', OneHotEncoder(handle_unknown='ignore'))
])

encoding_labels = ['sex', 'embarked', 'class', 'who', 'embark_town', 'alone'] 

transformer2 = ColumnTransformer(transformers=[
    ('encode', pipe3, encoding_labels)
])

In [87]:
pipe = Pipeline(steps=[
    ('t1', SimpleImputer(strategy='most_frequent')),
    # ('t2', SimpleImputer(strategy='most_frequent')),
    ('t3', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ('scaling', StandardScaler()),
    ('classification', LogisticRegression(max_iter=1000))
])

In [88]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [89]:
pipe.fit(x_train, y_train)

In [90]:
y_pred = pipe.predict(x_test)

In [91]:
print('Accuracy Score:', accuracy_score(y_test, y_pred))

Accuracy Score: 0.7430167597765364
