In [4]:
#importing the library
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import plotly.express as ps
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [6]:
df=pd.read_csv('../input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv')
df

Unnamed: 0,PassengerId,Country,Firstname,Lastname,Sex,Age,Category,Survived
0,1,Sweden,ARVID KALLE,AADLI,M,62,P,0
1,2,Estonia,LEA,AALISTE,F,22,C,0
2,3,Estonia,AIRI,AAVASTE,F,21,C,0
3,4,Sweden,JURI,AAVIK,M,53,C,0
4,5,Sweden,BRITTA ELISABET,AHLSTROM,F,55,P,0
...,...,...,...,...,...,...,...,...
984,985,Sweden,ANNA INGRID BIRGITTA,OSTROM,F,60,P,0
985,986,Sweden,ELMAR MIKAEL,OUN,M,34,P,1
986,987,Sweden,ENN,QUNAPUU,M,77,P,0
987,988,Sweden,LY,GUNAPUU,F,87,P,0


In [7]:
#getting information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 989 entries, 0 to 988
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  989 non-null    int64 
 1   Country      989 non-null    object
 2   Firstname    989 non-null    object
 3   Lastname     989 non-null    object
 4   Sex          989 non-null    object
 5   Age          989 non-null    int64 
 6   Category     989 non-null    object
 7   Survived     989 non-null    int64 
dtypes: int64(3), object(5)
memory usage: 61.9+ KB


# Preprocessing

In [15]:
#dropping the passenger id
df=df.drop('PassengerId',axis=1)


Encoding

In [10]:
#checking the unique values in category
df['Category'].unique()

array(['P', 'C'], dtype=object)

In [11]:
#checking the unique values in country column
df['Country'].unique()

array(['Sweden', 'Estonia', 'Latvia', 'Russia', 'Germany', 'Finland',
       'Great Britain', 'Morocco', 'Denmark', 'France', 'Netherlands',
       'Norway', 'Lithuania', 'Nigeria', 'Canada', 'Belarus'],
      dtype=object)

In [13]:
#getting only last name first letter
df['Lastname']=df['Lastname'].apply(lambda x:x[0])

In [None]:
#dropping the  first name column
df=df.drop('Firstname',axis=1)

In [18]:
df

Unnamed: 0,Country,Lastname,Sex,Age,Category,Survived
0,Sweden,A,M,62,P,0
1,Estonia,A,F,22,C,0
2,Estonia,A,F,21,C,0
3,Sweden,A,M,53,C,0
4,Sweden,A,F,55,P,0
...,...,...,...,...,...,...
984,Sweden,O,F,60,P,0
985,Sweden,O,M,34,P,1
986,Sweden,Q,M,77,P,0
987,Sweden,G,F,87,P,0


In [31]:
def binary_encode(df,column,positive_value):
    df=df.copy()
    df[column]=df[column].apply(lambda x:1 if x==positive_value else 0)
    return df
    
    
    
def ordinal_encode(df,column,ordering):
    df=df.copy()
    df[column]=df[column].apply(lambda x: ordering.index(x))
    return df
    
    
def onehot_encode(df,column):
    df=df.copy()
    dummies=pd.get_dummies(df[column])
    df=pd.concat([df,dummies],axis=1)
    df=df.drop(column,axis=1)
    return df
    

In [32]:
alphabet_order='ABCDEFGHIJKLMNOPQRSTUVWXYZ'


In [34]:
#appling the functio to the data
df=binary_encode(df,'Sex','M')
df=binary_encode(df,'Category','P')
df=ordinal_encode(df,'Lastname',alphabet_order)
df=onehot_encode(df,'Country')

In [39]:
df

Unnamed: 0,Lastname,Sex,Age,Category,Survived,Belarus,Canada,Denmark,Estonia,Finland,...,Germany,Great Britain,Latvia,Lithuania,Morocco,Netherlands,Nigeria,Norway,Russia,Sweden
0,0,0,62,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,22,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,21,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,53,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,55,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,14,0,60,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
985,14,0,34,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
986,16,0,77,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
987,6,0,87,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Spliting and Scaling

In [41]:
y=df['Survived']
x=df.drop('Survived',axis=1)

In [42]:
#scaling the data betwen 0 and 1
scaler=MinMaxScaler()
x=scaler.fit_transform(x)

Spliting and 

In [43]:
#spliting the data
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)


# Training the DAta

In [50]:
log_model=LogisticRegression()
svm_model=SVC(C=1.0)
ann_model=MLPClassifier(hidden_layer_sizes=(16,16))


In [51]:
#training the model
log_model.fit(x_train,y_train)
svm_model.fit(x_train,y_train)
ann_model.fit(x_train,y_train)



MLPClassifier(hidden_layer_sizes=(16, 16))

In [52]:
#checking the accuray of the model
print(f'Log_Model accuray {log_model.score(x_test,y_test)}')
print(f'SVM_Model_accuracy{svm_model.score(x_test,y_test)}')
print(f'Ann_model accuray{ann_model.score(x_test,y_test)}')

Log_Model accuray 0.8653198653198653
SVM_Model_accuracy0.8653198653198653
Ann_model accuray0.8619528619528619


In [53]:
log_acc=log_model.score(x_test,y_test)
svm_acc=svm_model.score(x_test,y_test)
ann_acc=ann_model.score(x_test,y_test)

In [56]:
#ploting the accuracy
acc_fig=ps.bar(
    x=['Logistic Regression','Support Vector Machine','Neural Network'],
    y=[log_acc,svm_acc,ann_acc],
    labels={'x':'Model','y':'Accuracy'},
    color=['Logistic Regression','Support Vector Machine','Neutral Network'],
    title='Model Accuracy')
acc_fig.show()