In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
# Show every column when a DataFrame is rendered. The fully qualified key is
# required: the short form 'max_columns' is ambiguous on pandas >= 1.4 (it also
# matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', None)

from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Read both raw CSV files from the current working directory in one pass.
train0, test0 = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [7]:
# Display the raw training frame to inspect its columns and a sample of rows.
train0

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [8]:
# Keep the test split as an independent copy so later steps never touch test0.
test1 = test0.copy()

# Separate the label from the training features.
train1 = train0.drop(columns=['Survived'])
target = train0['Survived']

In [9]:
# Columns removed from both splits before modelling.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
train2, test2 = (frame.drop(columns=unused_columns) for frame in (train1, test1))


In [10]:
# Count missing values per remaining column in each split (train first, then test).
train2.isna().sum(), test2.isna().sum()

(Pclass      0
 Sex         0
 Age       177
 SibSp       0
 Parch       0
 Fare        0
 dtype: int64,
 Pclass     0
 Sex        0
 Age       86
 SibSp      0
 Parch      0
 Fare       1
 dtype: int64)

In [11]:

# Fill the missing values (Age in both splits, a single Fare in test) with a
# KNN imputer that is fitted on the training split only and then reused for test.
knn = KNNImputer()
test3 = test2.copy()
train3 = train2.copy()

# One-hot encode the categorical column(s) of the training split.
train4 = pd.get_dummies(train3)
# Encode the test split separately, then force it onto the training column
# layout: a category level absent from test becomes an all-zero column and a
# level seen only in test is dropped. Without this, knn.transform would fail
# (or silently misalign features) whenever the two splits disagree on levels.
test4 = pd.get_dummies(test3).reindex(columns=train4.columns, fill_value=0)

transformed_train = knn.fit_transform(train4)
transformed_test = knn.transform(test4)

# The imputer returns bare arrays; restore the column labels and row index.
train5 = pd.DataFrame(transformed_train, columns=train4.columns, index=train4.index)
test5 = pd.DataFrame(transformed_test, columns=test4.columns, index=test4.index)

In [12]:
# Re-count missing values after imputation to confirm none remain in either split.
train5.isna().sum(), test5.isna().sum()

(Pclass        0
 Age           0
 SibSp         0
 Parch         0
 Fare          0
 Sex_female    0
 Sex_male      0
 dtype: int64,
 Pclass        0
 Age           0
 SibSp         0
 Parch         0
 Fare          0
 Sex_female    0
 Sex_male      0
 dtype: int64)

In [13]:
def _collapse_sex_dummies(frame):
    """Return a new frame where the two Sex dummies are replaced by one `Sex` column.

    `Sex` carries the values of `Sex_male` (1.0 for male, 0.0 for female) and
    is appended as the last column; the input frame is left unmodified.
    """
    without_dummies = frame.drop(columns=['Sex_male', 'Sex_female'])
    return without_dummies.assign(Sex=frame['Sex_male'])


train6 = _collapse_sex_dummies(train5)
test6 = _collapse_sex_dummies(test5)

In [14]:
# Fare is excluded from the final feature set of both splits.
train_final, test_final = (frame.drop(columns='Fare') for frame in (train6, test6))

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_final,
    target,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Display the final feature matrix that feeds the train/validation split.
train_final

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex
0,3.0,22.0,1.0,0.0,1.0
1,1.0,38.0,1.0,0.0,0.0
2,3.0,26.0,0.0,0.0,0.0
3,1.0,35.0,1.0,0.0,0.0
4,3.0,35.0,0.0,0.0,1.0
...,...,...,...,...,...
886,2.0,27.0,0.0,0.0,1.0
887,1.0,19.0,0.0,0.0,0.0
888,3.0,20.4,1.0,2.0,0.0
889,1.0,26.0,0.0,0.0,1.0


In [18]:
# Candidate classifiers, keyed by a short label used when reporting scores.
# The tree-based estimators are randomized, so they are seeded to make the
# reported accuracies reproducible on Restart & Run All.
# NOTE(review): SVC and KNeighborsClassifier are distance/kernel based and are
# fitted on unscaled features here; consider standardizing inputs for them.
models = {
    "svc": SVC(),
    "gnb": GaussianNB(),
    "dtc": DecisionTreeClassifier(random_state=42),
    "knc": KNeighborsClassifier(),
    "lr": LogisticRegression(max_iter=200),
    "lda": LinearDiscriminantAnalysis(),
    "rfc": RandomForestClassifier(random_state=42),
}

In [19]:
# Fit each candidate on the training portion and report its accuracy on the
# held-out portion, one line per model.
for name in models:
    # fit() returns the estimator itself, so `model` is the fitted entry of `models`.
    model = models[name].fit(X_train, y_train)
    pred = model.predict(X_test)
    result = accuracy_score(y_true=y_test, y_pred=pred)
    print(f'{name}: {result}')

svc: 0.6089385474860335
gnb: 0.7597765363128491
dtc: 0.7653631284916201
knc: 0.8044692737430168
lr: 0.8212290502793296
lda: 0.7877094972067039
rfc: 0.8156424581005587
