In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [3]:
input_dir = Path('./data')
test_csv = input_dir / 'test.csv'
train_csv = input_dir / 'train.csv'

In [4]:
test_df = pd.read_csv(test_csv)
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [28]:
# Perished: death = 1, arrive = 0
train_df = pd.read_csv(train_csv)
train_df

Unnamed: 0,PassengerId,Perished,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,1,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,1,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,0,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,1,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,0,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [29]:
# Sex: male = 1, Female = 0
train_df['Sex_num'] = [1 if i == 'male' else 0 for i in train_df['Sex']]

# Embarked: S = 2 C = 1 Q = 0, NaN = 5
temp = train_df['Embarked']
temp_list = []
for i in temp:
    if i == 'S':
        temp_list.append(2)
    elif i == 'C':
        temp_list.append(1)
    elif i == 'Q':
        temp_list.append(0)
    else:
        temp_list.append(2)
train_df['Embarked_num'] = temp_list

In [30]:
# if Age == NaN -> Age = age_ave
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean()) 

In [31]:
test_num = 50
idx = np.random.default_rng().permutation(len(train_df))
print(train_df.isnull().any())

PassengerId     False
Perished        False
Pclass          False
Name            False
Sex             False
Age             False
SibSp           False
Parch           False
Ticket          False
Fare            False
Cabin            True
Embarked         True
Sex_num         False
Embarked_num    False
dtype: bool


In [44]:
x = train_df[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_num', 'Embarked_num']].values
y = train_df['Perished'].values

# x = MinMaxScaler().fit_transform(x)
scaler = MinMaxScaler()
scaler.fit(x)
scaler.data_range_

array([  2.    ,  79.58  ,   8.    ,   6.    , 512.3292,   1.    ,
         2.    ])

In [33]:
X_test, X_train = x[idx[-test_num:]], x[idx[:-test_num]]
Y_test, Y_train = y[idx[-test_num:]], y[idx[:-test_num]]

In [34]:
# model = LinearSVC(max_iter=10000000)
model = LogisticRegression(max_iter=1000000)

In [35]:
model.fit(X_train, Y_train)

LogisticRegression(max_iter=1000000)

In [36]:
model.coef_

array([[ 2.0060412 ,  2.1582367 ,  1.43472421,  0.42465536, -0.57966336,
         2.48693689,  0.52082721]])

In [37]:
y_pred = model.predict(X_test)

TP = 0
TN = 0
FP = 0
FN = 0
for pred, target in zip(y_pred, Y_test):
    if pred == 1:
        if target == 1:
            TP += 1
        elif target == 0:
            FP += 1
        else:
            raise ValueError
    elif pred == 0:
        if target == 1:
            FN += 1
        elif target == 0:
            TN += 1
        else:
            raise ValueError
    else:
        raise ValueError
        
recall = TP / (TP + FN)
precision = TP / (TP + FP)
accuracy = (TP + TN) / (TP + TN + FP + FN)
f1_score = 2 * recall * precision / (recall + precision)

In [38]:
print(f'TP: {TP}')
print(f'FP: {FP}')
print(f'FN: {FN}')
print(f'TN: {TN}')
print(f'recall: {recall}')
print(f'precision: {precision}')
print(f'accuracy: {accuracy}')
print(f'f1_score: {f1_score}')

TP: 28
FP: 2
FN: 4
TN: 16
recall: 0.875
precision: 0.9333333333333333
accuracy: 0.88
f1_score: 0.9032258064516129


In [None]:
sns.pairplot(data=train_df, hue='Perished')