In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [72]:
df = pd.read_csv('/content/drive/MyDrive/Freshmen/Statistical Programming/1.สถิติ/titanic.csv')
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'])
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
df.dropna(subset=['Age'], inplace=True)
print(df.isna().sum()) # 0

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64


In [74]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [75]:
X = df.drop(columns=['Survived'])
y = df['Survived']

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [79]:
# convert dataframe to numpy
X_train = np.array(X_train)
X_test = np.array(X_test)

y_train = np.array(y_train)
y_test = np.array(y_test)

In [91]:
print('Shape of X_train: ', X_train.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of X_test: ', X_test.shape)
print('Shape of y_test: ', y_test.shape)

X_train = X_train.T
y_train = y_train.reshape(1, -1)

X_test = X_test.T
y_test = y_test.reshape(1, -1)

print('After reshape')
print('Shape of X_train: ', X_train.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of X_test: ', X_test.shape)
print('Shape of y_test: ', y_test.shape)

Shape of X_train:  (478, 6)
Shape of y_train:  (1, 478)
Shape of X_test:  (236, 6)
Shape of y_test:  (1, 236)
After reshape
Shape of X_train:  (6, 478)
Shape of y_train:  (1, 478)
Shape of X_test:  (6, 236)
Shape of y_test:  (1, 236)


In [101]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [108]:
def model(X, y, learning_rate, iterations):
    m = X_train.shape[1]
    n = X_train.shape[0]

    W = np.zeros((n, 1))
    B = 0

    cost_list = []

    for i in range(iterations):
        Z = np.dot(W.T, X) + B
        A = sigmoid(Z)

        cost = -(1/m) * np.sum(y*np.log(A) + (1-y)*np.log(1-A))

        dW = (1/m) * np.dot(A-y, X.T)
        dB = (1/m) * np.sum(A-y)

        W = W - learning_rate * dW.T
        B = B - learning_rate * dB

        cost_list.append(cost)

        if (i % (iterations / 10) == 0):
            print('cost after ', i, 'iteration is: ', cost)

    return W, B, cost_list

In [129]:
iterations = 500000
learning_rate = 0.005
W, B, cost_list = model(X_train, y_train, learning_rate = learning_rate, iterations = iterations)

cost after  0 iteration is:  0.6931471805599452
cost after  50000 iteration is:  0.46344022254548756
cost after  100000 iteration is:  0.457179117818427
cost after  150000 iteration is:  0.45598098018138483
cost after  200000 iteration is:  0.4557066956171745
cost after  250000 iteration is:  0.4556309359806352
cost after  300000 iteration is:  0.45560596629732253
cost after  350000 iteration is:  0.4555965485999611
cost after  400000 iteration is:  0.4555926892223302
cost after  450000 iteration is:  0.4555910367695787


In [127]:
def accuracy(X, y, W, B):
    Z = np.dot(W.T, X) + B
    A = sigmoid(Z)
    A = A > 0.5
    A = np.array(A, dtype='int64')
    acc = (1 - np.sum(np.absolute(A - y)) / y.shape[1]) * 100
    print('Accuracy of model is: ', acc, '%')

In [130]:
accuracy(X_test, y_test, W, B)

Accuracy of model is:  80.08474576271186 %


In [202]:
def predict(X, W, B):
    Z = np.dot(W.T, X) + B
    A = sigmoid(Z)
    predict = (A > 0.5).astype(int)
    return predict

In [203]:
predict = predict(X_test, W, B)
print(predict)

[[1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 1
  0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 0
  0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 0 1 1 1 0 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
  0 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 1 0 1 1 0 0 0
  0 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 1 1 1 1]]


In [204]:
print('prediction')
print('Dead', (predict == 0).sum())
print('Survived', (predict == 1).sum())

prediction
Dead 169
Survived 67


In [216]:
def predict(X, W, B):
    Z = np.dot(W.T, X) + B
    A = sigmoid(Z)
    predict = (A > 0.5).astype(int)
    return predict

# Assume a person
person = {'Pclass': [3], 'Sex': [1], 'Age': [40], 'SibSp': [1], 'Parch': [0], 'Fare': [7.250]}
kid = {'Pclass': [3], 'Sex': [0], 'Age': [10], 'SibSp': [1], 'Parch': [0], 'Fare': [71.250]}

test_data = pd.DataFrame(data=kid) # survived
# test_data = pd.DataFrame(data=person) # dead

test_data_array = np.array(test_data).T
test_data_array

result = predict(test_data_array, W, B)
print(result) # dead

[[1]]
