# Train logistic regression model

In [1]:
import pandas as pd
import numpy as np

In [2]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [3]:
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url) #training set

In [4]:
train["Age"] = train["Age"].fillna(train["Age"].median())

train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])

train.loc[train["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
train["Sex"] = train["Sex"].fillna(train["Sex"].mode()[0])

In [5]:
x = np.array(train[["Pclass","Sex","Age","Embarked"]].values, float)
x = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)

y_true = np.array(train[["Survived"]], float)

theta = np.random.rand(x.shape[1], 1)

In [6]:
for i in range (1, 2001):
    y_pred = sigmoid(np.dot(x, theta))
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    
    theta += 1e-5 * np.dot(x.T, y_true - y_pred)
    
    if i % 200 == 0:
        loss = (-(np.dot(y_true.T, np.log(y_pred)) + np.dot((1 - y_true).T, np.log(1 - y_pred))) / x.shape[0])
        acc = np.sum(y_true == (y_pred >= 0.5)) / x.shape[0] * 100
        
        print("Round", i, "\tLoss: %.4f" % loss[0, 0], "\tAccuracy: %.2f" % acc, "%")

Round 200 	Loss: 0.5586 	Accuracy: 75.98 %
Round 400 	Loss: 0.5213 	Accuracy: 79.01 %
Round 600 	Loss: 0.5076 	Accuracy: 80.47 %
Round 800 	Loss: 0.4992 	Accuracy: 81.26 %
Round 1000 	Loss: 0.4928 	Accuracy: 81.26 %
Round 1200 	Loss: 0.4875 	Accuracy: 81.03 %
Round 1400 	Loss: 0.4831 	Accuracy: 81.14 %
Round 1600 	Loss: 0.4793 	Accuracy: 80.47 %
Round 1800 	Loss: 0.4761 	Accuracy: 80.36 %
Round 2000 	Loss: 0.4733 	Accuracy: 80.47 %


In [7]:
print("Accuracy for train data: %.2f" % acc, "%")

Accuracy for train data: 80.47 %


# Test logictic regression model

In [8]:
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url) #test set

In [9]:
test["Age"] = test["Age"].fillna(test["Age"].median())

test.loc[test["Embarked"] == "S", "Embarked"] = 0
test.loc[test["Embarked"] == "C", "Embarked"] = 1
test.loc[test["Embarked"] == "Q", "Embarked"] = 2
test["Embarked"] = test["Embarked"].fillna(test["Embarked"].mode())

test.loc[test["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "female", "Sex"] = 1
test["Sex"] = test["Sex"].fillna(test["Sex"].mode())

In [10]:
test_data = np.array(test[["Pclass","Sex","Age","Embarked"]].values, float)
test_data = np.concatenate((np.ones((test_data.shape[0], 1)), test_data), axis=1)

In [11]:
test_result = np.array(sigmoid(np.dot(test_data, theta)) >= 0.5, int)

df = pd.DataFrame()
df["PassengerId"] = test["PassengerId"]
df["Survived"] = test_result

df.to_csv("normal_logis_result.csv", index=False)

# Train linear regression model by gradient descend

In [12]:
theta = np.random.rand(x.shape[1], 1)
theta

array([[0.79125392],
       [0.18907719],
       [0.67113994],
       [0.68848508],
       [0.92608832]])

In [13]:
for i in range (1, 200001):
    y_pred = np.dot(x, theta)
    
    theta += 2e-6 * np.dot(x.T, y_true - y_pred)
    
    if i % 25000 == 0:
        loss = (np.sum((y_true - y_pred) ** 2)) / (2 * x.shape[0])
        acc = np.sum(y_true == np.round(y_pred)) / x.shape[0] * 100
        
        print("Round", i, "\tLoss: %.4f" % loss, "\tAccuracy: %.2f" % acc, "%")

Round 25000 	Loss: 0.0725 	Accuracy: 79.46 %
Round 50000 	Loss: 0.0725 	Accuracy: 79.80 %
Round 75000 	Loss: 0.0725 	Accuracy: 79.80 %
Round 100000 	Loss: 0.0725 	Accuracy: 79.91 %
Round 125000 	Loss: 0.0725 	Accuracy: 79.91 %
Round 150000 	Loss: 0.0725 	Accuracy: 79.91 %
Round 175000 	Loss: 0.0725 	Accuracy: 79.91 %
Round 200000 	Loss: 0.0725 	Accuracy: 79.91 %


In [14]:
print("Theta: ", theta.T)
print("Accuracy for train data: %.2f" % acc, "%")
print("MSE: ", np.sum((y_true - y_pred) ** 2))

Theta:  [[ 0.77654437 -0.18843943  0.49086712 -0.00505436  0.04911346]]
Accuracy for train data: 79.91 %
MSE:  129.12883225626126


# Train linear regression model by matrix inversion

In [15]:
theta = np.linalg.pinv(x.T.dot(x)).dot(x.T).dot(y_true)

In [16]:
y_pred = np.dot(x, theta)
acc = np.sum(y_true == np.round(y_pred)) / x.shape[0] * 100

In [17]:
print("Theta: ", theta.T)
print("Accuracy for train data: %.2f" % acc, "%")
print("MSE: ", np.sum((y_true - y_pred) ** 2))

Theta:  [[ 0.77654442 -0.18843944  0.49086711 -0.00505436  0.04911346]]
Accuracy for train data: 79.91 %
MSE:  129.12883225626115


# Add x^2 features (use logistic regression)

In [34]:
x = np.array(train[["Pclass","Sex","Age","Embarked"]].values, int)
x = np.concatenate((np.ones((x.shape[0], 1)), x, x*x), axis=1)

y_true = np.array(train[["Survived"]], int)

theta = np.random.rand(x.shape[1], 1)
print(x[2])

[  1.   3.   1.  26.   0.   9.   1. 676.   0.]


In [19]:
for i in range (1, 500001):
    y_pred = sigmoid(np.dot(x, theta))
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    
    theta += 6e-9 * np.dot(x.T, y_true - y_pred)
    
    if i % 50000 == 0:
        loss = (-(np.dot(y_true.T, np.log(y_pred)) + np.dot((1 - y_true).T, np.log(1 - y_pred))) / x.shape[0])
        acc = np.sum(y_true == (y_pred >= 0.5)) / x.shape[0] * 100
        
        print("Round", i, "\tLoss: %.4f" % loss[0, 0], "\tAccuracy: %.2f" % acc, "%")

Round 50000 	Loss: 0.7044 	Accuracy: 65.21 %
Round 100000 	Loss: 0.5880 	Accuracy: 68.80 %
Round 150000 	Loss: 0.5510 	Accuracy: 72.50 %
Round 200000 	Loss: 0.5356 	Accuracy: 72.50 %
Round 250000 	Loss: 0.5268 	Accuracy: 74.07 %
Round 300000 	Loss: 0.5202 	Accuracy: 74.75 %
Round 350000 	Loss: 0.5147 	Accuracy: 75.65 %
Round 400000 	Loss: 0.5099 	Accuracy: 75.98 %
Round 450000 	Loss: 0.5056 	Accuracy: 78.56 %
Round 500000 	Loss: 0.5017 	Accuracy: 79.01 %


In [20]:
print("Accuracy for train data: %.2f" % acc, "%")

Accuracy for train data: 79.01 %


In [21]:
test_data = np.array(test[["Pclass","Sex","Age","Embarked"]].values, float)
test_data = np.concatenate((np.ones((test_data.shape[0], 1)), test_data, test_data*test_data), axis=1)

In [22]:
test_result = np.array(sigmoid(np.dot(test_data, theta)) >= 0.5, int)

df = pd.DataFrame()
df["PassengerId"] = test["PassengerId"]
df["Survived"] = test_result

df.to_csv("added_logis_result.csv", index=False)

# Reduce features to just Sex and Age (use logistic regression)

In [23]:
x = np.array(train[["Sex","Age"]].values, float)
x = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)

y_true = np.array(train[["Survived"]], float)

theta = np.random.rand(x.shape[1], 1)

In [24]:
for i in range (1, 2001):
    y_pred = sigmoid(np.dot(x, theta))
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    
    theta += 1e-5 * np.dot(x.T, y_true - y_pred)
    
    if i % 200 == 0:
        loss = (-(np.dot(y_true.T, np.log(y_pred)) + np.dot((1 - y_true).T, np.log(1 - y_pred))) / x.shape[0])
        acc = np.sum(y_true == (y_pred >= 0.5)) / x.shape[0] * 100
        
        print("Round", i, "\tLoss: %.4f" % loss[0, 0], "\tAccuracy: %.2f" % acc, "%")

Round 200 	Loss: 0.5965 	Accuracy: 72.62 %
Round 400 	Loss: 0.5864 	Accuracy: 74.07 %
Round 600 	Loss: 0.5777 	Accuracy: 75.53 %
Round 800 	Loss: 0.5703 	Accuracy: 76.21 %
Round 1000 	Loss: 0.5639 	Accuracy: 76.43 %
Round 1200 	Loss: 0.5585 	Accuracy: 76.09 %
Round 1400 	Loss: 0.5537 	Accuracy: 76.21 %
Round 1600 	Loss: 0.5495 	Accuracy: 76.21 %
Round 1800 	Loss: 0.5459 	Accuracy: 76.66 %
Round 2000 	Loss: 0.5427 	Accuracy: 76.99 %


In [25]:
print("Accuracy for train data: %.2f" % acc, "%")

Accuracy for train data: 76.99 %
