In [110]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Fetch version 1 of the OpenML "titanic" dataset as pandas objects.
data = fetch_openml(name="titanic", version=1, as_frame=True)

# Feature table, restricted to the columns listed in `feature_names`.
titanic = pd.DataFrame(data=data["data"], columns=data["feature_names"])
# Wrap the target in its own DataFrame.
titanic_target = pd.DataFrame(data=data["target"])
titanic.head()


Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [111]:
# Peek at the first five rows of the target frame.
titanic_target.head(n=5)

Unnamed: 0,survived
0,1
1,1
2,0
3,0
4,0


In [112]:
# Drop columns that are not used as model features.
# Rebinding `titanic` (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it no longer raises KeyError for
# columns that were already removed by a previous run.
UNUSED_COLUMNS = ["name", "ticket", "cabin", "embarked", "boat", "body", "home.dest"]
titanic = titanic.drop(columns=UNUSED_COLUMNS, errors="ignore")
titanic


Unnamed: 0,pclass,sex,age,sibsp,parch,fare
0,1,female,29.0000,0,0,211.3375
1,1,male,0.9167,1,2,151.5500
2,1,female,2.0000,1,2,151.5500
3,1,male,30.0000,1,2,151.5500
4,1,female,25.0000,1,2,151.5500
...,...,...,...,...,...,...
1304,3,female,14.5000,1,0,14.4542
1305,3,female,,1,0,14.4542
1306,3,male,26.5000,0,0,7.2250
1307,3,male,27.0000,0,0,7.2250


In [113]:
# Encode sex as a boolean indicator column: True where sex is 'male'.
is_male = titanic['sex'].eq('male')
titanic['male'] = is_male
titanic.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,male
0,1,female,29.0,0,0,211.3375,False
1,1,male,0.9167,1,2,151.55,True
2,1,female,2.0,1,2,151.55,False
3,1,male,30.0,1,2,151.55,True
4,1,female,25.0,1,2,151.55,False


In [114]:
# Place features and target side by side (column-wise concat, aligned on index).
merged_titanic = pd.concat(objs=[titanic, titanic_target], axis="columns")
merged_titanic.head()


Unnamed: 0,pclass,sex,age,sibsp,parch,fare,male,survived
0,1,female,29.0,0,0,211.3375,False,1
1,1,male,0.9167,1,2,151.55,True,1
2,1,female,2.0,1,2,151.55,False,0
3,1,male,30.0,1,2,151.55,True,0
4,1,female,25.0,1,2,151.55,False,0


In [115]:
# Baseline: logistic regression on two features (fare, age).

# Keep only the rows where both age and fare are present.
merged_titanic = merged_titanic.dropna(axis=0, subset=["age", "fare"])

# Feature matrix and labels as array values.
two_feature_columns = ['fare', 'age']
X = merged_titanic[two_feature_columns].values
y = merged_titanic['survived'].values

# Fit with default hyper-parameters, then show the learned weights and bias.
model = LogisticRegression()
model.fit(X, y)
print(model.coef_, model.intercept_)

[[ 0.0134446  -0.01675355]] [-0.34105161]


In [116]:
# Refit using every retained feature column.
feature_columns = ['pclass', 'male', 'age', 'sibsp', 'parch', 'fare']
X = merged_titanic.loc[:, feature_columns].values
y = merged_titanic.loc[:, 'survived'].values

# Fresh estimator with default settings; fit() is left as the final
# expression so the fitted estimator is the cell's displayed value.
model = LogisticRegression()
model.fit(X, y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [117]:
# Sanity-check prediction for one hand-written passenger.
# Values are positional; presumably in the same column order the model was
# fitted with (pclass, male, age, sibsp, parch, fare) - confirm if that changes.
sample_passenger = [[3, True, 22.0, 1, 0, 7.25]]
model.predict(sample_passenger)

array(['0'], dtype=object)

In [122]:
# Predict labels for X and report the fraction that match y.
# NOTE(review): if X and y are the rows the model was fitted on, this is
# in-sample accuracy rather than a held-out estimate - confirm.
y_pred = model.predict(X)
n_correct = (y == y_pred).sum()
n_total = y.shape[0]
print("accuracy: ", n_correct / n_total)

accuracy:  0.785645933014354


In [123]:
# Accuracy of the fitted model on (X, y) via the estimator's built-in scorer.
model.score(X=X, y=y)

0.785645933014354