# Titanic: Machine Learning from Disaster
## Author: Nguyễn Văn Việt (https://www.kaggle.com/soleilvn13)
Using logistic regression from the scikit-learn library

In [149]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [150]:
# Load the training and test sets from the notebook's working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [151]:
# Preview the first five rows of the training set.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [152]:
# Count the missing values in every training column.
train_data.isnull().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [153]:
# Frequency of each embarkation port; the most common one is used below
# to fill the missing Embarked entries.
train_data.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [154]:
# Fill the missing values in the training set.
# `fillna` is used throughout instead of `replace(np.NaN, ...)`: the `np.NaN`
# alias no longer exists in NumPy 2.0, and `fillna` is the pandas idiom.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())  # mean age
train_data['Cabin'] = train_data['Cabin'].fillna('U')  # 'U' marks an unknown cabin
train_data['Embarked'] = train_data['Embarked'].fillna('S')  # 'S' is the most common port

In [155]:
# Convert the discrete columns (and the Survived label) to pandas categoricals
# so their integer codes can be read through `.cat.codes` later on.
for column in ('Sex', 'Ticket', 'Cabin', 'Embarked', 'Pclass', 'Survived'):
    train_data[column] = train_data[column].astype('category')

In [156]:
# Logistic regression with an L2 penalty, optimised by the L-BFGS solver.
clf = LogisticRegression(penalty='l2', solver='lbfgs')

In [157]:
# Turn the label and every feature into an (n_samples, 1) column vector.
# Categorical columns contribute their integer category codes.
y = np.array(train_data.Survived.cat.codes)[:, np.newaxis]
x1 = np.array(train_data.Pclass.cat.codes)[:, np.newaxis]
x2 = np.array(train_data.Sex.cat.codes)[:, np.newaxis]
x3 = np.array(train_data.SibSp)[:, np.newaxis]
x4 = np.array(train_data.Parch)[:, np.newaxis]
x5 = np.array(train_data.Fare)[:, np.newaxis]
x6 = np.array(train_data.Age)[:, np.newaxis]
x7 = np.array(train_data.Embarked.cat.codes)[:, np.newaxis]
# Cabin is not used as a feature.

In [158]:
# Training feature matrix: one column per feature, in a fixed order.
X = np.concatenate(
    [
        x1,  # Pclass code
        x2,  # Sex code
        x3,  # SibSp
        x4,  # Parch
        x5,  # Fare
        x6,  # Age
        x7,  # Embarked code
    ],
    axis=1,
)

In [159]:
# Fit on the training matrix; the labels are passed as a 1-D array by taking
# the single column of `y`.
clf.fit(X, y[:, 0])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [160]:
# Mean accuracy on the same data the model was fitted on (an in-sample score,
# not an estimate of accuracy on unseen passengers).
clf.score(X, y.ravel())

0.8002244668911336

In [161]:
# Preview the first ten rows of the test set rather than displaying the whole
# frame, mirroring the `head()` preview used for the training set.
test_data.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S


In [162]:
# Fill the missing values in the test set, using training-set statistics for
# the numeric columns. `fillna` replaces `replace(np.NaN, ...)` because the
# `np.NaN` alias no longer exists in NumPy 2.0.
test_data['Age'] = test_data['Age'].fillna(train_data['Age'].mean())
test_data['Fare'] = test_data['Fare'].fillna(train_data['Fare'].mean())
test_data['Cabin'] = test_data['Cabin'].fillna('U')  # 'U' marks an unknown cabin
test_data['Embarked'] = test_data['Embarked'].fillna('S')  # same fill value as the training set

# Ticket and Cabin are not fed to the model, so they keep their own categories.
test_data['Ticket'] = test_data['Ticket'].astype('category')
test_data['Cabin'] = test_data['Cabin'].astype('category')

# Columns that become model features must reuse the training set's categories.
# Categories inferred from the test set alone could map the same label to a
# different `.cat.codes` integer than the one the classifier was trained on.
for feature in ('Sex', 'Embarked', 'Pclass'):
    test_data[feature] = test_data[feature].astype(train_data[feature].dtype)
    # A label absent from the training categories becomes NaN (code -1);
    # fail loudly instead of predicting from a meaningless code.
    if test_data[feature].isnull().any():
        raise ValueError(
            "test_data[%r] contains values not seen in train_data" % feature
        )

In [163]:
# Build the test features as (n_samples, 1) column vectors, in the same way
# and order as the training features.
x1_test = np.array(test_data.Pclass.cat.codes)[:, np.newaxis]
x2_test = np.array(test_data.Sex.cat.codes)[:, np.newaxis]
x3_test = np.array(test_data.SibSp)[:, np.newaxis]
x4_test = np.array(test_data.Parch)[:, np.newaxis]
x5_test = np.array(test_data.Fare)[:, np.newaxis]
x6_test = np.array(test_data.Age)[:, np.newaxis]
x7_test = np.array(test_data.Embarked.cat.codes)[:, np.newaxis]
# Cabin is not used as a feature.

In [164]:
# Check that no missing values remain in the test set after the fills above.
test_data.isnull().sum(axis=0)

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [165]:
# Test feature matrix, with columns in the same order as the training matrix.
X_test = np.concatenate(
    [
        x1_test,  # Pclass code
        x2_test,  # Sex code
        x3_test,  # SibSp
        x4_test,  # Parch
        x5_test,  # Fare
        x6_test,  # Age
        x7_test,  # Embarked code
    ],
    axis=1,
)

In [166]:
# Predict a label for every row of the test feature matrix with the fitted
# classifier. Despite the name, `y_test` holds predictions, not ground truth;
# the values are the Survived category codes the model was trained on.
y_test = clf.predict(X_test)

In [167]:
# Submission table: one row per test passenger with its predicted label.
df = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': y_test,
})

In [168]:
# Write the submission file without the DataFrame index column.
submission_path = 'predicts_logistic_regression1.csv'
df.to_csv(submission_path, index=False)