In [4]:
'''
Overview

This is the original data from the Titanic competition, plus some changes that I applied to make it better suited for binary logistic regression:

1. Merged the train and test data.
2. Removed the 'ticket' and 'cabin' attributes.
3. Moved the 'Survived' attribute to the last column.
4. Added extra zero columns for categorical inputs to be better suited for One-Hot-Encoding.
5. Substituted the values of 'Sex' and 'Embarked' attributes with binary and categorical values respectively.
6. Filled the missing values in 'Age' and 'Fare' attributes with the median of the data.
'''

import os # Operating System Control
import pandas as pd # Data Processing
import numpy as np # Linear Algebra

# Directory holding the input CSVs. Defaults to the author's original location
# so existing runs behave the same; set the TITANIC_DATA_DIR environment
# variable to point the notebook at a different machine/layout.
DATA_DIR = os.environ.get(
    "TITANIC_DATA_DIR", "/Users/yungi/Documents/Hello_Atom/Titanic/inputs"
)

# Print every file found under the data directory so the available inputs
# are visible in the cell output.
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the pre-merged train+test table described in the overview docstring.
data = pd.read_csv(os.path.join(DATA_DIR, "train_and_test2.csv"))

## Importing Libraries
import matplotlib.pyplot as plt
import seaborn as sns

## Modeling Libraries
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier # Study
from sklearn.ensemble import RandomForestClassifier # Study
## Evaluate Model
from sklearn.metrics import confusion_matrix, roc_curve

/Users/yungi/Documents/Hello_Atom/Titanic/inputs/train_and_test2.csv


In [5]:
## Data Preprocessing
# Preview the first rows to sanity-check the loaded columns and their encodings.
data.head()

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,...,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,2urvived
0,1,22.0,7.25,0,1,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
1,2,38.0,71.2833,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0.0,0,0,1
2,3,26.0,7.925,1,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,1
3,4,35.0,53.1,1,1,0,0,0,0,0,...,0,0,0,1,0,0,2.0,0,0,1
4,5,35.0,8.05,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0


In [6]:
# List every column label (the wide preview elides the middle columns).
# Note the target column is spelled '2urvived' in the raw file.
data.columns

Index(['Passengerid', 'Age', 'Fare', 'Sex', 'sibsp', 'zero', 'zero.1',
       'zero.2', 'zero.3', 'zero.4', 'zero.5', 'zero.6', 'Parch', 'zero.7',
       'zero.8', 'zero.9', 'zero.10', 'zero.11', 'zero.12', 'zero.13',
       'zero.14', 'Pclass', 'zero.15', 'zero.16', 'Embarked', 'zero.17',
       'zero.18', '2urvived'],
      dtype='object')

In [8]:
# Per-column summary statistics; a `count` below the row total flags missing values.
data.describe()

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,...,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,2urvived
count,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,...,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1307.0,1309.0,1309.0,1309.0
mean,655.0,29.503186,33.281086,0.355997,0.498854,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.294882,0.0,0.0,1.492731,0.0,0.0,0.261268
std,378.020061,12.905241,51.7415,0.478997,1.041658,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.837836,0.0,0.0,0.814626,0.0,0.0,0.439494
min,1.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,328.0,22.0,7.8958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,655.0,28.0,14.4542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0
75%,982.0,35.0,31.275,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0
max,1309.0,80.0,512.3292,1.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0


In [12]:
data.info() # Age and Fare have no NaN values (the overview says they were median-filled), but Embarked still has 2 missing values (1307 of 1309 non-null).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 28 columns):
Passengerid    1309 non-null int64
Age            1309 non-null float64
Fare           1309 non-null float64
Sex            1309 non-null int64
sibsp          1309 non-null int64
zero           1309 non-null int64
zero.1         1309 non-null int64
zero.2         1309 non-null int64
zero.3         1309 non-null int64
zero.4         1309 non-null int64
zero.5         1309 non-null int64
zero.6         1309 non-null int64
Parch          1309 non-null int64
zero.7         1309 non-null int64
zero.8         1309 non-null int64
zero.9         1309 non-null int64
zero.10        1309 non-null int64
zero.11        1309 non-null int64
zero.12        1309 non-null int64
zero.13        1309 non-null int64
zero.14        1309 non-null int64
Pclass         1309 non-null int64
zero.15        1309 non-null int64
zero.16        1309 non-null int64
Embarked       1307 non-null float64
zero.17     

In [15]:
# Fix the misspelled target column from the raw file ('2urvived' -> 'Survived').
# The mapping dict may hold several old->new pairs to rename many columns at once.
# Reassigning the result (instead of inplace=True) avoids mutating the frame in place.
data = data.rename(columns={'2urvived': 'Survived'})

In [16]:
# Re-run the summary to confirm the target column now appears as 'Survived'.
data.describe()

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,...,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,Survived
count,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,...,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1307.0,1309.0,1309.0,1309.0
mean,655.0,29.503186,33.281086,0.355997,0.498854,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.294882,0.0,0.0,1.492731,0.0,0.0,0.261268
std,378.020061,12.905241,51.7415,0.478997,1.041658,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.837836,0.0,0.0,0.814626,0.0,0.0,0.439494
min,1.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,328.0,22.0,7.8958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,655.0,28.0,14.4542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0
75%,982.0,35.0,31.275,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0
max,1309.0,80.0,512.3292,1.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0
