# Titanic

# 1. Import Libraries and Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots drawn after this point.
sns.set_style("whitegrid")

import warnings
# Silence every warning raised from here on. Note that this also hides
# deprecation/future warnings emitted by the libraries imported above.
warnings.filterwarnings("ignore")

# 2. Loading and Viewing the Data Set

In [2]:
# Read the training and test CSV files that sit next to the notebook.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv")
)

In [3]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_train.describe()  # summary statistics for the numeric columns

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
df_train.info()  # brief overview of every column (non-null count and dtype)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Column names of the training frame, then of the test frame.
for frame in (df_train, df_test):
    print(frame.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


# 3. Dealing with NaN Values (Imputation)

In [7]:
# Check how many null values the train and test frames contain.
def null_table(train, test):
    """Print the per-column count of missing values for two frames.

    The counts for ``train`` are printed under a "Train Data Frame" heading,
    followed by a blank line and the counts for ``test`` under a second
    heading.

    Args:
        train: Frame whose missing values are reported first.
        test: Frame whose missing values are reported second.

    Returns:
        None. The report is written to standard output.
    """
    train_missing = pd.isna(train).sum()
    print("Train Data Frame", train_missing, "", sep="\n")

    test_missing = pd.isna(test).sum()
    print("Test Data Fram", test_missing, sep="\n")
    
# Report the missing-value counts of the freshly loaded frames.
null_table(df_train, df_test)

Train Data Frame
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Test Data Fram
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [8]:
# Use a regular expression to pull out the run of letters that ends with a "."
# in each Name (e.g. "Mr.") and store it in a new Title column.
# One vectorised .str.extract call already processes the whole column, so no
# per-row loop is needed. The raw string keeps "\." as a regex escape rather
# than an (invalid) Python string escape.
TITLE_PATTERN = r'([A-Za-z]+\.)'

# expand=False returns a Series for the single capture group.
df_train['Title'] = df_train['Name'].str.extract(TITLE_PATTERN, expand=False)
df_test['Title'] = df_test['Name'].str.extract(TITLE_PATTERN, expand=False)

In [9]:
# Titles other than Miss./Mr./Mrs./Master. are all relabelled as "Other.".
rare_titles = [
    "Mlle.", "Major.", "Col.", "Sir.", "Don.", "Mme.", "Jonkheer.",
    "Lady.", "Capt.", "Countess.", "Ms.", "Dona.", "Rev.", "Dr.",
]
title_replacements = {rare_title: "Other." for rare_title in rare_titles}

# The five remaining labels are then replaced by integer codes.
title_codes = {"Miss.": 0, "Mr.": 1, "Mrs.": 2, "Master.": 3, "Other.": 4}

for frame in (df_train, df_test):
    frame.replace({"Title": title_replacements}, inplace=True)
    for label, code in title_codes.items():
        frame.loc[frame["Title"] == label, "Title"] = code

In [10]:
# Distinct values left in the Title column after encoding.
df_train["Title"].unique()

array([1, 2, 0, 3, 4], dtype=object)

In [11]:
# Median Age of the training passengers in each Title group
# (codes: 0 = Miss., 1 = Mr., 2 = Mrs., 3 = Master., 4 = Other.).
# Age is selected *before* reducing, so .median() only ever sees the numeric
# Age values. Calling DataFrame.median() on the full frame would also try to
# reduce the text columns (Name, Sex, ...), which raises a TypeError on
# pandas >= 2.0.
train_age = df_train["Age"]
train_title = df_train["Title"]

miss_md = train_age[train_title == 0].median()
mr_md = train_age[train_title == 1].median()
mrs_md = train_age[train_title == 2].median()
master_md = train_age[train_title == 3].median()
other_md = train_age[train_title == 4].median()

In [12]:
# Display the median age computed for the Title == 0 ("Miss.") group.
miss_md

21.0

In [13]:
# Fill each missing Age with the median age of the passenger's Title group.
# The test frame is filled with the same medians, which were computed from
# the training frame.
median_age_by_title = {
    0: miss_md,
    1: mr_md,
    2: mrs_md,
    3: master_md,
    4: other_md,
}

for frame in (df_train, df_test):
    for title_code, median_age in median_age_by_title.items():
        needs_age = (frame['Title'] == title_code) & frame['Age'].isna()
        frame.loc[needs_age, 'Age'] = median_age


In [14]:
# Display the training frame, now carrying the Title column and the filled Age values.
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,4
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,21.0,1,2,W./C. 6607,23.4500,,S,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


In [15]:
# Re-print the missing-value counts after the Age imputation.
null_table(df_train, df_test)

Train Data Frame
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64

Test Data Fram
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Title            0
dtype: int64


In [16]:
# Rows of the test frame whose Fare is missing.
df_test.loc[df_test["Fare"].isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,1


In [17]:
# Summary of the Embarked column (count, number of distinct values, most frequent value).
df_train["Embarked"].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [18]:
# Fill the missing Embarked values with "S".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute: that chained in-place form is
# deprecated in recent pandas and does not update df_train under copy-on-write.
df_train["Embarked"] = df_train["Embarked"].fillna("S")

In [19]:
# Re-print the missing-value counts after filling Embarked.
null_table(df_train, df_test)

Train Data Frame
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
dtype: int64

Test Data Fram
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Title            0
dtype: int64


In [20]:
# Fill the missing Fare in the test frame with the median Fare of the
# training frame.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute: that chained in-place form is
# deprecated in recent pandas and does not update df_test under copy-on-write.
df_test["Fare"] = df_test["Fare"].fillna(df_train["Fare"].median())

In [21]:
# Work on copies from here on so df_train / df_test stay as imputed above.
train, test = df_train.copy(), df_test.copy()

In [22]:
# Print the missing-value counts of the working copies.
null_table(train, test)

Train Data Frame
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
dtype: int64

Test Data Fram
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Title            0
dtype: int64


In [23]:
# Remove the Cabin and Ticket columns from both working copies,
# then print the missing-value counts again.
unused_columns = ['Cabin', 'Ticket']
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

null_table(train, test)

Train Data Frame
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Title          0
dtype: int64

Test Data Fram
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Title          0
dtype: int64
