In [123]:
import pandas as pd
import numpy as np

train = pd.read_csv("titanic/train.csv")
test = pd.read_csv("titanic/test.csv")

In [142]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
 11  1            891 non-null    uint8  
 12  2            891 non-null    uint8  
 13  3            891 non-null    uint8  
 14  deck         204 non-null    object 
dtypes: float64(2), int64(4), object(6), uint8(3)
memory usage: 86.3+ KB


In [143]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Name         418 non-null    object 
 2   Sex          418 non-null    object 
 3   Age          418 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Ticket       418 non-null    object 
 7   Fare         418 non-null    float64
 8   Cabin        91 non-null     object 
 9   Embarked     418 non-null    object 
 10  1            418 non-null    uint8  
 11  2            418 non-null    uint8  
 12  3            418 non-null    uint8  
 13  deck         91 non-null     object 
dtypes: float64(2), int64(3), object(6), uint8(3)
memory usage: 37.3+ KB


In [124]:
# 결측값 개수 세기
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [125]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [126]:
# Pclass
# 1등실 : 329 2등실 : 285 3등실 : 710

train_pclass_cnt = train['Pclass'].value_counts()
test_pclass_cnt = test['Pclass'].value_counts()

print("train_pclass_cnt : \n",train_pclass_cnt)
print("test_pclass_cnt : \n",test_pclass_cnt)

train_pclass_cnt : 
 3    491
1    216
2    184
Name: Pclass, dtype: int64
test_pclass_cnt : 
 3    218
1    107
2     93
Name: Pclass, dtype: int64


In [127]:
# Pclass를 one hot vector로 바꿔주기
pclass_train_dummies = pd.get_dummies(train["Pclass"])
pclass_test_dummies = pd.get_dummies(test["Pclass"])

train.drop(["Pclass"],axis=1, inplace=True)
test.drop(["Pclass"],axis=1, inplace=True)

train = train.join(pclass_train_dummies)
test = test.join(pclass_test_dummies)

In [128]:
train.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,1,2,3
0,1,0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,0
4,5,0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1


In [129]:
# Age 결측값 채우기
train["Age"].fillna(train["Age"].mean() , inplace=True)
test["Age"].fillna(train["Age"].mean() , inplace=True)

In [130]:
# fare : 탑승료
test['Fare'].fillna(0,inplace=True)

In [132]:
# Cabin에서 deck 뽑아오기

def get_deck(x):
    rooms = str(x).split()
    return rooms[0][:1]

train["deck"] = train["Cabin"].apply(get_deck)
train["deck"] = train['deck'].replace("n",np.NaN)

test["deck"] = test["Cabin"].apply(get_deck)
test["deck"] = test['deck'].replace("n",np.NaN)

In [133]:
test

Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,1,2,3,deck
0,892,"Kelly, Mr. James",male,34.500000,0,0,330911,7.8292,,Q,0,0,1,
1,893,"Wilkes, Mrs. James (Ellen Needs)",female,47.000000,1,0,363272,7.0000,,S,0,0,1,
2,894,"Myles, Mr. Thomas Francis",male,62.000000,0,0,240276,9.6875,,Q,0,1,0,
3,895,"Wirz, Mr. Albert",male,27.000000,0,0,315154,8.6625,,S,0,0,1,
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.000000,1,1,3101298,12.2875,,S,0,0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,"Spector, Mr. Woolf",male,29.699118,0,0,A.5. 3236,8.0500,,S,0,0,1,
414,1306,"Oliva y Ocana, Dona. Fermina",female,39.000000,0,0,PC 17758,108.9000,C105,C,1,0,0,C
415,1307,"Saether, Mr. Simon Sivertsen",male,38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,0,1,
416,1308,"Ware, Mr. Frederick",male,29.699118,0,0,359309,8.0500,,S,0,0,1,


In [134]:
train[train[1]==1]['deck'].value_counts()

C    59
B    47
D    29
E    25
A    15
T     1
Name: deck, dtype: int64

In [135]:
test[test[1]==1]['deck'].value_counts()

C    35
B    18
D    11
E     9
A     7
Name: deck, dtype: int64

In [144]:
# 1등실이고 deck가 NaN이면 C로 채우기

In [138]:
train[train[2]==1]['deck'].value_counts()

F    8
D    4
E    4
Name: deck, dtype: int64

In [139]:
test[test[2]==1]['deck'].value_counts()

F    5
D    2
Name: deck, dtype: int64

In [None]:
# 2등실이고 deck가 NaN이면 F로 채우기

In [140]:
train[train[3]==1]['deck'].value_counts()

F    5
G    4
E    3
Name: deck, dtype: int64

In [141]:
test[test[3]==1]['deck'].value_counts()

F    3
G    1
Name: deck, dtype: int64

In [None]:
# 3등실이고 deck가 NaN이면 G로 채우기

In [145]:
# Embarked
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [146]:
test["Embarked"].value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [147]:
# train에서 Embarked 2개 결측치 최빈값 S로 채우기!
train["Embarked"].fillna('S', inplace=True)

In [148]:
# one hat vector로 바꾸기

embarked_train_dummies = pd.get_dummies(train['Embarked'])
embarked_test_dummies = pd.get_dummies(test['Embarked'])

embarked_train_dummies.columns = ['S', 'C', 'Q']
embarked_test_dummies.columns = ['S', 'C', 'Q']

train.drop(['Embarked'], axis=1, inplace=True)
test.drop(['Embarked'], axis=1, inplace=True)

train = train.join(embarked_train_dummies)
test = test.join(embarked_test_dummies)

In [149]:
display(train)

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,1,2,3,deck,S,C,Q
0,1,0,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,0,0,1,,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,1,0,0,C,1,0,0
2,3,1,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,0,0,1,,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,1,0,0,C,0,0,1
4,5,0,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,0,0,1,,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,0,1,0,,0,0,1
887,888,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,1,0,0,B,0,0,1
888,889,0,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,,0,0,1,,0,0,1
889,890,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,1,0,0,C,1,0,0


In [150]:
display(test)

Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,1,2,3,deck,S,C,Q
0,892,"Kelly, Mr. James",male,34.500000,0,0,330911,7.8292,,0,0,1,,0,1,0
1,893,"Wilkes, Mrs. James (Ellen Needs)",female,47.000000,1,0,363272,7.0000,,0,0,1,,0,0,1
2,894,"Myles, Mr. Thomas Francis",male,62.000000,0,0,240276,9.6875,,0,1,0,,0,1,0
3,895,"Wirz, Mr. Albert",male,27.000000,0,0,315154,8.6625,,0,0,1,,0,0,1
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.000000,1,1,3101298,12.2875,,0,0,1,,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,"Spector, Mr. Woolf",male,29.699118,0,0,A.5. 3236,8.0500,,0,0,1,,0,0,1
414,1306,"Oliva y Ocana, Dona. Fermina",female,39.000000,0,0,PC 17758,108.9000,C105,1,0,0,C,1,0,0
415,1307,"Saether, Mr. Simon Sivertsen",male,38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,0,0,1,,0,0,1
416,1308,"Ware, Mr. Frederick",male,29.699118,0,0,359309,8.0500,,0,0,1,,0,0,1
