In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl
import seaborn as sns

In [3]:
# Font properties handed to matplotlib's rc system below.
global_font = {'family':'D2Coding','size' : 12 , 'weight':'bold'}

# Apply the font settings
mpl.rc('font',**global_font)

# Minus-sign setting: do not use the Unicode minus glyph on axes
mpl.rc('axes',unicode_minus=False)

# Display figures inside the notebook (magic command)
%matplotlib inline

In [4]:
# Apply the minus-sign, font-family and grid defaults in a single call.
plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.family': 'D2Coding',
    'axes.grid': False,
})

## 데이터 확인

In [5]:
# NOTE(review): hard-coded absolute Windows path; the relative 'datas/Titanic.csv' read that follows depends on this working directory.
%cd C:\\python

C:\python


In [6]:
# Load the Titanic CSV (path is relative to the current working directory),
# then show the frame's shape and column labels.
df = pd.read_csv('datas/Titanic.csv')
df.shape , df.columns

((891, 12),
 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'))

In [7]:
# Normalise every column label to lower case, then display the result.
df.columns = df.columns.str.lower()
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [8]:
# Column dtypes and non-null counts of the raw table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
# Summary statistics for the numeric columns
df.describe(include = np.number)

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
# Summary statistics for the non-numeric (string) columns
df.describe(exclude = np.number)

Unnamed: 0,name,sex,ticket,cabin,embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [11]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# Preview the last rows of the raw table.
df.tail()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [13]:
# Relabel the two survival codes as 'Died'/'Survived' (in category order)
# and summarise their counts and relative frequencies.
new_survived = pd.Categorical(df['survived']).rename_categories(['Died','Survived'])

new_survived.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,549,0.616162
Survived,342,0.383838


## 데이터 분리

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows with a fixed seed, and give each part a fresh 0..n-1 index.
seed = 42
X_tr, X_te = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=seed)
)

(X_tr.shape, X_te.shape)

((712, 12), (179, 12))

In [16]:
# Column labels of the training split.
X_tr.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [17]:
# Inspect the target column within the training split:
# relabel the survival codes, print the first five labels, then summarise.
new_survived = pd.Categorical(X_tr['survived']).rename_categories(['Died','Survived'])

print(new_survived[:5])
new_survived.describe()

['Died', 'Died', 'Died', 'Died', 'Died']
Categories (2, object): ['Died', 'Survived']


Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,444,0.623596
Survived,268,0.376404


## data cleaning

### 필요없는 값 처리

In [18]:
# Compare the number of distinct passenger ids with the number of rows.
X_tr['passengerid'].nunique() , X_tr.shape[0]

(712, 712)

In [19]:
# passengerid is unique for every row, so it is removed from both splits (in place).
X_tr.drop(columns=['passengerid'], inplace=True)
X_te.drop(columns=['passengerid'], inplace=True)

X_tr.columns

Index(['survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked'],
      dtype='object')

### 결측치 처리

In [20]:
# Share of missing values per column, largest first
# (the mean of the boolean mask equals missing count / row count).
X_tr.isna().mean().round(4).sort_values(ascending=False)

cabin       0.7767
age         0.1966
embarked    0.0028
survived    0.0000
pclass      0.0000
name        0.0000
sex         0.0000
sibsp       0.0000
parch       0.0000
ticket      0.0000
fare        0.0000
dtype: float64

In [21]:
# Report shape and total missing-value count before and after removing `cabin`.
# Fix: the "before" label previously read "lsnull()" instead of "isnull()".
print(f'before: {X_tr.shape} / isnull().sum() : {X_tr.isnull().sum().sum()}')
# Alternative (not used): drop every row that has a missing value with X_tr.dropna(axis=0)

# Remove the cabin column from both splits.
X_tr = X_tr.drop('cabin', axis = 1)
X_te = X_te.drop('cabin', axis = 1)

print(f'after: {X_tr.shape} / isnull().sum(): {X_tr.isnull().sum().sum()}')

before: (712, 11) / lsnull().sum() : 695
after: (712, 10) / isnull().sum(): 142


In [22]:
# Impute missing ages in both splits with the median of the TRAINING split,
# so that no statistic is learned from the held-out rows.
age_median = X_tr['age'].median()
X_tr['age'] = X_tr['age'].fillna(age_median)
# Fix: the test column must be filled from X_te itself. Assigning
# X_tr['age'] here replaced every test age with the index-aligned training age.
X_te['age'] = X_te['age'].fillna(age_median)

In [23]:
# Most frequent embarkation port in the training split; used to fill both splits.
embarked_mode = X_tr['embarked'].mode().values[0]

X_tr['embarked'] = X_tr['embarked'].fillna(embarked_mode)
# Fix: fill the test split's own column. Assigning X_tr['embarked'] here
# replaced every test value with the index-aligned training value.
X_te['embarked'] = X_te['embarked'].fillna(embarked_mode)

In [24]:
# Total number of missing values left in the training split (the test split is not checked here).
X_tr.isnull().sum().sum()

0

## Feature Extraction 
> 기존 Feature에 기반하여 새로운 Feature 생성

### 데이터 타입

In [25]:
# Column dtypes and non-null counts of the training split after cleaning.
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  712 non-null    int64  
 1   pclass    712 non-null    int64  
 2   name      712 non-null    object 
 3   sex       712 non-null    object 
 4   age       712 non-null    float64
 5   sibsp     712 non-null    int64  
 6   parch     712 non-null    int64  
 7   ticket    712 non-null    object 
 8   fare      712 non-null    float64
 9   embarked  712 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 55.8+ KB


#### 수치형 데이터 타입 변환

In [26]:
# Take the numeric columns of the training split and list their names.
df_number = X_tr.select_dtypes(include=np.number)
df_number.columns

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')

In [27]:
# Dtypes and non-null counts of the numeric subset.
df_number.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  712 non-null    int64  
 1   pclass    712 non-null    int64  
 2   age       712 non-null    float64
 3   sibsp     712 non-null    int64  
 4   parch     712 non-null    int64  
 5   fare      712 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 33.5 KB


In [28]:
# Preview the first rows of the numeric subset.
df_number.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,1,45.5,0,0,28.5
1,0,2,23.0,0,0,13.0
2,0,3,32.0,0,0,7.925
3,0,3,26.0,1,0,7.8542
4,0,3,6.0,4,2,31.275


In [29]:
# survived: store the target as 32-bit integers in both splits.
X_tr = X_tr.astype({'survived': 'int32'})
X_te = X_te.astype({'survived': 'int32'})

In [30]:
# pclass: list the distinct values present in the training split.
X_tr['pclass'].unique()

array([1, 2, 3], dtype=int64)

In [31]:
# pclass: convert to the categorical dtype in both splits.
X_tr = X_tr.astype({'pclass': 'category'})
X_te = X_te.astype({'pclass': 'category'})

In [32]:
# age: number of distinct values in the training split.
X_tr['age'].nunique()

83

In [33]:
# age: store as 32-bit integers in both splits (fractional ages lose their decimal part).
X_tr = X_tr.astype({'age': 'int32'})
X_te = X_te.astype({'age': 'int32'})

In [34]:
# sibsp: number of distinct values in the training split.
X_tr['sibsp'].nunique()

7

In [35]:
# sibsp: convert to the categorical dtype in both splits.
X_tr = X_tr.astype({'sibsp': 'category'})
X_te = X_te.astype({'sibsp': 'category'})

In [36]:
# parch: number of distinct values in the training split.
X_tr['parch'].nunique()

7

In [37]:
# parch: convert to the categorical dtype in both splits.
X_tr = X_tr.astype({'parch': 'category'})
X_te = X_te.astype({'parch': 'category'})

In [38]:
# fare: list the distinct fare values in the training split.
X_tr['fare'].unique()

array([ 28.5   ,  13.    ,   7.925 ,   7.8542,  31.275 , 247.5208,
        26.55  ,  27.7208,   7.8958,  35.5   ,  24.15  ,  12.275 ,
         7.0542,   9.5   ,  26.    ,  90.    , 227.525 ,  57.    ,
         6.2375,   8.6625,  26.25  ,   9.5875,   7.2292,  22.3583,
         9.4833, 120.    ,  14.4583,   8.05  , 211.5   ,   7.25  ,
         7.725 ,  25.4667,  21.075 ,  30.    ,  61.3792,  20.2125,
        30.5   ,   7.05  ,  14.5   ,   7.5208, 151.55  ,  21.    ,
       262.375 ,   7.75  ,   7.775 ,  80.    ,   9.8417,  12.35  ,
         0.    ,   7.225 ,   6.4375,  12.475 , 133.65  ,   6.975 ,
        77.9583,  10.5   , 106.425 ,  81.8583,  11.1333,  27.75  ,
       153.4625,   8.3   ,  15.05  , 110.8833,  15.0458,  39.6875,
         7.8792,  23.45  ,   7.65  ,  15.7417,  15.2458,  51.8625,
        15.5   ,  41.5792,  14.4542,  10.5167,  20.525 ,  89.1042,
        36.75  ,  55.4417,  50.    ,  13.8625,  16.7   ,  13.5   ,
        35.    ,  55.9   ,   7.8   ,  34.375 ,  18.    ,  47.1

In [39]:
# fare: store as 32-bit floats in both splits.
X_tr = X_tr.astype({'fare': 'float32'})
X_te = X_te.astype({'fare': 'float32'})

In [40]:
# Confirm the dtypes after the numeric conversions.
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  712 non-null    int32   
 1   pclass    712 non-null    category
 2   name      712 non-null    object  
 3   sex       712 non-null    object  
 4   age       712 non-null    int32   
 5   sibsp     712 non-null    category
 6   parch     712 non-null    category
 7   ticket    712 non-null    object  
 8   fare      712 non-null    float32 
 9   embarked  712 non-null    object  
dtypes: category(3), float32(1), int32(2), object(4)
memory usage: 33.6+ KB


#### 범주형 데이터 타입 변환

In [41]:
# Take the object-dtype columns of the training split and list their names.
df_object = X_tr.select_dtypes(include='object')
df_object.columns

Index(['name', 'sex', 'ticket', 'embarked'], dtype='object')

In [42]:
# Dtypes and non-null counts of the object-dtype subset.
df_object.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      712 non-null    object
 1   sex       712 non-null    object
 2   ticket    712 non-null    object
 3   embarked  712 non-null    object
dtypes: object(4)
memory usage: 22.4+ KB


In [43]:
# Preview the first rows of the object-dtype subset.
df_object.head()

Unnamed: 0,name,sex,ticket,embarked
0,"Partner, Mr. Austen",male,113043,S
1,"Berriman, Mr. William John",male,28425,S
2,"Tikkanen, Mr. Juho",male,STON/O 2. 3101293,S
3,"Hansen, Mr. Henrik Juul",male,350025,S
4,"Andersson, Miss. Ebba Iris Alfrida",female,347082,S


In [44]:
# sex: list the distinct values in the training split.
X_tr['sex'].unique()

array(['male', 'female'], dtype=object)

In [45]:
# sex: convert to the categorical dtype in both splits.
X_tr = X_tr.astype({'sex': 'category'})
X_te = X_te.astype({'sex': 'category'})

In [46]:
# embarked: convert to the categorical dtype in both splits.
X_tr = X_tr.astype({"embarked": "category"})
X_te = X_te.astype({"embarked": "category"})

In [47]:
# Confirm the dtypes after the categorical conversions.
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  712 non-null    int32   
 1   pclass    712 non-null    category
 2   name      712 non-null    object  
 3   sex       712 non-null    category
 4   age       712 non-null    int32   
 5   sibsp     712 non-null    category
 6   parch     712 non-null    category
 7   ticket    712 non-null    object  
 8   fare      712 non-null    float32 
 9   embarked  712 non-null    category
dtypes: category(5), float32(1), int32(2), object(2)
memory usage: 24.1+ KB


In [48]:
# Preview the training split after the dtype conversions.
X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S


### 문자열

In [49]:
# Re-select the columns that are still object dtype (free-text columns).
df_object = X_tr.select_dtypes(include='object')

In [50]:
# Preview the remaining object-dtype columns.
df_object.head()

Unnamed: 0,name,ticket
0,"Partner, Mr. Austen",113043
1,"Berriman, Mr. William John",28425
2,"Tikkanen, Mr. Juho",STON/O 2. 3101293
3,"Hansen, Mr. Henrik Juul",350025
4,"Andersson, Miss. Ebba Iris Alfrida",347082


In [51]:
df_object.describe()  # the ticket column contains duplicate values

Unnamed: 0,name,ticket
count,712,712
unique,712,558
top,"Partner, Mr. Austen",CA. 2343
freq,1,7


#### 공백제거
> strip : 앞뒤 공백 제거 / str에만 사용 가능

In [52]:
# Trim leading/trailing whitespace from the free-text columns of both splits.
X_tr['name'] = X_tr['name'].map(str.strip)
X_tr['ticket'] = X_tr['ticket'].map(str.strip)

X_te['name'] = X_te['name'].map(str.strip)
X_te['ticket'] = X_te['ticket'].map(str.strip)

In [53]:
# Preview the stripped columns from X_tr itself: df_object is a snapshot taken
# before the strip, so displaying it would not reflect the cleaned values.
X_tr[['name', 'ticket']].head()

Unnamed: 0,name,ticket
0,"Partner, Mr. Austen",113043
1,"Berriman, Mr. William John",28425
2,"Tikkanen, Mr. Juho",STON/O 2. 3101293
3,"Hansen, Mr. Henrik Juul",350025
4,"Andersson, Miss. Ebba Iris Alfrida",347082


#### 문자열 포함 여부

이름에 포함된 Mr., Miss. 등의 호칭으로부터 탑승객에 대한 추가 정보(Feature)를 추출한다.

In [54]:
# Title tokens (with trailing period) that may appear in a passenger name,
# mapped to a short Korean description of the title.
# NOTE(review): some descriptions look loose (e.g. 'Don.' is described as '교수') — confirm before relying on the values.
dict_designation = {
    'Mr.': '남성',
    'Master.': '남성',
    'Sir.': '남성',
    'Miss.': '미혼 여성',
    'Mrs.': '기혼 여성',
    'Ms.': '미혼/기혼 여성',
    'Lady.': '숙녀',
    'Mlle.': '아가씨',
    # occupation
    'Dr.': '의사',
    'Rev.': '목사',
    'Major.': '계급',
    'Don.': '교수',
    'Col.': '군인',
    'Capt.': '군인',
    # nobility
    'Mme.': '영부인',
    'Countess.': '백작부인',
    'Jonkheer.': '귀족'
}


In [55]:
# List the title tokens that will be searched for in the names.
dict_designation.keys()

dict_keys(['Mr.', 'Master.', 'Sir.', 'Miss.', 'Mrs.', 'Ms.', 'Lady.', 'Mlle.', 'Dr.', 'Rev.', 'Major.', 'Don.', 'Col.', 'Capt.', 'Mme.', 'Countess.', 'Jonkheer.'])

In [56]:
def add_designation(name, designations=None):
    """Return the title token (e.g. 'Mr.') found in a passenger name.

    Names are expected to look like "<last>, <title>. <given names>". The word
    immediately before the first period after the comma is tried first, so a
    title that merely appears later in the name (for example inside a
    parenthesised alias) cannot win over the passenger's actual title. If that
    parse does not yield a known title, the function falls back to the original
    behaviour: the first known title, in mapping order, that occurs anywhere in
    the name.

    Parameters
    ----------
    name : str
        Full passenger name.
    designations : mapping or iterable of str, optional
        Known title tokens including the trailing period. Defaults to the
        notebook-level ``dict_designation``.

    Returns
    -------
    str
        The matched title token, or 'unknown' when nothing matches or when
        ``name`` is not a string (e.g. a missing value).
    """
    known = dict_designation if designations is None else designations
    if not isinstance(name, str):
        return 'unknown'

    # Preferred path: parse the token between the comma and the first period.
    _, comma, after_comma = name.partition(',')
    if comma:
        before_dot, dot, _ = after_comma.partition('.')
        words = before_dot.split()
        if dot and words:
            # Use the last word so forms like "the Countess." still resolve.
            candidate = words[-1] + '.'
            if candidate in known:
                return candidate

    # Fallback: original substring scan in mapping order.
    for key in known:
        if key in name:
            return key
    return 'unknown'

In [57]:
# Derive the title feature for every passenger in both splits.
X_tr['designation'] = X_tr['name'].map(add_designation)
X_te['designation'] = X_te['name'].map(add_designation)

X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.


In [58]:
# Validate the training split: show rows whose title was not recognised.
cond = X_tr['designation'].eq('unknown')
X_tr[cond]

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation


In [59]:
# Validate the test data

In [60]:
# Show test-split rows whose title was not recognised.
cond = X_te['designation'].eq('unknown')
X_te[cond]

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation


#### 문자열 분리

In [61]:
# Preview the current text columns from X_tr itself: df_object is an earlier
# snapshot and does not reflect the later in-place cleaning of these columns.
X_tr[['name', 'ticket']].head()

Unnamed: 0,name,ticket
0,"Partner, Mr. Austen",113043
1,"Berriman, Mr. William John",28425
2,"Tikkanen, Mr. Juho",STON/O 2. 3101293
3,"Hansen, Mr. Henrik Juul",350025
4,"Andersson, Miss. Ebba Iris Alfrida",347082


In [62]:
# Family name: the text that precedes the first comma of the full name.
X_tr['last_name'] = X_tr['name'].map(lambda full: full.partition(',')[0])
X_te['last_name'] = X_te['name'].map(lambda full: full.partition(',')[0])

X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Partner
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,Berriman
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Tikkanen
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Hansen
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Andersson


In [63]:
# Given name(s): everything after the title's period in "<last>, <title>. <given>".
# Fixes: maxsplit=1 keeps text after later periods (e.g. initials such as "L."),
# which the unbounded split used to cut off, and strip() removes the leading
# space that the split left in front of every value.
X_tr['first_name'] = X_tr['name'].map(lambda x : x.split(',', 1)[1].split('.', 1)[1].strip())
X_te['first_name'] = X_te['name'].map(lambda x : x.split(',', 1)[1].split('.', 1)[1].strip())

X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,first_name
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Partner,Austen
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,Berriman,William John
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Tikkanen,Juho
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Hansen,Henrik Juul
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Andersson,Ebba Iris Alfrida


In [64]:
# Keep only the final space-separated token of each ticket
# (a ticket without any space is kept whole; the value stays a string).
X_tr['ticket_number'] = X_tr['ticket'].map(lambda raw: raw.rsplit(' ', 1)[-1])
X_te['ticket_number'] = X_te['ticket'].map(lambda raw: raw.rsplit(' ', 1)[-1])

X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,first_name,ticket_number
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Partner,Austen,113043
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,Berriman,William John,28425
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Tikkanen,Juho,3101293
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Hansen,Henrik Juul,350025
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Andersson,Ebba Iris Alfrida,347082


### 집계

#### 피벗테이블

In [65]:
# Mean fare per passenger class, computed on the training split only.
df_pivot = (
    X_tr.pivot_table(index='pclass', values='fare', aggfunc='mean')
    .reset_index()
    .rename(columns={'fare': 'fare_mean_by_class'})
)
df_pivot

Unnamed: 0,pclass,fare_mean_by_class
0,1,89.253914
1,2,20.575939
2,3,13.93486


In [66]:
# Attach the per-class mean fare (learned on the training split) to both splits.
print(f'before: {X_tr.shape}')
X_tr = X_tr.merge(df_pivot, how='left', on='pclass')
X_te = X_te.merge(df_pivot, how='left', on='pclass')
print(f'after: {X_tr.shape}')
X_tr.head()

before: (712, 14)
after: (712, 15)


Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,first_name,ticket_number,fare_mean_by_class
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Partner,Austen,113043,89.253914
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,Berriman,William John,28425,20.575939
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Tikkanen,Juho,3101293,13.93486
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Hansen,Henrik Juul,350025,13.93486
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Andersson,Ebba Iris Alfrida,347082,13.93486
