In [10]:
import pandas as pd
import numpy as np

In [11]:
#データの読み込み
df_train = pd.read_csv("data_titanic/train.csv")
df_test = pd.read_csv("data_titanic/test.csv")

In [12]:
#データの確認
df_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
#データの確認
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [14]:
#データの前処理

#年齢のカテゴリ分け
def simplify_ages(df):
    df.Age = df.Age.fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
    categories = pd.cut(df.Age, bins, labels=group_names)
    df.Age = categories
    return df

#Cabinのカテゴリ分け
def simplify_cabins(df):
    df.Cabin = df.Cabin.fillna('N')
    df.Cabin = df.Cabin.apply(lambda x: x[0])
    return df

#fareのカテゴリ分け
def simplify_fares(df):
    df.Fare = df.Fare.fillna(-0.5)
    bins = (-1, 0, 8, 15, 31, 1000)
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    categories = pd.cut(df.Fare, bins, labels=group_names)
    df.Fare = categories
    return df

#名前とprefix整理
def format_name(df):
    df['Lname'] = df.Name.apply(lambda x: x.split(' ')[0])
    df['NamePrefix'] = df.Name.apply(lambda x: x.split(' ')[1])
    return df

#不必要なカラム削除
def drop_features(df):
    return df.drop(['Ticket', 'Name', 'Embarked'], axis=1)

def transform_features(df):
    df = simplify_ages(df)
    df = simplify_cabins(df)
    df = simplify_fares(df)
    df = format_name(df)
    df = drop_features(df)
    return df

df_train = transform_features(df_train)
df_test = transform_features(df_test)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Lname,NamePrefix
0,1,0,3,male,Student,1,0,1_quartile,N,"Braund,",Mr.
1,2,1,1,female,Adult,1,0,4_quartile,C,"Cumings,",Mrs.
2,3,1,3,female,Young Adult,0,0,1_quartile,N,"Heikkinen,",Miss.
3,4,1,1,female,Young Adult,1,0,4_quartile,C,"Futrelle,",Mrs.
4,5,0,3,male,Young Adult,0,0,2_quartile,N,"Allen,",Mr.


In [15]:
#バイナリデータの作成

from sklearn import preprocessing
def binarize_features(df_train, df_test):
    features = ['Pclass', 'Sex','Age','Fare' ]
    df_combined = pd.concat([df_train[features], df_test[features]])
    df_lb_train = pd.DataFrame()
    df_lb_test = pd.DataFrame()

    for feature in features:
        lb = preprocessing.LabelBinarizer()
        lb = lb.fit(df_combined[feature])
#         lb_train = lb.transform(df_train[feature])
#         lb_test = lb.transform(df_test[feature])
        df_lb_train= pd.concat([df_lb_train, pd.DataFrame(lb.transform(df_train[feature]))], axis=1)
        df_lb_test= pd.concat([df_lb_test, pd.DataFrame(lb.transform(df_test[feature]))], axis=1)
                                
    return df_lb_train, df_lb_test

df_lb_train, df_lb_test = binarize_features(df_train, df_test)

In [16]:
df_lb_train.head()

Unnamed: 0,0,1,2,0.1,0.2,1.1,2.1,3,4,5,6,7,0.3,1.2,2.2,3.1,4.1
0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0


In [17]:
#トレーニングデータとテストデータの生成

#学習データはfloat型、テストデータはint型にする
x_train= df_lb_train.values.astype(np.float32)
y_train = df_train["Survived"].values
y_train = y_train.reshape(len(y_train),1).astype(np.int32)
x_test = df_lb_test.values.astype(np.float32)

In [18]:
#データの確認
x_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

  from ._conv import register_converters as _register_converters


iteration   main/loss 
[J100         0.556185    
[J200         0.450288    
[J300         0.438324    
[J400         0.438085    
[J500         0.438085    
[J600         0.438085    
[J700         0.438085    
[J800         0.438085    
[J900         0.438085    
