In [1]:
import pandas as pd

# Read the training CSV that sits next to the notebook.
train_csv_path = './titanic_train.csv'
titanic_df = pd.read_csv(train_csv_path)

# Show the first three rows as the cell output.
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [2]:
print("학습 데이터 정보\n")
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
titanic_df.info()

학습 데이터 정보

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


#### Age, Cabin, Embarked 에 Null 값이 있고, Cabin 과 Embarked 는 String(object) 값이라는 것을 알 수 있다.
Age 에는 평균값을, Cabin 과 Embarked 에는 "N" 을 넣는다.

In [3]:
# Assign each filled column back to the frame. Calling fillna(inplace=True) on
# a column selection is chained assignment: with pandas Copy-on-Write it acts
# on a temporary object and leaves titanic_df unchanged.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Cabin'] = titanic_df['Cabin'].fillna("N")
titanic_df['Embarked'] = titanic_df['Embarked'].fillna("N")

In [4]:
# sep="" stops print() from inserting a space between the heading and the
# Series, which otherwise indents the first row of every listing.
print("Sex 값 분포 : \n", titanic_df['Sex'].value_counts(), sep="")
print("\nCabin 값 분포 : \n", titanic_df['Cabin'].value_counts(), sep="")
print("\nEmbarked 값 분포 : \n", titanic_df['Embarked'].value_counts(), sep="")

Sex 값 분포 : 
 male      577
female    314
Name: Sex, dtype: int64

Cabin 값 분포 : 
 N              687
B96 B98          4
C23 C25 C27      4
G6               4
E101             3
              ... 
E46              1
D46              1
C103             1
C32              1
B94              1
Name: Cabin, Length: 148, dtype: int64

Embarked 값 분포 : 
 S    644
C    168
Q     77
N      2
Name: Embarked, dtype: int64


#### 선실 번호인 Cabin 값에서 선실 구역(등급)을 나타내는 첫 글자만 남긴다.

In [5]:
# Reduce every cabin code to its leading character (e.g. "C85" -> "C").
titanic_df['Cabin'] = titanic_df['Cabin'].str.slice(stop=1)

# Print how many rows fall under each remaining cabin letter.
cabin_counts = titanic_df['Cabin'].value_counts()
print(cabin_counts)

N    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64


#### Cabin, Sex, Embarked 를 숫자 값으로 레이블 인코딩 한다. (의사 결정 트리 계열 모델이 아닌 경우에는 원-핫 인코딩 사용 !)

In [6]:
from sklearn import preprocessing

def encode_features(dataDF):
    """Label-encode the 'Cabin', 'Sex' and 'Embarked' columns of ``dataDF``.

    Each of the three columns is overwritten with the integer codes produced
    by a new ``LabelEncoder`` fitted on that same column.

    Args:
        dataDF: DataFrame containing the 'Cabin', 'Sex' and 'Embarked' columns.

    Returns:
        The same ``dataDF`` object, modified in place.
    """
    for column_name in ('Cabin', 'Sex', 'Embarked'):
        # A separate encoder per column; it is not kept after the loop, so the
        # learned mapping cannot be re-applied to another frame later.
        encoder = preprocessing.LabelEncoder()
        dataDF[column_name] = encoder.fit_transform(dataDF[column_name])
    return dataDF
        
# Encode the categorical columns and rebind titanic_df to the returned frame
# (encode_features also modifies the frame it receives).
titanic_df = encode_features(titanic_df)
# Preview the first three rows with the encoded columns.
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,7,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,2,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,7,3


#### 함수화

In [7]:
def fillna(df):
    """Fill missing values in the Age, Cabin, Embarked and Fare columns.

    Age gets the column mean, Cabin and Embarked get the placeholder "N",
    and Fare gets 0.

    Each filled column is assigned back to ``df`` rather than calling
    ``fillna(inplace=True)`` on a column selection. That chained form acts on
    a temporary object under pandas Copy-on-Write and would leave ``df``
    unchanged.

    Args:
        df: DataFrame containing the 'Age', 'Cabin', 'Embarked' and 'Fare'
            columns.

    Returns:
        The same ``df`` object, with the four columns filled.
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna("N")
    df['Embarked'] = df['Embarked'].fillna("N")
    df['Fare'] = df['Fare'].fillna(0)
    return df

def drop_features(df):
    """Remove the 'PassengerId', 'Name' and 'Ticket' columns from ``df``.

    Args:
        df: DataFrame containing all three columns; a missing column makes
            ``DataFrame.drop`` raise ``KeyError``.

    Returns:
        The same ``df`` object, with the three columns dropped in place.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

def format_features(df):
    """Shorten Cabin to its first character, then label-encode three columns.

    'Cabin' is reduced to its leading character, after which 'Cabin', 'Sex'
    and 'Embarked' are each overwritten with the integer codes from a new
    ``LabelEncoder`` fitted on that column.

    Args:
        df: DataFrame containing the 'Cabin', 'Sex' and 'Embarked' columns.

    Returns:
        The same ``df`` object, modified in place.
    """
    df['Cabin'] = df['Cabin'].str.slice(stop=1)
    for column_name in ('Cabin', 'Sex', 'Embarked'):
        # The encoder is local to this iteration and is not returned, so the
        # learned mapping is not available to callers.
        encoder = preprocessing.LabelEncoder()
        df[column_name] = encoder.fit_transform(df[column_name])
    return df

def get_titanic_dataset(csv_path='./titanic_train.csv'):
    """Load a Titanic CSV and run the full preprocessing pipeline on it.

    The frame is read from ``csv_path`` and passed through ``fillna``,
    ``drop_features`` and ``format_features`` in that order.

    Args:
        csv_path: Location of the CSV file to read. Defaults to the training
            file this function originally hard-coded, so calls without an
            argument behave as before.

    Returns:
        The preprocessed DataFrame.
    """
    # A distinct local name avoids shadowing the notebook-level titanic_df.
    dataset_df = pd.read_csv(csv_path)
    dataset_df = fillna(dataset_df)
    dataset_df = drop_features(dataset_df)
    dataset_df = format_features(dataset_df)
    return dataset_df