# 라이브러리(필요한 도구) 불러오기

In [35]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score


## 데이터 로딩

In [36]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

## 데이터 전처리 

In [37]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Train 데이터 변환

train.columns (1차: 0.76 -> 0.80)
- PassengerId : -
- Pclass : -
- Name : Title 처리 후 드랍
- Sex : -
- Age : Data Binning 이후 드랍
- SibSp : -
- Parch : - 
- Ticket : -
- Fare : -
- Cabin : -
- Embarked : 결측치 제거 후 정수 인코딩 (Q=0, C=1, S=2)


In [38]:
# Count the missing values in every column (isna is the same check as isnull).
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Name 컬럼 처리

In [39]:
# Extract the honorific (the run of letters immediately before a '.') from
# each passenger name into a new Title column.
# The pattern is a raw string so that '\.' reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape.
title_pattern = r'([A-Za-z]+)\.'
train['Title'] = train['Name'].str.extract(title_pattern, expand=False)
# Fix: test titles must come from test's own Name column. The previous code
# read train.Name, which copied the training titles into test by index.
# NOTE(review): assumes test.csv has a Name column like train.csv - confirm.
test['Title'] = test['Name'].str.extract(title_pattern, expand=False)

pd.crosstab(train['Title'], train['Sex'])
# 'Master' only ever appears with male passengers.
# Titles that occur only once or twice are merged into 'Rare' in the next cell.

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [40]:
# Normalise the extracted titles. The same rules are applied to train and
# test so both frames end up with the same Title vocabulary.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternate spellings that are folded into the common female titles.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for frame in (train, test):
    # Uncommon titles collapse into a single 'Rare' bucket.
    frame['Title'] = frame['Title'].replace(rare_titles, 'Rare')
    # 'Mlle' and 'Ms' become 'Miss'; 'Mme' becomes 'Mrs'.
    frame['Title'] = frame['Title'].replace(title_aliases)

In [41]:
# Integer label encoding for Title: one column of codes, not one-hot columns.
mapping = {'Mr' : 1, 'Miss' : 2, 'Mrs' : 3, 'Master' : 4, 'Rare' : 5}

train['Title'] = train['Title'].map(mapping).astype(int)
# Encode test with the same codes so both frames share one representation.
# Titles missing from the mapping become NaN after map(); they fall back to
# the 'Rare' code so the integer cast cannot fail on them.
test['Title'] = test['Title'].map(mapping).fillna(mapping['Rare']).astype(int)
print(train['Title'].value_counts())

Title
1    517
2    185
3    126
4     40
5     23
Name: count, dtype: int64


### Age 컬럼 체크

In [42]:
# Impute missing ages with the mean age of the training set.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
mean_age = train['Age'].mean()
train['Age'] = train['Age'].fillna(mean_age)
# Fill any missing test ages with the train-derived mean, so the Age feature
# selected for test_x is complete and no statistic is taken from the test split.
test['Age'] = test['Age'].fillna(mean_age)
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64

In [43]:
# Encode Sex as 0 (male) / 1 (female).
# The same encoding is applied to test because test_x also selects the Sex
# column and must use the representation the model is trained on.
sex_codes = {'male' : 0, 'female' : 1}
train['Sex'] = train['Sex'].map(sex_codes)
test['Sex'] = test['Sex'].map(sex_codes)

### Embarked 처리

In [44]:
# Drop the rows whose Embarked value is missing, then re-check the NaN counts.
train = train.dropna(axis='index', subset=['Embarked'])
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
dtype: int64

In [45]:
# Show how many of the remaining rows belong to each embarkation port.
train.Embarked.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [46]:
# Replace the port letters with integer codes: Q -> 0, C -> 1, S -> 2.
port_codes = {'Q': 0, 'C': 1, 'S': 2}
train['Embarked'] = train['Embarked'].map(port_codes).astype(int)

In [47]:
# Model inputs: the same four columns, in the same order, for train and test.
feature_columns = ['Pclass', 'SibSp', 'Age', 'Sex']

# Target label for the training rows.
train_y = train['Survived']
train_x = train[feature_columns]
test_x = test[feature_columns]

## 모델 정의 및 학습

In [48]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

# Seed the tree as well. Without random_state the fitted tree, and therefore
# the validation score, can differ from run to run even though the split
# above is fixed.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)


In [49]:
# Predict on the held-out validation rows.
val_pred = model.predict(X_val)

# Score the predictions with the macro-averaged F1 and report it.
f1 = f1_score(y_true=y_val, y_pred=val_pred, average='macro')
print(f"F1 score on validation set: {f1:.4f}")


F1 score on validation set: 0.8085


## 제출 파일 생성

In [50]:
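# NOTE: test_y_pred is never defined in the cells above. Compute it from the
# trained model first (test_x must be preprocessed the same way as train_x)
# before uncommenting this cell.
# submission = pd.read_csv("titanic/sample_submission.csv")
# submission['Survived'] = test_y_pred
# submission.to_csv('베이스 라인 .csv', index = False)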