# 라이브러리(필요한 도구) 불러오기

In [10]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score


## 데이터 로딩

In [11]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## 데이터 전처리 

In [12]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Train 데이터 변환

train.columns (1차: 0.76 -> 0.80)
              (2차: 0.80 -> 0.8008)
# PassengerId : drop
# Pclass : 등실의 등급 연관성 체크
# Name : Title 처리 후 드랍
# Sex : 원 핫 인코딩
# Age : Data Binning 이후 드랍
# Sibsp : 형제자매, 아내, 남편 수
# Parch : 부모, 자식 수 
# Sibsp + Parch + 1 (나포함 가족)
# Ticket : drop
# Fare : Data Binning
# Cabin : drop
# Embarked : 원 핫 인코딩


In [13]:
# 결측치 확인
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Name 컬럼 처리

In [14]:
# Name 컬럼 확인
train['Title'] = train.Name.str.extract('([A-Za-z]+)\.', expand=False)
test['Title'] = train.Name.str.extract('([A-Za-z]+)\.', expand=False)

pd.crosstab(train['Title'], train['Sex'])
# Master는 모두 남성을 칭하는 것을 알 수 있음
# 1, 2 번씩 짜잘하게 나오는 값들은 rare로 일괄처리

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [15]:
# rare 데이터 변환
train['Title'] = train['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

# female (male) 을 칭하는 호칭을 Miss (Mrs)로 통일
train['Title'] = train['Title'].replace(['Mlle'], 'Miss')
train['Title'] = train['Title'].replace(['Ms'], 'Miss')
train['Title'] = train['Title'].replace(['Mme'], 'Mrs')

In [16]:
# 원 핫 인코딩
mapping = {'Mr' : 1, 'Miss' : 2, 'Mrs' : 3, 'Master' : 4, 'Rare' : 5}

train['Title'] = train['Title'].map(mapping).astype(int)

print(train['Title'].value_counts())
print()
print(test['Title'].value_counts())

Title
1    517
2    185
3    126
4     40
5     23
Name: count, dtype: int64

Title
Mr        228
Miss      101
Mrs        58
Master     23
Rev         3
Dr          3
Don         1
Mme         1
Name: count, dtype: int64


### Age 컬럼 체크

In [17]:
# Age 결측치 평균값으로 처리
train['Age'].fillna(train['Age'].mean(), inplace=True)
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64

In [18]:
train['Sex'] = train['Sex'].map({'male' : 0, 'female' : 1})

In [19]:
# Age값을 나이대 (8구간) 으로 나누기
# 더 다양한 구간으로 나누면, 더 좋은 효과를 볼 수 있을 것 같음.
# Data Binning
train['AgeBand'] = pd.cut(train['Age'], 8)
train.groupby('AgeBand', as_index=False)['Survived'].mean().sort_values(by = 'AgeBand', ascending=True)

train['AgeBand'] = pd.cut(train['Age'], bins = 8, labels = [0, 1, 2, 3, 4, 5, 6, 7])

train.groupby('AgeBand', as_index=False)['Survived'].mean().sort_values(by = 'AgeBand', ascending = True)

Unnamed: 0,AgeBand,Survived
0,0,0.59375
1,1,0.382609
2,2,0.334152
3,3,0.445161
4,4,0.383721
5,5,0.404762
6,6,0.235294
7,7,0.2


### Family, Alone 변수 추가

In [20]:
# 형제자매 + 부모자식 + 나
train['Family'] = train['SibSp'] + train['Parch'] + 1

train.groupby('Family', as_index=False)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002338AA1EB90>

In [21]:
train['Alone'] = 0
train.loc[train['Family'] == 1, 'Alone'] = 1

train.groupby('Alone', as_index=False)['Survived'].mean()
# 혼자 탔을때랑 가족이랑 탔을때랑 생존 확률이 다른 것을 알 수 있음

Unnamed: 0,Alone,Survived
0,0,0.50565
1,1,0.303538


### Fare 표준화

In [22]:
# Fare : 티켓 요금

# pd.qcut : q=5 : 동일한 개수로 범위를 5개로 나눔.
# > 각 구간에 동일한 데이터 개수가 들어감
# pd.cut : 사용자가 지정한 경계값으로 범위를 나눔

# 값을 보면 티켓 가격이 비쌀수록 Survived의 값이 올라가는 것을 확인할 수 있음

train['FareBand'] = pd.qcut(train['Fare'], q=5)
train.groupby('FareBand', as_index=False)['Survived'].mean().sort_values(by='FareBand', ascending=True)
fareband_analysis = train.groupby('FareBand', as_index=False)['Survived'].mean().sort_values(by='FareBand', ascending=True)
print(fareband_analysis)

train['FareBand'] = pd.qcut(train['Fare'], q=5, labels=[0, 1, 2, 3, 4])
train.groupby('FareBand', as_index=False)['Survived'].mean().sort_values(by='FareBand', ascending=True)

            FareBand  Survived
0    (-0.001, 7.854]  0.217877
1      (7.854, 10.5]  0.201087
2     (10.5, 21.679]  0.424419
3   (21.679, 39.688]  0.444444
4  (39.688, 512.329]  0.642045


Unnamed: 0,FareBand,Survived
0,0,0.217877
1,1,0.201087
2,2,0.424419
3,3,0.444444
4,4,0.642045


In [23]:
# AgeBand 컬럼을 category에서 float64로 변환 -> 해야할까??
train['FareBand'] = train['FareBand'].cat.codes.astype('float64')


### Embarked 처리

In [24]:
# Embarked 결측치 2개 제거
train = train.dropna(subset=['Embarked'])
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
AgeBand          0
Family           0
Alone            0
FareBand         0
dtype: int64

In [25]:
train['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [26]:
train.loc[train['Embarked'] == 'S', 'Embarked']=2
train.loc[train['Embarked'] == 'C', 'Embarked']=1
train.loc[train['Embarked'] == 'Q', 'Embarked']=0

train['Embarked'] = train['Embarked'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Embarked'] = train['Embarked'].astype(int)


In [27]:
train['Cabin'].value_counts() # 연관성 없어보이니 드랍


Cabin
B96 B98        4
G6             4
C23 C25 C27    4
E101           3
C22 C26        3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 146, dtype: int64

In [28]:
train.isnull().sum()  # 결측치 확인

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
AgeBand          0
Family           0
Alone            0
FareBand         0
dtype: int64

## 모델 정의 및 학습

In [39]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

X = train.drop(['Survived', 'Cabin', 'PassengerId', 'Name', 'Ticket', 'Age'], axis=1)
y = train['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)


In [40]:
# 검증용 데이터에 대해 예측
val_pred = model.predict(X_val)

# F1 score 계산
f1 = f1_score(y_val, val_pred, average='macro')
print(f"F1 score on validation set: {f1:.4f}")


F1 score on validation set: 0.8008


## 다양한 모델 사용해보기

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# 모델 정의
model = LogisticRegression(random_state=42)

# 교차 검증 평균 정확도 계산 (훈련 데이터 사용)
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy (mean): {cv_scores.mean():.2f}")

model.fit(X_train, y_train)

# 검증 데이터 정확도 계산
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# Cross-Validation (Logistic Regression): 0.81

Cross-Validation Accuracy (mean): 0.81
Validation Accuracy: 0.76


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1e-3, 1e-2, 1e-1, 1, 10],
}


# SVM 모델과 그리드 서치 설정
svc = SVC()
grid_search = GridSearchCV(estimator=svc, param_grid=param, scoring='accuracy', cv=5, n_jobs=-1)

# 학습 및 최적 파라미터 탐색
grid_search.fit(X_train, y_train)
# 결과 출력
print("최적 하이퍼파라미터:", grid_search.best_params_)
print("최적 정확도:", grid_search.best_score_)
# 점수가 가장 높은 모델은 best_estimator_ 에 저장됨
svm = grid_search.best_estimator_

# SVM, GridSearch: 0.822

최적 하이퍼파라미터: {'C': 1000, 'gamma': 0.001}
최적 정확도: 0.8227715946025805


## 제출 파일 생성

In [38]:
# submission = pd.read_csv("titanic/sample_submission.csv")
# submission['Survived'] = test_y_pred
# submission.to_csv('베이스 라인 .csv', index = False)