In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

### 데이터 path설정은 파일위치에 따라 수정해야함

In [2]:
path = 'data_file/'

df_train = pd.read_csv(path+'train.csv')
df_test = pd.read_csv(path+'test.csv')

In [3]:
from sklearn.model_selection import KFold
from sklearn.metrics import * # accuracy_score, recall_score, f1_score, precision_score 등등 전부가져오기
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [4]:
# 가정: X는 특성 데이터, y는 타겟 데이터
features = ["Pclass", "Sex", "SibSp", "Parch"]

X = pd.get_dummies(df_train[features]) # One_hot인코딩형태로..
y = df_train["Survived"]

# 랜덤 포레스트 모델 초기화
rf_model = RandomForestClassifier()

# K-Fold 교차 검증 설정
kf = KFold(n_splits=5)  # 5겹 교차 검증

# 각 분할에 대한 정확도를 저장할 리스트
accuracies = []
all_f1_score = []
precisions = []
recalls = []
# K-Fold 교차 검증 수행
for train_index, test_index in kf.split(X):
    # 훈련 데이터와 테스트 데이터 분할
    # print(train_index, test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # 모델 훈련
    rf_model.fit(X_train, y_train)

    # 예측 및 정확도 계산
    predictions = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1_scores = f1_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)

    # 평가지표 추가
    accuracies.append(accuracy)
    all_f1_score.append(f1_scores)
    precisions.append(precision)
    recalls.append(recall)

# 평균 스코어 확인
avg_accuracy = np.mean(accuracies)
avg_f1 = np.mean(f1_scores)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)

# 평균값 출력
print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)

Average Accuracy: 0.780051471972883
Average F1 Score: 0.7368421052631577
Average Precision: 0.7545321850824698
Average Recall: 0.6428518273224111


In [5]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint

In [6]:
scores = ['accuracy', 'recall', 'precision', 'f1']

param_distribs = {
    'n_estimators': randint(100, 200),
    'max_depth': randint(3, 10)
}

# 여러 평가 지표 설정
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted')
}

# 랜덤 서치 수행
rd_rf = RandomizedSearchCV(estimator=rf_model, cv=5, param_distributions=param_distribs, n_iter=10,
                            n_jobs=-1, scoring=scoring, refit='accuracy', random_state=42)
rd_rf.fit(X, y)

# 교차 검증 결과 확인
cv_results = rd_rf.cv_results_

result = pd.DataFrame(cv_results)

display(result[['params','mean_test_accuracy', 'mean_test_precision',
                'mean_test_recall', 'mean_test_f1_score']])

print('--' * 40)

print(rd_rf.best_params_) # 최적의 parameter print

Unnamed: 0,params,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score
0,"{'max_depth': 9, 'n_estimators': 151}",0.783416,0.782443,0.783416,0.78052
1,"{'max_depth': 7, 'n_estimators': 114}",0.785663,0.785023,0.785663,0.782599
2,"{'max_depth': 5, 'n_estimators': 171}",0.800251,0.80019,0.800251,0.797584
3,"{'max_depth': 7, 'n_estimators': 120}",0.78791,0.788081,0.78791,0.784483
4,"{'max_depth': 9, 'n_estimators': 182}",0.786793,0.787212,0.786793,0.783005
5,"{'max_depth': 9, 'n_estimators': 174}",0.78454,0.784276,0.78454,0.781018
6,"{'max_depth': 5, 'n_estimators': 187}",0.800264,0.800426,0.800264,0.7971
7,"{'max_depth': 7, 'n_estimators': 199}",0.78791,0.787757,0.78791,0.784604
8,"{'max_depth': 5, 'n_estimators': 121}",0.795776,0.795696,0.795776,0.792966
9,"{'max_depth': 7, 'n_estimators': 101}",0.78454,0.783607,0.78454,0.781541


--------------------------------------------------------------------------------
{'max_depth': 5, 'n_estimators': 187}


In [21]:
rf_final = RandomForestClassifier(max_depth=5, n_estimators=187)

rf_final.fit(X, y)

In [22]:
features = ["Pclass", "Sex", "SibSp", "Parch"]

x_test = pd.get_dummies(df_test[features])

In [23]:
y_test_pred = rf_final.predict(x_test)

In [24]:
# 타이타닉 데이터프레임으로 만든 후 to_csv를 통해 titanic.csv파일로 변환

submission = pd.DataFrame({
        "PassengerId": df_test["PassengerId"],
        "Survived": y_test_pred
    })
submission.to_csv('titanic.csv', index=False)

## Confusion Matrix

In [30]:
path = 'data_file/'

df_train = pd.read_csv(path+'train.csv')
df_test = pd.read_csv(path+'test.csv')

In [31]:
features = ["Pclass", "Sex", "SibSp", "Parch"]

x = pd.get_dummies(df_train[features])
y = df_train["Survived"]

In [32]:
y.value_counts(normalize=True)

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

In [33]:
# stratify는 불균형한 타겟값의 비율을 맞춰서 나눠준다.

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2
                                                  ,stratify=y, random_state=42)

In [34]:
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((712, 5), (179, 5), (712,), (179,))

In [35]:
y_train.value_counts(normalize=True)

Survived
0    0.616573
1    0.383427
Name: proportion, dtype: float64

In [36]:
y_val.value_counts(normalize=True)

Survived
0    0.614525
1    0.385475
Name: proportion, dtype: float64

In [37]:
rf_2 = RandomForestClassifier()

rf_2.fit(x_train, y_train)

In [38]:
val_pred = rf_2.predict(x_val)

### classification_report 추가

In [39]:
print(confusion_matrix(y_val, val_pred))
print(classification_report(y_val, val_pred))

[[96 14]
 [29 40]]
              precision    recall  f1-score   support

           0       0.77      0.87      0.82       110
           1       0.74      0.58      0.65        69

    accuracy                           0.76       179
   macro avg       0.75      0.73      0.73       179
weighted avg       0.76      0.76      0.75       179

