<a href="https://colab.research.google.com/github/vforjj/AI-ML-Review/blob/main/1_ModelLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

%matplotlib inline

# Load the preprocessed dataset, taking the CSV's first column as the row
# index, then preview the first rows as the cell output.
df = pd.read_csv(
    'dataset_preprocessing.csv',
    index_col=0,
)
df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_N,Embarked_Q,Embarked_S,Survived
0,0.271174,0.125,0.0,0.014151,0,0,1,0,1,0,0,0,1,0
1,0.472229,0.125,0.0,0.139136,1,0,0,1,0,1,0,0,0,1
2,0.321438,0.0,0.0,0.015469,0,0,1,1,0,0,0,0,1,1
3,0.434531,0.125,0.0,0.103644,1,0,0,1,0,0,0,0,1,1
4,0.434531,0.0,0.0,0.015713,0,0,1,0,1,0,0,0,1,0


In [None]:
# Separate the target column from the feature columns.
# The guard keeps this cell idempotent: once "Survived" has been split off,
# re-running the cell is a no-op instead of raising KeyError on the
# already-reduced frame (label keeps the value from the first execution).
if "Survived" in df.columns:
    label = df["Survived"]
    df = df.drop(columns=["Survived"])

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    label,
    test_size=0.25,
    random_state=1234,
)

### 0. 일반적인 모델 학습

In [None]:
# Baseline hyperparameters
n_estimator, random_state = 300, 1234

# Build the random forest and fit it on the training split
# (fit() returns the estimator itself, so the chained form yields the fitted model)
model = RandomForestClassifier(
    n_estimators=n_estimator,
    random_state=random_state,
).fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Accuracy plus the binary-averaged precision / recall / f-score tuple
accuracy = accuracy_score(y_test, y_pred)
prf = precision_recall_fscore_support(y_test, y_pred, average='binary')

In [None]:
# Show the test-set accuracy as the cell output
accuracy

0.8116591928251121

In [None]:
# Show the tuple returned by precision_recall_fscore_support(..., average='binary')
prf

(0.788235294117647, 0.7362637362637363, 0.7613636363636365, None)

### 1. MLflow를 활용한 모델 학습 Tracking
* custom logging (직접 로깅) 을 통한 모델 학습 기록 관리
* auto logging을 활용한 모델 학습 기록 관리
* auto logging + custom logging

##### 1) Custom logging(직접 로깅)을 통한 모델 학습 기록 관리

In [None]:
import mlflow

'''
	To do
    tracking uri를 셋팅하기 -> local host로 진행
    experiment 생성하기 -> hellomlflow!
    experiment 셋팅하기 -> hellomlflow!
'''

# Point the client at the locally running tracking server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# create_experiment() raises when the name is already taken, which breaks any
# re-run of this cell against the same server. Look the experiment up first
# and only create it when it does not exist yet.
EXPERIMENT_NAME = "hellomlflow!"
existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = existing_experiment.experiment_id

# Keep the experiment id as the cell output, as before.
experiment_id


'899358510632907849'

In [None]:
# Select "hellomlflow!" as the experiment for the runs started below;
# the returned Experiment object is shown as the cell output.
mlflow.set_experiment("hellomlflow!")

<Experiment: artifact_location='mlflow-artifacts:/899358510632907849', creation_time=1685018101041, experiment_id='899358510632907849', last_update_time=1685018101041, lifecycle_stage='active', name='hellomlflow!', tags={}>

In [None]:
'''
	To do
	mlflow 활용해서 Custom Logging을 진행
    아래 함수를 이용해 logging을 진행
		1) mlflow.log_param()
			n_estimator
		2) mlflow.log_model()
			save model
        3) mlflow.log_metric()
			metric -> accuracy, precision, recall, f1score

'''

n_estimator, random_state = 80, 1234

with mlflow.start_run():
    # Train and evaluate inside the run so everything logged below attaches to it
    model = RandomForestClassifier(
        n_estimators=n_estimator,
        random_state=random_state,
    ).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    prf = precision_recall_fscore_support(y_test, y_pred, average="binary")

    # Hyperparameter used for this run
    mlflow.log_param("n_estimator", n_estimator)

    # Test-set metrics: accuracy first, then precision / recall / f1
    for metric_name, metric_value in (
        ("accuracy_on_test", accuracy),
        ("precision_on_test", prf[0]),
        ("recall_on_test", prf[1]),
        ("f1score_on_test", prf[2]),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted estimator with the run under the "model" path
    mlflow.sklearn.log_model(model, "model")



##### 2) auto logging을 활용한 모델 학습 기록 관리

In [None]:
# Mlflow Sklearn을 활용해서 모델 및 메트릭 자동 기록!
'''
	To do
	use autolog() on mlflow
'''

# Turn on MLflow's scikit-learn autologging for the estimator fits that follow.
mlflow.sklearn.autolog()

In [None]:
# Hyperparameters for the autologged run
n_estimator, random_state = 77, 2222
forest_kwargs = {"n_estimators": n_estimator, "random_state": random_state}

# Build the model and fit it on the training split
model = RandomForestClassifier(**forest_kwargs)
model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Accuracy plus the binary-averaged precision / recall / f-score tuple
accuracy = accuracy_score(y_test, y_pred)
prf = precision_recall_fscore_support(y_test, y_pred, average='binary')

2023/05/25 21:46:24 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '666ea157ed7b434685d1411bc85f7aff', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


##### 3) auto logging + custom logging을 활용한 모델 학습 관리

In [None]:
'''
	To do
	auto logging에서 수집되지 않는 정보 추가로 logging 하기
		precision, recall, f1score, accuracy for test data set

'''

with mlflow.start_run():
    # Hyperparameters for this run
    n_estimator, random_state, max_depth = 400, 7777, 2

    model = RandomForestClassifier(
        n_estimators=n_estimator,
        random_state=random_state,
        max_depth=max_depth,
    )
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    prf = precision_recall_fscore_support(y_test, y_pred, average="binary")

    # Explicitly log the test-set metrics on the active run
    precision, recall, f1score = prf[:3]
    mlflow.log_metric("precision_on_test", precision)
    mlflow.log_metric("recall_on_test", recall)
    mlflow.log_metric("f1score_on_test", f1score)
    mlflow.log_metric("accuracy_on_test", accuracy)





### 2. MLflow를 활용한 모델 최적화 Tracking
* 직접 parameter 변경해가며 모델 최적화
* GridSearch 혹은 RandomSearch를 활용하여 모델 최적화


##### 1) 직접 parameter 변경해가며 모델 최적화

In [None]:
def train_model_with_hyperparameters(n_estimator, max_depth, max_feature, random_state=1234):
    """Train a random forest inside its own MLflow run and log test-set metrics.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    splits rather than taking the data as arguments.

    Args:
        n_estimator: Forwarded to ``RandomForestClassifier`` as ``n_estimators``.
        max_depth: Forwarded to ``RandomForestClassifier`` as ``max_depth``.
        max_feature: Forwarded to ``RandomForestClassifier`` as ``max_features``.
        random_state: Seed forwarded to the classifier so that repeated runs
            with the same hyperparameters give the same model, which keeps
            the sweep comparable and repeatable. Pass ``None`` to train an
            unseeded forest.

    Returns:
        None. Precision, recall, f1 score and accuracy on the test split are
        logged to the MLflow run as ``*_on_test`` metrics.
    """
    with mlflow.start_run():
        model = RandomForestClassifier(
            n_estimators=n_estimator,
            max_depth=max_depth,
            max_features=max_feature,
            random_state=random_state,
        )
        model.fit(X_train, y_train)

        # Evaluate on the held-out split
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        prf = precision_recall_fscore_support(y_test, y_pred, average='binary')

        mlflow.log_metric("precision_on_test", prf[0])
        mlflow.log_metric("recall_on_test", prf[1])
        mlflow.log_metric("f1score_on_test", prf[2])
        mlflow.log_metric("accuracy_on_test", accuracy)

In [None]:
# Search space for the manual sweep: 4 * 3 * 4 = 48 combinations.
n_estimators = list(range(50, 201, 50))  # 50, 100, 150, 200
max_depths = [2, 5, 10]
max_features = [5, 8, 10, 13]

'''
    To do
    run model with hyper-parameter combination
'''

# Enumerate every combination up front (n_estimators outermost, max_features
# innermost), then launch one training run per combination.
hyperparameter_grid = [
    (n_estimator, max_depth, max_feature)
    for n_estimator in n_estimators
    for max_depth in max_depths
    for max_feature in max_features
]
for n_estimator, max_depth, max_feature in hyperparameter_grid:
    train_model_with_hyperparameters(n_estimator, max_depth, max_feature)



##### 2) GridSearch 혹은 RandomSearch를 활용하여 모델 최적화


In [None]:
from sklearn.model_selection import GridSearchCV


'''
	To do
	run GridSearchCV with mlflow
'''

with mlflow.start_run():
    # Candidate hyperparameters explored by the grid search
    param_grid = {"n_estimators": [100, 200], "max_depth": [20, 25]}

    # Seed the base estimator: an unseeded forest makes the search results
    # (and the selected best model) change from one execution to the next.
    model_grid = GridSearchCV(RandomForestClassifier(random_state=1234), param_grid)
    model_grid.fit(X_train, y_train)

    # Evaluate the fitted search object on the held-out split
    y_pred = model_grid.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    prf = precision_recall_fscore_support(y_test, y_pred, average="binary")

    mlflow.log_metric("precision_on_test", prf[0])
    mlflow.log_metric("recall_on_test", prf[1])
    mlflow.log_metric("f1score_on_test", prf[2])
    mlflow.log_metric("accuracy_on_test", accuracy)


2023/05/25 22:06:26 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
