In [68]:
import pandas as pd
import os

# Anchor all data paths at the directory the notebook kernel was started in.
path = os.getcwd()

# Locations of the train / test CSVs (the folder name translates to
# "data / label encoding").
train_csv_path = path + '/데이터들/라벨 인코딩/train.csv'
test_csv_path = path + '/데이터들/라벨 인코딩/test.csv'

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [69]:
from flaml import AutoML
import numpy as np
from sklearn.metrics import mean_squared_log_error
import time

# FLAML AutoML searcher used for the all-learner fit (`automl.fit(...)`) and
# the single-run predictions (`automl.predict(...)`) in a later cell.
automl = AutoML()

def rmsle_custom_metric(
    X_val, y_val, estimator, labels,
    X_train, y_train, weight_val=None, weight_train=None,
    *args,
):
    """Custom metric for FLAML: root mean squared logarithmic error (RMSLE).

    Parameters
    ----------
    X_val, y_val
        Validation features and targets; these drive the loss being minimised.
    estimator
        Fitted model exposing ``predict(X)``.
    labels
        Unused; accepted so the signature matches what the caller passes.
    X_train, y_train
        Training features and targets; only used to log the training RMSLE.
    weight_val, weight_train
        Optional sample weights for the validation / training RMSLE.
    *args
        Any further positional arguments are accepted and ignored.

    Returns
    -------
    tuple
        ``(rmsle_val, info)`` where ``info`` is a dict with keys
        ``"val_rmsle"``, ``"train_rmsle"`` and ``"pred_time"`` (seconds of
        prediction time per validation row).
    """
    def postprocess_predictions(y_pred):
        """Replace negative predictions with 1 so the log error is defined."""
        return np.where(y_pred < 0, 1, y_pred)

    def rmsle(y_true, y_pred, sample_weight):
        """Square root of the (optionally weighted) mean squared log error."""
        return np.sqrt(mean_squared_log_error(y_true, y_pred, sample_weight=sample_weight))

    start = time.time()
    # Predict on the validation set; timing covers prediction + post-processing.
    y_pred_val = postprocess_predictions(estimator.predict(X_val))
    pred_time = (time.time() - start) / len(X_val)

    # Loss that the search minimises.
    rmsle_val = rmsle(y_val, y_pred_val, weight_val)

    # Training-set RMSLE, for logging only. It must be computed from the
    # training targets/predictions/weights (not the validation ones), otherwise
    # the logged value is just a copy of the validation score.
    y_pred_train = postprocess_predictions(estimator.predict(X_train))
    rmsle_train = rmsle(y_train, y_pred_train, weight_train)

    return rmsle_val, {"val_rmsle": rmsle_val, "train_rmsle": rmsle_train, "pred_time": pred_time}

# FLAML search configuration: a regression task scored with the custom RMSLE
# metric, logging trials to a text file.
automl_settings = {
    "time_budget": 300,  # time budget in seconds
    "metric": rmsle_custom_metric,
    "task": 'regression',
    "log_file_name": "flaml_log.txt",
}

# Columns removed from the training frame to form the feature matrix:
# the target ('ECLO'), the 'Unnamed: 0' column, the four casualty-count
# columns, and the raw categorical columns (their '*_레이블' label-encoded
# counterparts remain, as the printed column index shows).
non_feature_columns = [
    'ECLO', 'Unnamed: 0',
    '사망자수', '중상자수', '경상자수', '부상자수',
    '요일', '기상상태', '노면상태', '사고유형',
    '시', '구', '동',
    '도로형태1', '도로형태2',
    'holiday', '시간구분', '계절',
]
X_train = train_df.drop(columns=non_feature_columns)

print(X_train.columns)

Index(['년도', '월', '시간', '기상상태_레이블', '노면상태_레이블', '사고유형_레이블', '시_레이블', '구_레이블',
       '동_레이블', '도로형태1_레이블', '도로형태2_레이블', 'holiday_레이블', '시간구분_레이블', '계절_레이블'],
      dtype='object')


In [66]:
# Train the all-learner AutoML search on the selected features, with ECLO as
# the regression target.
automl.fit(X_train, y_train=train_df['ECLO'], **automl_settings)

# Predictions: pass exactly the columns (and column order) the model was fitted
# on. The raw test frame still carries 'ID' and the un-encoded categorical
# columns that were excluded from X_train, so select the training columns
# explicitly instead of only dropping 'Unnamed: 0'.
test_features = test_df[X_train.columns]
flaml_predictions = automl.predict(test_features)
print(flaml_predictions[:5])

[flaml.automl.logger: 03-04 11:52:14] {1679} INFO - task = regression
[flaml.automl.logger: 03-04 11:52:14] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 03-04 11:52:14] {1788} INFO - Minimizing error metric: customized metric
[flaml.automl.logger: 03-04 11:52:14] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 03-04 11:52:14] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 03-04 11:52:14] {2344} INFO - Estimated sufficient time budget=8382s. Estimated necessary time budget=72s.
[flaml.automl.logger: 03-04 11:52:14] {2391} INFO -  at 0.4s,	estimator lgbm's best error=0.4411,	best estimator lgbm's best error=0.4411
[flaml.automl.logger: 03-04 11:52:14] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 03-04 11:52:14] {2391} INFO -  at 0.5s,	estimator lgbm's best error=0.4411,	best estimator lgbm's best error=0.4411
[flaml.automl.logger: 

In [70]:
# Best tuned model per learner, keyed by the FLAML estimator name.
best_models = {}

my_estimator = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']

for estimator in my_estimator:
    # Use a dedicated searcher per learner so the module-level `automl`
    # (the all-learner run used for the single-model predictions) is not
    # silently replaced by the last single-learner run.
    single_automl = AutoML()
    # Give every learner its own log file; with one shared `log_file_name`
    # each run would overwrite the log written by the previous one.
    single_settings = {**automl_settings, "log_file_name": f"flaml_log_{estimator}.txt"}
    single_automl.fit(
        X_train=X_train,
        y_train=train_df['ECLO'],
        estimator_list=[estimator],
        **single_settings,
    )
    # Save the best model for the current estimator
    best_models[estimator] = single_automl.model

[flaml.automl.logger: 03-04 16:47:02] {1679} INFO - task = regression
[flaml.automl.logger: 03-04 16:47:02] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 03-04 16:47:02] {1788} INFO - Minimizing error metric: customized metric
[flaml.automl.logger: 03-04 16:47:02] {1900} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 03-04 16:47:02] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 03-04 16:47:02] {2344} INFO - Estimated sufficient time budget=11211s. Estimated necessary time budget=11s.
[flaml.automl.logger: 03-04 16:47:02] {2391} INFO -  at 0.5s,	estimator lgbm's best error=0.4411,	best estimator lgbm's best error=0.4411
[flaml.automl.logger: 03-04 16:47:02] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 03-04 16:47:02] {2391} INFO -  at 0.5s,	estimator lgbm's best error=0.4411,	best estimator lgbm's best error=0.4411
[flaml.automl.logger: 03-04 16:47:02] {2218} INFO - iteration 2, current learner l

In [None]:
# Show which tuned estimator object was stored under each learner name.
print(best_models)

{'lgbm': <flaml.automl.model.LGBMEstimator object at 0x0000024429F219A0>, 'rf': <flaml.automl.model.RandomForestEstimator object at 0x00000244261DAC90>, 'catboost': <flaml.automl.model.CatBoostEstimator object at 0x0000024429F28E60>, 'xgboost': <flaml.automl.model.XGBoostSklearnEstimator object at 0x0000024426148CB0>, 'extra_tree': <flaml.automl.model.ExtraTreesEstimator object at 0x000002446D3AE240>, 'xgb_limitdepth': <flaml.automl.model.XGBoostLimitDepthEstimator object at 0x00000244298A2810>}


In [71]:
def voting_ensemble(models, X):
    """Average the predictions of several regressors (simple voting ensemble).

    Parameters
    ----------
    models
        Either a mapping of name -> fitted model (only the values are used) or
        any iterable of fitted models. Each model must expose ``predict(X)``.
    X
        Feature matrix passed unchanged to every model's ``predict``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the individual predictions.

    Raises
    ------
    ValueError
        If ``models`` is empty (the mean would otherwise silently be NaN).
    """
    from collections.abc import Mapping

    members = list(models.values()) if isinstance(models, Mapping) else list(models)
    if not members:
        raise ValueError("voting_ensemble requires at least one fitted model")

    # One row per model, one column per sample.
    predictions = np.stack([np.asarray(model.predict(X)) for model in members], axis=0)
    # For regression, the ensemble output is the mean across models.
    return np.mean(predictions, axis=0)

# Use the voting ensemble to make predictions on the training features
# (in-sample predictions; the result is not used by the other visible cells).
ensemble_predictions = voting_ensemble(best_models, X_train)

In [72]:
# Test-set feature matrix: remove the ID, the 'Unnamed: 0' column and the raw
# (non-encoded) categorical columns before predicting.
test_columns_to_drop = [
    'ID', 'Unnamed: 0',
    '요일', '기상상태', '노면상태', '사고유형',
    '시', '구', '동',
    '도로형태1', '도로형태2',
    'holiday', '시간구분', '계절',
]
quest = test_df.drop(columns=test_columns_to_drop)

# Averaged prediction of the per-learner best models.
answer = voting_ensemble(best_models, quest)

# Submission table: test IDs paired with the ensemble's ECLO predictions.
submission_df = pd.DataFrame({'ID': test_df['ID'], 'ECLO': answer})

submission_file_path = path + '/데이터들/전처리 전/submission_flaml.csv'
submission_df.to_csv(submission_file_path, index=False)

In [67]:
# Submission from the single all-learner FLAML run: test IDs paired with the
# predictions stored in `flaml_predictions`.
submission_file_path = path + '/데이터들/전처리 전/submission_one_flaml.csv'

submission_columns = {
    'ID': test_df['ID'],
    'ECLO': flaml_predictions,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv(submission_file_path, index=False)