In [None]:
# mlflow.create_experiment("Boston-HousePrice-predict")

In [2]:
import json
import importlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime

def load_config(config_path):
    """Load the experiment configuration from a JSON file.

    Args:
        config_path: Path to the JSON configuration file.

    Returns:
        The parsed JSON document. The rest of this file indexes it as a
        dict with ``'models'`` and ``'params'`` keys.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification. Being explicit avoids depending on the
    # platform's locale encoding (e.g. cp932 / cp1252 on Windows), which
    # would break on any non-ASCII text in the config.
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def get_model(model_path):
    """Create an instance of the class named by a dotted import path.

    Args:
        model_path: Fully qualified name such as ``"package.module.Class"``.
            The text after the last dot is the attribute to look up; the
            text before it is the module to import.

    Returns:
        A new object built by calling the resolved attribute with no
        arguments.

    Raises:
        ValueError: If ``model_path`` contains no dot.
        ImportError: If the module part cannot be imported.
        AttributeError: If the module has no attribute with that name.
    """
    # Split on the final dot only, so nested packages stay in the module part.
    pieces = model_path.rsplit('.', 1)
    module_name, class_name = pieces
    factory = getattr(importlib.import_module(module_name), class_name)
    return factory()

def train_models(X, y, config, cv=10, scoring='neg_mean_squared_error'):
    """Grid-search every configured model and log each winner to MLflow.

    For each entry in ``config['models']`` the model is instantiated via
    ``get_model``, tuned with ``GridSearchCV`` over the grid found under the
    same name in ``config['params']``, and the best estimator is logged in
    its own MLflow run named ``"<name>_grid_best_estimator"``.

    Args:
        X: Training features, passed to ``GridSearchCV.fit``.
        y: Training targets, passed to ``GridSearchCV.fit``.
        config: Mapping with a ``'models'`` entry (name -> dotted class path)
            and a ``'params'`` entry (name -> parameter grid).
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        scoring: Scoring setting forwarded to ``GridSearchCV``.

    Returns:
        Dict mapping each model name to its ``best_estimator_``.

    Raises:
        KeyError: If ``config`` lacks ``'models'`` / ``'params'``, or if a
            model has no parameter grid under ``config['params']``.
    """
    model_paths = config['models']
    params = config['params']

    # Fail fast: a missing grid would otherwise only surface as a KeyError
    # in the middle of the loop, after earlier models have already been
    # through a full (and potentially slow) grid search.
    missing = [name for name in model_paths if name not in params]
    if missing:
        raise KeyError(
            "No parameter grid in config['params'] for model(s): "
            + ", ".join(map(str, missing))
        )

    models = {name: get_model(path) for name, path in model_paths.items()}
    best_estimators = {}
    for name, model in models.items():
        grid = GridSearchCV(model, params[name], cv=cv, scoring=scoring)
        grid.fit(X, y)
        best = grid.best_estimator_
        best_estimators[name] = best

        # Log the best parameters, the fitted model and its metrics.
        with mlflow.start_run(run_name=f"{name}_grid_best_estimator"):
            mlflow.log_params(grid.best_params_)
            mlflow.sklearn.log_model(best, artifact_path=f"{name}_model")
            # NOTE: these predictions are made on X, the same data the grid
            # search was fitted on, so 'MSE' / 'R2' in this run are
            # training-set metrics, not held-out performance.
            y_pred_train = best.predict(X)
            mlflow.log_metric('MSE', mean_squared_error(y, y_pred_train))
            mlflow.log_metric('R2', r2_score(y, y_pred_train))
    return best_estimators

def evaluate_models(models, X_test, y_test,
                    experiment_name="Boston-HousePrice-predict"):
    """Score fitted models on held-out data and log the results to MLflow.

    Each model gets its own run named ``"<name>_models"`` in the experiment
    ``experiment_name``; the run records MSE, R2, the model artifact and the
    model's parameters.

    Args:
        models: Dict mapping a model name to a fitted estimator exposing
            ``predict`` and ``get_params``.
        X_test: Features to predict on.
        y_test: True targets to compare the predictions against.
        experiment_name: Name of the MLflow experiment to log into.

    Returns:
        Dict mapping each model name to ``{'MSE': ..., 'R2': ...}``.
    """
    results = {}
    # Select the experiment by name. The runs below deliberately do NOT pass
    # a hard-coded experiment_id: such an ID is specific to one tracking
    # store and would override (or fail against) the experiment chosen here.
    mlflow.set_experiment(experiment_name)
    for name, model in models.items():
        # nested=True lets this run start even when a parent run is active.
        with mlflow.start_run(run_name=f"{name}_models", nested=True):
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results[name] = {'MSE': mse, 'R2': r2}
            mlflow.log_metric('MSE', mse)
            mlflow.log_metric('R2', r2)
            mlflow.sklearn.log_model(model, artifact_path=f"{name}_model")
            # Log model parameters
            mlflow.log_params(model.get_params())
    return results

def main():
    """Run the full pipeline: load data, tune models, evaluate, report.

    Reads ``./config.json`` and ``processed_data.csv`` from the current
    working directory, holds out 20% of the rows as a test set, grid-searches
    the configured models on the remainder, evaluates the winners on the
    held-out rows and prints each model's metrics.
    """
    target_column = '取引価格（総額）'

    # Select the experiment before any training happens so that the training
    # runs and the evaluation runs are logged to the same experiment.
    mlflow.set_experiment("Boston-HousePrice-predict")
    mlflow.sklearn.autolog()

    # Load the config file
    config = load_config('./config.json')
    data = pd.read_csv('processed_data.csv')
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    # Fixed random_state keeps the train/test split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # NOTE: no feature scaling is applied; models see the raw features.

    models = train_models(X_train, y_train, config)
    # Despite the label, this prints the fitted best estimators per model.
    print("the best parameter:", models)
    results = evaluate_models(models, X_test, y_test)
    for model_name, metrics in results.items():
        # Print whatever metrics were returned, in order, rather than
        # hard-coding metric names that may not be present.
        summary = ", ".join(f"{metric} = {value}" for metric, value in metrics.items())
        print(f"{model_name}: {summary}")


if __name__ == "__main__":
    main()


2024/06/30 21:19:38 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '145be07ba38d476e996e9f6d95610fb8', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/06/30 21:19:43 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2024/06/30 21:19:44 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '90c02d2674e04d159622374e9dec7dca', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/06/30 21:19:52 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2024/06/30 21:19:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5572986d4fa4481eb9cf305ae5e0b433', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


KeyboardInterrupt: 