In [None]:
# mlflow.create_experiment("Boston-HousePrice-predict")

In [14]:
import json
import importlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime


def load_config(config_path):
    """Read a JSON configuration file and return its parsed contents.

    Args:
        config_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; decoding explicitly avoids depending on the platform
    # default encoding, which may mangle or reject non-ASCII keys and values.
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def get_model(model_path, **init_kwargs):
    """Instantiate a class given its fully qualified dotted path.

    Args:
        model_path: String of the form ``"package.module.ClassName"``. The
            part before the last dot is imported as a module and the part
            after it is looked up as an attribute of that module.
        **init_kwargs: Optional keyword arguments forwarded to the class
            constructor. With none given the class is called without
            arguments, as before.

    Returns:
        A new instance of the referenced class.

    Raises:
        ValueError: If ``model_path`` has no module part (no dot, or nothing
            before the last dot).
        ImportError: If the module cannot be imported.
        AttributeError: If the module does not define the requested name.
    """
    module_name, dot, class_name = model_path.rpartition('.')
    if not dot or not module_name:
        # Previously surfaced as an opaque unpacking / "Empty module name"
        # ValueError; keep the exception type but say what is wrong.
        raise ValueError(
            f"model path must look like 'package.module.ClassName', got {model_path!r}"
        )
    module = importlib.import_module(module_name)
    model_class = getattr(module, class_name)
    return model_class(**init_kwargs)

def train_models(X, y, config, cv=10, scoring='neg_mean_squared_error'):
    """Grid-search every configured model and log each winner to MLflow.

    For each entry in ``config['models']`` a ``GridSearchCV`` is fitted on
    ``X``/``y`` using the grid found under the same name in
    ``config['params']``. The best estimator is then logged in its own MLflow
    run named ``"<name>_grid_best_estimator"`` together with its best
    parameters and two metrics.

    Note:
        The ``MSE`` and ``R2`` metrics logged here are computed on the same
        ``X``/``y`` that were used for fitting, i.e. they are training-set
        scores, not held-out scores.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.
        config: Mapping with a ``'models'`` entry (name -> dotted class path)
            and a ``'params'`` entry (name -> parameter grid).
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (default 10, the previously hard-coded value).
        scoring: Scoring setting forwarded to ``GridSearchCV`` (default
            ``'neg_mean_squared_error'``, the previously hard-coded value).

    Returns:
        Dict mapping each model name to its ``best_estimator_``.

    Raises:
        KeyError: If ``config`` lacks ``'models'``/``'params'`` or if any
            model has no parameter grid.
    """
    model_paths = config['models']
    params = config['params']

    # Fail before any (potentially long) grid search starts rather than
    # part-way through the loop when a grid turns out to be missing.
    missing = [name for name in model_paths if name not in params]
    if missing:
        raise KeyError(f"config['params'] has no parameter grid for model(s): {missing}")

    models = {name: get_model(path) for name, path in model_paths.items()}
    best_estimators = {}
    for name, model in models.items():
        grid = GridSearchCV(model, params[name], cv=cv, scoring=scoring)
        grid.fit(X, y)
        best_estimator = grid.best_estimator_
        best_estimators[name] = best_estimator
        # Record the winning configuration and its training-set fit quality.
        with mlflow.start_run(run_name=f"{name}_grid_best_estimator"):
            mlflow.log_params(grid.best_params_)
            mlflow.sklearn.log_model(best_estimator, artifact_path=f"{name}_model")
            y_pred_train = best_estimator.predict(X)
            mse = mean_squared_error(y, y_pred_train)
            r2 = r2_score(y, y_pred_train)
            mlflow.log_metric('MSE', mse)
            mlflow.log_metric('R2', r2)
    return best_estimators

def evaluate_models(models, X_test, y_test, experiment_name="Boston-HousePrice-predict"):
    """Score fitted models on held-out data and log the results to MLflow.

    Each model gets its own run named ``"<name>_models"`` in the experiment
    ``experiment_name``; the run records the ``MSE`` and ``R2`` metrics, the
    model artifact and the model's ``get_params()``.

    Args:
        models: Dict mapping a model name to a fitted estimator exposing
            ``predict`` and ``get_params``.
        X_test: Features to predict on.
        y_test: True target values matching ``X_test``.
        experiment_name: Name of the MLflow experiment to log into
            (defaults to the previously hard-coded name).

    Returns:
        Dict mapping each model name to ``{'MSE': ..., 'R2': ...}``.
    """
    results = {}
    # Selecting the experiment by name makes it the active experiment, so the
    # runs below land in it without a store-specific, hard-coded experiment
    # ID that breaks (or silently targets another experiment) elsewhere.
    mlflow.set_experiment(experiment_name)
    for name, model in models.items():
        # nested=True is kept so the run attaches to a parent run if the
        # caller has one active.
        with mlflow.start_run(run_name=f"{name}_models", nested=True):
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results[name] = {'MSE': mse, 'R2': r2}
            mlflow.log_metric('MSE', mse)
            mlflow.log_metric('R2', r2)
            mlflow.sklearn.log_model(model, artifact_path=f"{name}_model")
            # Log model parameters
            mlflow.log_params(model.get_params())
    return results

def main(config_path='./config.json', data_path='exported_data5.csv',
         target_column='取引価格（総額）'):
    """Run the full workflow: load data, grid-search models, evaluate them.

    Args:
        config_path: JSON config consumed by ``load_config`` / ``train_models``
            (defaults to the previously hard-coded path).
        data_path: CSV file holding the features and the target column
            (defaults to the previously hard-coded file).
        target_column: Name of the column to predict; every other column is
            used as a feature.
    """
    # Load the config file
    mlflow.sklearn.autolog()
    config = load_config(config_path)
    data = pd.read_csv(data_path)
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Optional feature scaling (currently disabled):
    # scaler = StandardScaler()
    # X_train_scaled = scaler.fit_transform(X_train)
    # X_test_scaled = scaler.transform(X_test)

    models = train_models(X_train, y_train, config)
    print("the best parameter:", models)
    results = evaluate_models(models, X_test, y_test)
    for model, metrics in results.items():
        # Print whatever metrics were produced, in insertion order. For the
        # MSE/R2 pair returned by evaluate_models this yields the same line
        # as before, without a branch that reads keys nobody produces.
        summary = ", ".join(f"{metric} = {value}" for metric, value in metrics.items())
        print(f"{model}: {summary}")

if __name__ == "__main__":
    main()


2024/07/04 14:59:27 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2024/07/04 14:59:29 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c1d89ab4a3f14c1a86fbb48cc0de2b6a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [4]:
import json
import importlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
# Importing ML libraries and metrics
from math import sqrt
from tqdm import tqdm
import lightgbm as lgb
import xgboost as xgb
from lightgbm import early_stopping, log_evaluation
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

# Turn on MLflow autologging before anything is fitted.
mlflow.sklearn.autolog()
# config = load_config('./config.json')

# Read the dataset, then separate the target column from the features.
data = pd.read_csv('exported_data7.csv')
y = data['取引価格（総額）']
X = data.drop(columns='取引価格（総額）')

# Fixed 80/20 split; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Hyper-parameters handed to the XGBoost regressor.
params = dict(
    # tree_method='gpu_hist',      # GPU-optimized tree method
    # predictor='gpu_predictor',   # Use GPU for prediction
    objective='reg:squarederror',  # squared-error regression
    n_estimators=183,              # number of boosted trees
    learning_rate=0.05,            # shrinkage per boosting step
    max_depth=6,                   # maximum tree depth
    subsample=0.8,                 # row sampling ratio per tree
    colsample_bytree=0.8,          # column sampling ratio per tree
    reg_alpha=0.1,                 # L1 regularization
    reg_lambda=0.1,                # L2 regularization
    verbosity=1,                   # 0 silences XGBoost messages
)

# Fit the regressor; the held-out split is passed as eval_set so the
# per-iteration RMSE is printed while training.
xgb_model = XGBRegressor(**params)
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    # early_stopping_rounds=10,
    verbose=True,
)

# Predictions for the held-out split.
xgb_model_pred = xgb_model.predict(X_test)

# # LightGBM model parameters with GPU and other adjustments
# params = {
#  'boosting_type': 'gbdt', # Boosting type
#  'objective': 'regression', # Task type: regression
#  'metric': 'rmse', # Evaluation metric: Root Mean Squared Error
# #  'device': 'gpu', # Use GPU
# #  'gpu_platform_id': 0, # GPU platform ID (adjust if necessary)
# #  'gpu_device_id': 0, # GPU device ID (adjust if necessary)
#  'num_leaves': 31, # Number of leaves in the tree
#  'learning_rate': 0.05, # Learning rate
#  'n_estimators': 1000, # Number of trees (estimators)
#  'max_depth': -1, # Maximum depth of trees (-1 for no limit)
#  'min_child_samples': 20, # Minimum number of samples in a child node
#  'subsample': 0.8, # Ratio of samples for subsampling
#  'colsample_bytree': 0.8, # Proportion of columns for subsampling
#  'reg_alpha': 0.1, # L1 Regularization
#  'reg_lambda': 0.1, # L2 Regularization
#  'verbose': -1 # Verbosity level (-1 for mute)
# }

# # Initialize and train the LightGBM model
# lgbm_model = lgb.LGBMRegressor(**params)
# lgbm_model.fit(X_train, y_train,
#                eval_set=[(X_test, y_test)],  
#                callbacks=[log_evaluation(10)])

# # Make predictions with the trained model
# lgbm_predictions = lgbm_model.predict(X_test)

2024/07/04 16:40:26 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9a237b21efdd403f8b695246b292f343', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


[0]	validation_0-rmse:19885798.42092
[1]	validation_0-rmse:19497840.43567
[2]	validation_0-rmse:19139777.51090
[3]	validation_0-rmse:18943190.17063
[4]	validation_0-rmse:18617892.34151
[5]	validation_0-rmse:18394969.08109
[6]	validation_0-rmse:18113608.56033
[7]	validation_0-rmse:17923102.02481
[8]	validation_0-rmse:17670914.87009
[9]	validation_0-rmse:17504546.03576
[10]	validation_0-rmse:17353397.93086
[11]	validation_0-rmse:17136339.65044
[12]	validation_0-rmse:16944896.57386
[13]	validation_0-rmse:16777890.83609
[14]	validation_0-rmse:16659774.89523
[15]	validation_0-rmse:16615832.46361
[16]	validation_0-rmse:16446543.97866
[17]	validation_0-rmse:16288683.88707
[18]	validation_0-rmse:16146874.75745
[19]	validation_0-rmse:16022432.27873
[20]	validation_0-rmse:15928251.31021
[21]	validation_0-rmse:15805492.63565
[22]	validation_0-rmse:15700119.17275
[23]	validation_0-rmse:15605457.47156
[24]	validation_0-rmse:15514732.22690
[25]	validation_0-rmse:15431327.90903
[26]	validation_0-rmse

In [5]:
import json
import importlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
# Importing ML libraries and metrics
from math import sqrt
from tqdm import tqdm
import lightgbm as lgb
import xgboost as xgb
from lightgbm import early_stopping, log_evaluation
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

# Turn on MLflow autologging before anything is fitted.
mlflow.sklearn.autolog()

# Read the dataset, then separate the target column from the features.
data = pd.read_csv('exported_data7.csv')
y = data['取引価格（総額）']
X = data.drop(columns='取引価格（総額）')

# Fixed 80/20 split; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Hyper-parameters handed to the XGBoost regressor.
params = dict(
    # tree_method='gpu_hist',      # GPU-optimized tree method
    # predictor='gpu_predictor',   # Use GPU for prediction
    objective='reg:squarederror',  # squared-error regression
    n_estimators=183,              # number of boosted trees
    learning_rate=0.05,            # shrinkage per boosting step
    max_depth=6,                   # maximum tree depth
    subsample=0.8,                 # row sampling ratio per tree
    colsample_bytree=0.8,          # column sampling ratio per tree
    reg_alpha=0.1,                 # L1 regularization
    reg_lambda=0.1,                # L2 regularization
    verbosity=1,                   # 0 silences XGBoost messages
)

# Everything logged below is attached to a single run named "xgboost1".
with mlflow.start_run(run_name="xgboost1"):
    # Fit the regressor; the held-out split is passed as eval_set so the
    # per-iteration RMSE is printed while training.
    xgb_model = XGBRegressor(**params)
    xgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        # early_stopping_rounds=10,
        verbose=True,
    )

    # Predictions for the held-out split.
    xgb_model_pred = xgb_model.predict(X_test)

    # Held-out error measures (RMSE is derived from MSE).
    mae = mean_absolute_error(y_test, xgb_model_pred)
    mse = mean_squared_error(y_test, xgb_model_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_test, xgb_model_pred)

    # Record the four metrics on the run in one call.
    mlflow.log_metrics({"mse": mse, "rmse": rmse, "r2": r2, "mae": mae})

    # Store the fitted model under the "model" artifact path.
    mlflow.sklearn.log_model(xgb_model, "model")

    # Record the hyper-parameters used for this fit.
    mlflow.log_params(params)

# Echo the held-out scores in the cell output.
print(f"Test MSE: {mse}")
print(f"Test RMSE: {rmse}")
print(f"Test R²: {r2}")
print(f"Test MAE: {mae}")

[0]	validation_0-rmse:19885798.42092
[1]	validation_0-rmse:19497840.43567
[2]	validation_0-rmse:19139777.51090
[3]	validation_0-rmse:18943190.17063
[4]	validation_0-rmse:18617892.34151
[5]	validation_0-rmse:18394969.08109
[6]	validation_0-rmse:18113608.56033
[7]	validation_0-rmse:17923102.02481
[8]	validation_0-rmse:17670914.87009
[9]	validation_0-rmse:17504546.03576
[10]	validation_0-rmse:17353397.93086
[11]	validation_0-rmse:17136339.65044
[12]	validation_0-rmse:16944896.57386
[13]	validation_0-rmse:16777890.83609
[14]	validation_0-rmse:16659774.89523
[15]	validation_0-rmse:16615832.46361
[16]	validation_0-rmse:16446543.97866
[17]	validation_0-rmse:16288683.88707
[18]	validation_0-rmse:16146874.75745
[19]	validation_0-rmse:16022432.27873
[20]	validation_0-rmse:15928251.31021
[21]	validation_0-rmse:15805492.63565
[22]	validation_0-rmse:15700119.17275
[23]	validation_0-rmse:15605457.47156
[24]	validation_0-rmse:15514732.22690
[25]	validation_0-rmse:15431327.90903
[26]	validation_0-rmse