In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso # type: ignore
from sklearn.svm import SVR # type: ignore
from sklearn.ensemble import RandomForestRegressor # type: ignore
from sklearn.metrics import mean_squared_error, r2_score # type: ignore

In [None]:
# Optional one-time setup (kept commented out): creates an MLflow experiment with this name.
# mlflow.create_experiment("Boston-HousePrice-predict")

In [None]:
def load_data(filepath):
    """Read a UTF-8 CSV file and tidy its column names.

    Args:
        filepath: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        A DataFrame whose column labels have leading/trailing whitespace
        removed.
    """
    frame = pd.read_csv(filepath, encoding='utf-8')
    # Headers may carry stray spaces (e.g. " medv "); normalise them so
    # later column lookups by name succeed.
    return frame.rename(columns=lambda label: label.strip())

def clean_data(data, iqr_factor=1.5):
    """Fill missing values with column medians and drop IQR outliers.

    The input frame is left untouched; a new frame is returned.

    Args:
        data: DataFrame to clean.
        iqr_factor: Multiplier applied to the inter-quartile range when
            computing the outlier fences (default 1.5).

    Returns:
        A new DataFrame in which numeric NaNs are replaced by the column
        median and every row lying outside
        ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]`` for any numeric
        column has been removed.

    Notes:
        * Columns are filtered one after another, so the quartiles of a
          later column are computed on rows that survived earlier columns.
        * A column whose IQR is 0 keeps only the rows equal to that
          quartile value.
        * Non-numeric columns are neither filled nor filtered.
    """
    # fillna without inplace returns a new frame, so the caller's data is
    # not modified. numeric_only avoids a TypeError on text columns.
    cleaned = data.fillna(data.median(numeric_only=True))

    numeric_columns = cleaned.select_dtypes(include='number').columns
    for col in numeric_columns:
        q1 = cleaned[col].quantile(0.25)
        q3 = cleaned[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - iqr_factor * iqr
        upper = q3 + iqr_factor * iqr
        # between() is inclusive on both ends, matching `>= lower & <= upper`.
        cleaned = cleaned[cleaned[col].between(lower, upper)]
    return cleaned

def train_models(X, y, random_state=42):
    """Grid-search several regressors and return the best estimator of each.

    Each candidate model is tuned with 10-fold cross-validation, scored by
    negative mean squared error, and refit by ``GridSearchCV``.

    Args:
        X: Training feature matrix.
        y: Training target values.
        random_state: Seed passed to the random forest so repeated runs
            produce the same model (default 42).

    Returns:
        dict mapping model name to the ``best_estimator_`` found by the
        grid search for that model.
    """
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(),
        'Lasso': Lasso(),
        'SVR': SVR(),
        # Seeded: an unseeded forest yields different metrics on every run.
        'RandomForest': RandomForestRegressor(random_state=random_state),
    }
    params = {
        # Only fit_intercept changes the fitted model; copy_X and n_jobs
        # affect memory/parallelism only, so searching them just multiplies
        # the number of identical fits.
        'LinearRegression': {'fit_intercept': [True, False]},
        'Ridge': {'alpha': [0.1, 1.0, 10.0]},
        'Lasso': {'alpha': [0.1, 0.5, 1.0]},
        'SVR': {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'poly', 'rbf']},
        'RandomForest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    }
    best_estimators = {}
    for name, model in models.items():
        grid = GridSearchCV(model, params[name], cv=10, scoring='neg_mean_squared_error')
        grid.fit(X, y)
        best_estimators[name] = grid.best_estimator_
    return best_estimators

def evaluate_models(models, X_test, y_test, experiment_id="593623705617405727"):
    """Score fitted models on held-out data and log each one to MLflow.

    One MLflow run named ``"<name>_evaluation"`` is started per model; the
    model's hyper-parameters, its MSE and R2, and the model artifact are
    logged inside that run.

    Args:
        models: dict mapping a model name to a fitted estimator exposing
            ``predict`` and ``get_params``.
        X_test: Feature matrix to predict on.
        y_test: True target values for ``X_test``.
        experiment_id: ID of the MLflow experiment that receives the runs.
            The default is the ID originally hard-coded here; it presumably
            exists only in the author's tracking store, so pass your own
            when running elsewhere (TODO confirm).

    Returns:
        dict mapping model name to ``{'MSE': float, 'R2': float}``.
    """
    results = {}
    for name, model in models.items():
        with mlflow.start_run(run_name=f"{name}_evaluation",
                              experiment_id=experiment_id,
                              nested=True):
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results[name] = {'MSE': mse, 'R2': r2}

            # log_params needs a name -> value mapping; handing it the
            # estimator object itself raises, so log its parameters dict.
            mlflow.log_params(model.get_params())
            mlflow.log_metric('MSE', mse)
            mlflow.log_metric('R2', r2)
            mlflow.sklearn.log_model(model, artifact_path="model")
    return results

In [None]:
def main(filepath='Boston Housing.csv', target='medv', test_size=0.2, random_state=42):
    """Run the full pipeline: load, clean, split, scale, train, evaluate.

    Args:
        filepath: CSV file to load (default ``'Boston Housing.csv'``).
        target: Name of the column to predict (default ``'medv'``).
        test_size: Fraction of rows held out for evaluation (default 0.2).
        random_state: Seed for the train/test split (default 42).

    Returns:
        dict mapping model name to its ``{'MSE': ..., 'R2': ...}`` metrics,
        as produced by ``evaluate_models``. The same metrics are printed.
    """
    data = load_data(filepath)
    data_cleaned = clean_data(data)

    X = data_cleaned.drop(columns=target)
    y = data_cleaned[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training split only, then reuse it for the test
    # split so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    models = train_models(X_train_scaled, y_train)
    print("the best parameter:", models)

    results = evaluate_models(models, X_test_scaled, y_test)
    for model_name, metrics in results.items():
        print(f"{model_name}: MSE = {metrics['MSE']}, R2 = {metrics['R2']}")
    return results

# Entry point: run the whole pipeline when this code executes as the
# top-level module (`__name__` is then "__main__").
if __name__ == "__main__":
    main()