In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import requests
import zipfile
from io import BytesIO

# Load and preprocess the datasets
def load_and_preprocess_parkinsons():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data'
    df = pd.read_csv(url)
    df = df.drop(['subject#'], axis=1)
    X = df.drop(['motor_UPDRS', 'total_UPDRS'], axis=1).values
    y = df['total_UPDRS'].values
    return X, y

def load_and_preprocess_energy():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx'
    df = pd.read_excel(url)
    X = df.drop(['Y1', 'Y2'], axis=1).values
    y = df['Y1'].values
    return X, y

# def load_and_preprocess_superconductivity():
#     url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00464/superconduct.zip'
#     df = pd.read_csv(url, compression='zip')
#     X = df.drop(['critical_temp'], axis=1).values
#     y = df['critical_temp'].values
#     return X, y

def load_and_preprocess_superconductivity():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00464/superconduct.zip'
    response = requests.get(url)
    zf = zipfile.ZipFile(BytesIO(response.content))
    df = pd.read_csv(zf.open('train.csv'))
    X = df.drop(['critical_temp'], axis=1).values
    y = df['critical_temp'].values
    return X, y

def load_and_preprocess_forest_fires():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
    df = pd.read_csv(url)
    df = pd.get_dummies(df)
    X = df.drop(['area'], axis=1).values
    y = df['area'].values
    return X, y

def load_and_preprocess_wine_quality():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    df = pd.read_csv(url, delimiter=';')
    X = df.drop(['quality'], axis=1).values
    y = df['quality'].values
    return X, y

X_parkinsons, y_parkinsons = load_and_preprocess_parkinsons()
X_energy, y_energy = load_and_preprocess_energy()
X_superconductivity, y_superconductivity = load_and_preprocess_superconductivity()
X_forest_fires, y_forest_fires = load_and_preprocess_forest_fires()
X_wine_quality, y_wine_quality = load_and_preprocess_wine_quality()

# Scale the datasets
scaler_parkinsons = StandardScaler().fit(X_parkinsons)
scaler_energy = StandardScaler().fit(X_energy)
scaler_superconductivity = StandardScaler().fit(X_superconductivity)
scaler_forest_fires = StandardScaler().fit(X_forest_fires)
scaler_wine_quality = StandardScaler().fit(X_wine_quality)

X_parkinsons_scaled = scaler_parkinsons.transform(X_parkinsons)
X_energy_scaled = scaler_energy.transform(X_energy)
X_superconductivity_scaled = scaler_superconductivity.transform(X_superconductivity)
X_forest_fires_scaled = scaler_forest_fires.transform(X_forest_fires)
X_wine_quality_scaled = scaler_wine_quality.transform(X_wine_quality)

In [2]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


def run_regression_models(X, y):
    X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X, y, test_size=0.2, random_state=42)
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    regression_models = [
        ('Lasso', Lasso(alpha=0.1)),
        ('RandomForest', RandomForestRegressor(n_estimators=100, random_state=42)),
        ('XGBoost', XGBRegressor(n_estimators=100, random_state=42)),
        ('SVR', SVR(kernel='linear')),
        ('k-NN', KNeighborsRegressor(n_neighbors=5)),
        ('AdaBoost', AdaBoostRegressor(n_estimators=100, random_state=42)),
        ('ExtraTrees', ExtraTreesRegressor(n_estimators=100, random_state=42))
    ]

    results = {}
    for name, model in regression_models:
        mse_list = []
        r2_list = []
        for train_index, val_index in kf.split(X_train_all):
            X_train, X_val = X_train_all[train_index], X_train_all[val_index]
            y_train, y_val = y_train_all[train_index], y_train_all[val_index]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            mse = mean_squared_error(y_val, y_pred)
            mse_list.append(mse)
            r2 = r2_score(y_val, y_pred)
            r2_list.append(r2)

        #model.fit(X_train_all, y_train_all)
        y_pred_test = model.predict(X_test_all)
        test_error = mean_squared_error(y_test_all, y_pred_test)
        test_r2 = r2_score(y_test_all, y_pred_test)

        results[name] = {
            'training_mse': np.mean(mse_list),
            'training_r2': np.mean(r2_list),
            'test_mse': test_error,
            'test_r2': test_r2
        }

    return results

datasets = {
    'Parkinsons': (X_parkinsons_scaled, y_parkinsons),
    'Energy Efficiency': (X_energy_scaled, y_energy),
    'Superconductivity': (X_superconductivity_scaled, y_superconductivity),
    'Forest Fires': (X_forest_fires_scaled, y_forest_fires),
    'Wine Quality': (X_wine_quality_scaled, y_wine_quality)
}

for dataset_name, (X, y) in datasets.items():
    print(f"\n{dataset_name}:")
    results = run_regression_models(X, y)
    for model_name, metrics in results.items():
        print(f"{model_name}: Training MSE: {metrics['training_mse']:.4f}, Test MSE: {metrics['test_mse']:.4f}, Test R2: {metrics['test_r2']:.4f}")



Parkinsons:
Lasso: Training MSE: 95.9297, Test MSE: 93.2147, Test R2: 0.1588
RandomForest: Training MSE: 3.5105, Test MSE: 3.0004, Test R2: 0.9729
XGBoost: Training MSE: 4.5990, Test MSE: 4.4886, Test R2: 0.9595
SVR: Training MSE: 98.4611, Test MSE: 96.6101, Test R2: 0.1282
k-NN: Training MSE: 43.0568, Test MSE: 39.9974, Test R2: 0.6391
AdaBoost: Training MSE: 69.4774, Test MSE: 70.4075, Test R2: 0.3646
ExtraTrees: Training MSE: 2.8546, Test MSE: 2.4723, Test R2: 0.9777

Energy Efficiency:
Lasso: Training MSE: 8.9874, Test MSE: 9.8086, Test R2: 0.9059
RandomForest: Training MSE: 0.2217, Test MSE: 0.2264, Test R2: 0.9978
XGBoost: Training MSE: 0.1064, Test MSE: 0.1210, Test R2: 0.9988
SVR: Training MSE: 8.9780, Test MSE: 9.8531, Test R2: 0.9055
k-NN: Training MSE: 5.7151, Test MSE: 5.0299, Test R2: 0.9517
AdaBoost: Training MSE: 3.6595, Test MSE: 4.4775, Test R2: 0.9570
ExtraTrees: Training MSE: 0.2225, Test MSE: 0.2153, Test R2: 0.9979

Superconductivity:
Lasso: Training MSE: 342.9528