In [1]:
pip install flaml[synapse]==1.2.1 xgboost==1.6.1 pandas==1.5.1 numpy==1.23.4 pyspark==3.5.1 --force-reinstall

Collecting flaml==1.2.1 (from flaml[synapse]==1.2.1)
  Using cached FLAML-1.2.1-py3-none-any.whl.metadata (12 kB)
Collecting xgboost==1.6.1
  Using cached xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting pandas==1.5.1
  Using cached pandas-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy==1.23.4
  Using cached numpy-1.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting pyspark==3.5.1
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 6.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting lightgbm>=2.3.1 (from flaml==1.2.1->flaml[synapse]==1.2.1)
  Using cached lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Collecting scipy>=1.4.1 (from flaml==1.2.1->flaml[synapse]==1.2.1)
  Using cached scipy-1.15.3-cp311-cp311-manylinu

In [2]:
import numpy as np
import pandas as pd

# Synthetic patient cohort. Every draw goes through NumPy's global RNG, so the
# sampling calls below must keep this order to reproduce the same values.
np.random.seed(42)
n_rows = 10000


def _draw_labels(options):
    """Pick one entry of `options` at random for each of the n_rows patients."""
    return np.random.choice(options, n_rows)


def _draw_rounded(low, high):
    """Draw n_rows uniform values between `low` and `high`, rounded to one decimal."""
    return np.round(np.random.uniform(low, high, n_rows), 1)


# The dict literal is evaluated top to bottom, which fixes the RNG draw order.
data = {
    "PatientID": np.arange(1, n_rows + 1),
    "Age": np.random.randint(20, 80, n_rows),
    "Gender": _draw_labels(["M", "F"]),
    "BMI": _draw_rounded(18.5, 40.0),
    "SmokingStatus": _draw_labels(["Current", "Former", "Never"]),
    "BloodPressure": _draw_labels(["120/80", "130/85", "140/90", "150/95", "125/82"]),
    "Cholesterol": _draw_labels(["Normal", "Borderline", "High"]),
    "FamilyHistory": _draw_labels(["Yes", "No"]),
    "PhysicalActivity": _draw_labels(["Low", "Moderate", "High"]),
    "AlcoholIntake": _draw_labels(["Low", "Moderate", "High"]),
    "DietQuality": _draw_labels(["Poor", "Moderate", "Good", "Excellent"]),
    "StressLevel": _draw_labels(["Low", "Medium", "High"]),
    "GlucoseLevel": _draw_rounded(70, 180),
    "HbA1c": _draw_rounded(4.5, 8.0),
}

# Numeric code assigned to each categorical label, keyed by column name.
mappings = {
    "Gender": {"M": 0, "F": 1},
    "SmokingStatus": {"Current": 2, "Former": 1, "Never": 0},
    "BloodPressure": {"120/80": 1, "130/85": 2, "140/90": 3, "150/95": 4, "125/82": 1.5},
    "Cholesterol": {"Normal": 1, "Borderline": 2, "High": 3},
    "FamilyHistory": {"Yes": 1, "No": 0},
    "PhysicalActivity": {"Low": 1, "Moderate": 2, "High": 3},
    "AlcoholIntake": {"Low": 0, "Moderate": 1, "High": 2},
    "DietQuality": {"Poor": 1, "Moderate": 2, "Good": 3, "Excellent": 4},
    "StressLevel": {"Low": 1, "Medium": 2, "High": 3},
}

# Replace every categorical column with its numeric codes; column order is kept.
df = pd.DataFrame(data)
df = df.assign(**{name: df[name].map(codes) for name, codes in mappings.items()})

# Short aliases for the encoded columns that feed the two risk formulas.
age, bmi, glucose = df["Age"], df["BMI"], df["GlucoseLevel"]
smoking, pressure = df["SmokingStatus"], df["BloodPressure"]
cholesterol, diet = df["Cholesterol"], df["DietQuality"]
family, activity = df["FamilyHistory"], df["PhysicalActivity"]
stress, alcohol = df["StressLevel"], df["AlcoholIntake"]

# Diabetes score: weighted feature terms plus Gaussian noise (sigma = 2).
# The terms are summed strictly left to right so the float results are unchanged.
diabetes_noise = np.random.normal(0, 2, n_rows)
diabetes_raw = (
    0.1 * age
    + 0.2 * bmi**1.5
    + 0.25 * np.log1p(glucose)
    + 0.15 * smoking
    + 0.3 * pressure
    + 0.25 * cholesterol * diet
    + 0.2 * family
    + 0.1 * activity
    + diabetes_noise
)

# Heart-disease score: weighted feature terms plus Gaussian noise (sigma = 3).
heart_noise = np.random.normal(0, 3, n_rows)
heart_raw = (
    0.2 * np.sqrt(age)
    + 0.15 * bmi
    + 0.3 * pressure
    + 0.25 * cholesterol**2
    + 0.2 * stress
    + 0.15 * activity * diet
    + 0.25 * alcohol
    + heart_noise
)

# Truncate to integers, then bound both targets to the 0-100 range.
df["DiabetesRiskScore"] = diabetes_raw.astype(int).clip(0, 100)
df["HeartDiseaseRiskScore"] = heart_raw.astype(int).clip(0, 100)

df.head()


Unnamed: 0,PatientID,Age,Gender,BMI,SmokingStatus,BloodPressure,Cholesterol,FamilyHistory,PhysicalActivity,AlcoholIntake,DietQuality,StressLevel,GlucoseLevel,HbA1c,DiabetesRiskScore,HeartDiseaseRiskScore
0,1,58,0,27.5,0,2.0,1,0,1,1,3,3,84.7,6.4,35,6
1,2,71,0,26.0,0,1.5,3,0,3,1,2,2,141.6,5.1,37,10
2,3,48,0,37.0,2,3.0,3,1,3,0,4,1,177.5,5.5,56,14
3,4,34,0,21.2,2,3.0,1,0,3,0,2,1,119.8,6.3,26,8
4,5,62,0,30.7,1,1.5,3,0,3,2,1,1,86.4,7.9,42,7


In [3]:
def _init_spark():
    """Build a Spark session named "MyApp" on a `local[2]` master via getOrCreate().

    The session is configured with the SynapseML, hadoop-azure and
    azure-storage Maven coordinates, the mmlspark Maven repository, and a
    raised `spark.sql.debug.maxToStringFields` limit.

    Returns:
        The SparkSession produced by the builder's `getOrCreate()` call.
    """
    from pyspark.sql import SparkSession

    # Comma-separated Maven coordinates handed to spark.jars.packages.
    jar_packages = (
        "com.microsoft.azure:synapseml_2.12:0.10.2,"
        "org.apache.hadoop:hadoop-azure:3.3.5,"
        "com.microsoft.azure:azure-storage:8.6.6"
    )
    session_conf = {
        "spark.jars.packages": jar_packages,
        "spark.jars.repositories": "https://mmlspark.azureedge.net/maven",
        "spark.sql.debug.maxToStringFields": "100",
    }

    builder = SparkSession.builder.appName("MyApp").master("local[2]")
    for option, value in session_conf.items():
        builder = builder.config(option, value)
    return builder.getOrCreate()


# Session handle used by the cells that follow.
spark = _init_spark()

In [4]:
# Hand the pandas frame `df` to the Spark session to obtain a Spark DataFrame.
spark_df = spark.createDataFrame(df)

In [5]:
# Random split with weights 0.8 (train) / 0.2 (test) and a fixed seed of 41.
train_raw, test_raw = spark_df.randomSplit([0.8, 0.2], seed=41)

In [6]:
from pyspark.ml.feature import VectorAssembler

# Everything except the row identifier and the two targets is a model input.
non_feature_cols = {"PatientID", "DiabetesRiskScore", "HeartDiseaseRiskScore"}
feature_cols = [name for name in df.columns if name not in non_feature_cols]

featurizer = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Add the assembled `features` column to each split, then keep one label per task.
train_assembled = featurizer.transform(train_raw)
test_assembled = featurizer.transform(test_raw)

train_diabetes = train_assembled.select("DiabetesRiskScore", "features")
test_diabetes = test_assembled.select("DiabetesRiskScore", "features")

train_heart = train_assembled.select("HeartDiseaseRiskScore", "features")
test_heart = test_assembled.select("HeartDiseaseRiskScore", "features")

In [7]:
# Shared search configuration; later cells unpack it into AutoML.fit(**settings).
settings = dict(
    task="regression",
    metric="rmse",
    time_budget=60,
    estimator_list=["lgbm_spark"],
)

In [8]:
# Fit & evaluate DiabetesRiskScore
from flaml import AutoML
from flaml.automl.spark.utils import to_pandas_on_spark
from pyspark.ml.evaluation import RegressionEvaluator

# Run the FLAML search for the diabetes target using the shared `settings`.
diabetes_label = "DiabetesRiskScore"
automl = AutoML()
automl.fit(
    dataframe=to_pandas_on_spark(train_diabetes),
    label=diabetes_label,
    labelCol=diabetes_label,
    **settings,
)

# pandas-on-Spark view of the held-out split (not consumed within this cell).
test_diabetes_df = to_pandas_on_spark(test_diabetes)

# Predict with the fitted estimator directly on the Spark test DataFrame.
predictions = automl.model.estimator.transform(test_diabetes)

# RMSE between the label column and the `prediction` column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol=diabetes_label,
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Diabetes Risk Score RMSE:", rmse)

  from .core import DMatrix, DeviceQuantileDMatrix, Booster, DataIter, build_info


[flaml.automl.logger: 05-30 02:25:57] {1682} INFO - task = regression
[flaml.automl.logger: 05-30 02:25:57] {1689} INFO - Data split method: uniform
[flaml.automl.logger: 05-30 02:25:57] {1692} INFO - Evaluation method: cv
[flaml.automl.logger: 05-30 02:26:03] {1790} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 05-30 02:26:03] {1900} INFO - List of ML learners in AutoML Run: ['lgbm_spark']
[flaml.automl.logger: 05-30 02:26:03] {2210} INFO - iteration 0, current learner lgbm_spark


  df = df.to_spark()


[flaml.automl.logger: 05-30 02:26:35] {2336} INFO - Estimated sufficient time budget=323707s. Estimated necessary time budget=324s.
[flaml.automl.logger: 05-30 02:26:35] {2383} INFO -  at 46.7s,	estimator lgbm_spark's best error=7.3984,	best estimator lgbm_spark's best error=7.3984
[flaml.automl.logger: 05-30 02:26:38] {2619} INFO - retrain lgbm_spark for 2.2s
[flaml.automl.logger: 05-30 02:26:38] {2622} INFO - retrained model: LightGBMRegressor_285d495dfcf9
[flaml.automl.logger: 05-30 02:26:38] {1930} INFO - fit succeeded
[flaml.automl.logger: 05-30 02:26:38] {1931} INFO - Time taken to find the best model: 46.65064573287964
Diabetes Risk Score RMSE: 7.324952166259268


In [9]:
# Fit & evaluate HeartDiseaseRiskScore

# Run the FLAML search for the heart-disease target using the shared `settings`.
heart_label = "HeartDiseaseRiskScore"
automl = AutoML()
automl.fit(
    dataframe=to_pandas_on_spark(train_heart),
    label=heart_label,
    labelCol=heart_label,
    **settings,
)

# pandas-on-Spark view of the held-out split (not consumed within this cell).
test_heart_df = to_pandas_on_spark(test_heart)

# Predict with the fitted estimator directly on the Spark test DataFrame.
predictions = automl.model.estimator.transform(test_heart)

# RMSE between the label column and the `prediction` column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol=heart_label,
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("HeartDisease Risk Score RMSE:", rmse)

[flaml.automl.logger: 05-30 02:26:50] {1682} INFO - task = regression
[flaml.automl.logger: 05-30 02:26:50] {1689} INFO - Data split method: uniform
[flaml.automl.logger: 05-30 02:26:50] {1692} INFO - Evaluation method: cv
[flaml.automl.logger: 05-30 02:26:51] {1790} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 05-30 02:26:51] {1900} INFO - List of ML learners in AutoML Run: ['lgbm_spark']
[flaml.automl.logger: 05-30 02:26:51] {2210} INFO - iteration 0, current learner lgbm_spark
[flaml.automl.logger: 05-30 02:27:15] {2336} INFO - Estimated sufficient time budget=246589s. Estimated necessary time budget=247s.
[flaml.automl.logger: 05-30 02:27:15] {2383} INFO -  at 26.9s,	estimator lgbm_spark's best error=3.2451,	best estimator lgbm_spark's best error=3.2451
[flaml.automl.logger: 05-30 02:27:15] {2210} INFO - iteration 1, current learner lgbm_spark
[flaml.automl.logger: 05-30 02:27:38] {2383} INFO -  at 49.3s,	estimator lgbm_spark's best error=3.2451,	best estimator lgbm_s