# Configure SynapseML as described in the official documentation

https://microsoft.github.io/SynapseML/

In [1]:
%%configure -f
{
  "name": "synapseml",
  "conf": {
      "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:0.11.4-spark3.3",
      "spark.jars.repositories": "https://mmlspark.azureedge.net/maven",
      "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.12,org.scalactic:scalactic_2.12,org.scalatest:scalatest_2.12,com.fasterxml.jackson.core:jackson-databind",
      "spark.yarn.user.classpath.first": "true",
      "spark.sql.parquet.enableVectorizedReader": "false"
  }
}

## The training code below doesn't compile unless the json4s and XGBoost4J jars are loaded explicitly

In [3]:
%%configure -f
{
    "conf": {
        "spark.jars": "abfss://metadata@storagemai01us2dev.dfs.core.windows.net/komsdriver/jar_files/json4s-native_2.12-3.5.3.jar,abfss://metadata@storagemai01us2dev.dfs.core.windows.net/komsdriver/jar_files/xgboost4j_2.12-1.4.1.jar,abfss://metadata@storagemai01us2dev.dfs.core.windows.net/komsdriver/jar_files/xgboost4j-spark_2.12-1.4.1.jar"
    }
}

## Train, Predict and Save the model

In [2]:
import org.apache.spark.ml.feature.VectorAssembler
import ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.hadoop.fs.FileSystem
 
// End-to-end XGBoost4J-Spark sample: build a tiny DataFrame, assemble the
// feature vector, fit an XGBoostRegressor, score the training rows and persist
// the fitted model.
//
// `spark` is not defined in this cell; it is expected to already be in scope,
// which is why the explicit builder below stays commented out.
// val spark = SparkSession.builder()
//   .appName("XGBoostRegressorSample")
//   .getOrCreate()

// Sample rows, each laid out as (feature1, feature2, label).
val data = Seq(
  (1.0, 2.0, 3.0),
  (2.0, 3.0, 4.0),
  (3.0, 4.0, 5.0),
  (4.0, 5.0, 6.0),
  (5.0, 6.0, 7.0)
)

// Column names, listed in the same order as the tuple fields above.
val schema = List("feature1", "feature2", "label")

// Create the DataFrame and rename the default tuple columns to `schema`.
val df = spark.createDataFrame(data).toDF(schema: _*)

// Input feature columns, the label column, and the name of the assembled
// vector column shared by the assembler and the regressor.
val featureCols = Array("feature1", "feature2")
val labelCol = "label"
val featuresCol = "features"

// Combine the individual feature columns into a single vector column.
val assembler = new VectorAssembler()
  .setInputCols(featureCols)
  .setOutputCol(featuresCol)

val assembledDF = assembler.transform(df)

// XGBoost parameters. The values mix Double, Int and String, so the map type
// is spelled out instead of being left to silent `Any` inference.
val paramMap: Map[String, Any] = Map(
  "eta" -> 0.1,
  "max_depth" -> 2,
  // "reg:linear" is the deprecated name of the squared-error objective;
  // "reg:squarederror" is its documented replacement.
  "objective" -> "reg:squarederror",
  "num_round" -> 100,
  // NOTE(review): XGBoost4J-Spark documents this setting as
  // "num_early_stopping_rounds"; this key may not be honoured - TODO confirm.
  "early_stopping_rounds" -> 10
)

// Configure the regressor with the parameters and column names above.
val xgbRegressor = new XGBoostRegressor(paramMap)
  .setLabelCol(labelCol)
  .setFeaturesCol(featuresCol)

// Train the XGBoost model.
val xgbModel = xgbRegressor.fit(assembledDF)

// Score the same rows the model was trained on (sample only, no hold-out set).
val predictions = xgbModel.transform(assembledDF)

predictions.show()

// Persist the fitted model, replacing whatever already exists at this path.
xgbModel.write.overwrite().save("MS_DRIVER_MODEL/test001.json")

In [3]:
import com.tccc.dna.synapse.spark.Classpath  
// Print each entry returned by Classpath.getAllDependencies on its own line.
for (dependency <- Classpath.getAllDependencies) {
  println(dependency)
}