In [None]:
#r "nuget:Microsoft.Spark"
#r "nuget:Microsoft.ML"

In [None]:
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using Microsoft.Spark;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;

// Start the .NET for Apache Spark backend before executing this cell:
//   spark-submit --class org.apache.spark.deploy.dotnet.DotnetRunner --master local D:\3bStudio\Sandbox\spark-program\FirstSparkProgram\bin\Debug\net6.0\microsoft-spark-3-0_2.12-2.1.0.jar debug
// While it is running, the Spark portal is available at http://localhost:4040

// Obtain the session for this notebook (an existing one is reused, otherwise one is created).
SparkSession spark = SparkSession.Builder().AppName("spark-ml-clustering").GetOrCreate();

// Set the Spark log level for this context to WARN.
spark.SparkContext.SetLogLevel("WARN");

In [None]:
// Configure the reader: the first CSV line is a header, and column types are inferred
// from the data instead of being read as plain strings.
DataFrameReader heartCsvReader = spark.Read();
heartCsvReader = heartCsvReader.Option("header", true);
heartCsvReader = heartCsvReader.Option("inferSchema", true);

// Load the heart dataset from the local file system.
DataFrame df = heartCsvReader.Csv(@"D:\3bStudio\Sandbox\3bs-spark-training\resources\heart.csv");

// Print a preview of the loaded rows using Show()'s default arguments.
df.Show();

In [None]:
/// <summary>
/// One record of the heart dataset in the shape handed to ML.NET.
/// The notebook builds a list of these from the collected Spark rows and loads that
/// list with <c>mlContext.Data.LoadFromEnumerable</c>.
/// </summary>
/// <remarks>
/// The four <c>float</c> members are the ones concatenated into the "Features" column
/// for clustering; <see cref="HeartDisease"/> is not part of that feature vector.
/// NOTE(review): units and valid ranges of the numeric members are not visible in this
/// notebook - confirm against the dataset description.
/// </remarks>
public class HeartProfile
{
    /// <summary>Filled from row column 0 (converted with <c>Convert.ToSingle</c>).</summary>
    public float Age;
    /// <summary>Filled from row column 4 (converted with <c>Convert.ToSingle</c>).</summary>
    public float Cholesterol;
    /// <summary>Filled from row column 3 (converted with <c>Convert.ToSingle</c>).</summary>
    public float RestingBP;
    /// <summary>Filled from row column 5 (converted with <c>Convert.ToSingle</c>).</summary>
    public float FastingBS;
    /// <summary>
    /// <c>true</c> when row column 11 equals 1. Only used to count labelled records in the
    /// final report; it is not fed to the clustering trainer.
    /// </summary>
    public bool HeartDisease;
}

/// <summary>
/// Output type used with the prediction engine created for <c>HeartProfile</c> inputs.
/// Each member is bound by name, through <c>[ColumnName]</c>, to a column of the
/// model's output.
/// </summary>
public class ClusterPrediction
{
    /// <summary>
    /// Value of the "PredictedLabel" output column. The notebook compares it against
    /// 1, 2 and 3 when counting cluster members.
    /// NOTE(review): presumably the 1-based index of the closest cluster as documented
    /// for the ML.NET K-means trainer - confirm.
    /// </summary>
    [ColumnName("PredictedLabel")]
    public uint PredictedClusterId;

    /// <summary>
    /// Value of the "Score" output column.
    /// NOTE(review): presumably one distance per cluster centroid, as the member name
    /// suggests - confirm against the K-means trainer documentation.
    /// </summary>
    [ColumnName("Score")]
    public float[] Distances;
}

In [None]:
// Bring every row of the DataFrame to the driver and keep them in memory, so they can
// be converted into plain objects that ML.NET can consume.
var rows = df.Collect().ToList();

// Zero-based positions, within a row, of the columns this notebook uses.
const int AgeColumn = 0;
const int RestingBpColumn = 3;
const int CholesterolColumn = 4;
const int FastingBsColumn = 5;
const int HeartDiseaseColumn = 11;

// Project each Spark row onto a HeartProfile.
var dataset = rows
    .Select(row =>
    {
        object[] rowValues = row.Values;

        return new HeartProfile
        {
            Age = Convert.ToSingle(rowValues[AgeColumn]),
            Cholesterol = Convert.ToSingle(rowValues[CholesterolColumn]),
            RestingBP = Convert.ToSingle(rowValues[RestingBpColumn]),
            FastingBS = Convert.ToSingle(rowValues[FastingBsColumn]),
            // Convert.ToInt32 rather than an (int) unboxing cast: the cast only succeeds
            // when the boxed value is exactly an Int32 and throws on null, whereas
            // Convert accepts any boxed numeric type and maps null to 0 - the same
            // treatment Convert.ToSingle gives the other columns.
            HeartDisease = Convert.ToInt32(rowValues[HeartDiseaseColumn]) == 1
        };
    })
    .ToList();

In [None]:
// Entry point for all ML.NET operations in this notebook.
var mlContext = new MLContext();

// Expose the in-memory profiles to ML.NET as a data view.
var dataView = mlContext.Data.LoadFromEnumerable(dataset);

// K-means settings: three clusters, a tight optimization tolerance, a single
// training thread, and the assembled "Features" column as input.
var options = new KMeansTrainer.Options();
options.NumberOfClusters = 3;
options.OptimizationTolerance = 1e-6f;
options.NumberOfThreads = 1;
options.FeatureColumnName = "Features";

// Step 1: gather the four numeric inputs into the single "Features" column.
var featureAssembler = mlContext.Transforms
    .Concatenate("Features", "Age", "Cholesterol", "RestingBP", "FastingBS");

// Step 2: the K-means trainer configured above.
var kMeansTrainer = mlContext.Clustering.Trainers.KMeans(options);

// Chain both steps into one pipeline and train it on the full data view.
var pipeline = featureAssembler.Append(kMeansTrainer);

var model = pipeline.Fit(dataView);


In [None]:
// Run the trained model over the data view. Despite the variable's name, this is the
// same data view the model was fitted on; no separate test split exists in this notebook.
IDataView transformedTestData = model.Transform(dataView);

// Compute clustering quality metrics from the scored data (default column names).
ClusteringMetrics metrics = mlContext.Clustering.Evaluate(transformedTestData);

// Render the metrics object as the cell's output.
metrics.Display();

In [None]:
// Single-record scorer that maps a HeartProfile to a ClusterPrediction using the trained model.
// NOTE(review): the engine is disposable; it is left alive here so the variable stays
// usable from other notebook cells.
var predictionEngine = mlContext.Model.CreatePredictionEngine<HeartProfile, ClusterPrediction>(model);

// Score every profile, keeping the predictions in the same order as the dataset.
var predictionResult = dataset
    .Select(profile => predictionEngine.Predict(profile))
    .ToList();

// Group the predictions by cluster id once instead of scanning the list once per cluster.
// Looking up an id that received no predictions yields an empty group, i.e. a count of 0.
var predictionsByCluster = predictionResult.ToLookup(prediction => prediction.PredictedClusterId);

var countCluster1 = predictionsByCluster[1].Count();
var countCluster2 = predictionsByCluster[2].Count();
var countCluster3 = predictionsByCluster[3].Count();

// Number of records flagged with heart disease in the source data, printed next to the
// cluster sizes for comparison.
var countPeopleBadHeartCondition = dataset.Count(x => x.HeartDisease);
Console.WriteLine($"Labeled people : {countPeopleBadHeartCondition}");
Console.WriteLine($"Cluster 1 : {countCluster1}");
Console.WriteLine($"Cluster 2 : {countCluster2}");
Console.WriteLine($"Cluster 3 : {countCluster3}");