In [None]:
#r "nuget:Microsoft.Spark"
#r "nuget:Microsoft.ML"

In [None]:
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.Spark;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;

// The Spark backend has to be started separately before this cell is executed, e.g.:
//   spark-submit --class org.apache.spark.deploy.dotnet.DotnetRunner --master local D:\3bStudio\Sandbox\spark-program\FirstSparkProgram\bin\Debug\net6.0\microsoft-spark-3-0_2.12-2.1.0.jar debug
// While it runs, the Spark portal is available at http://localhost:4040
SparkSession spark = SparkSession.Builder().AppName("spark-train-sentiment").GetOrCreate();

// Set the Spark context log level to WARN.
spark.SparkContext.SetLogLevel("WARN");

In [None]:
/// <summary>
/// One labelled training example: a piece of text and its boolean sentiment label.
/// </summary>
/// <remarks>
/// NOTE(review): the LoadColumn indices (0 and 2) are only relevant if this type is
/// used with a text-file loader; this notebook fills instances by hand from Spark
/// rows (row[1] for the label, row[0] for the text) - confirm the indices are still wanted.
/// </remarks>
public class SentimentIssue
{
    /// <summary>Sentiment label of the example.</summary>
    [LoadColumn(0)] public bool Label { get; set; }

    /// <summary>Raw text of the example.</summary>
    [LoadColumn(2)] public string Text { get; set; }
}

/// <summary>
/// Output shape for a single prediction of the sentiment model.
/// </summary>
public class SentimentPrediction
{
    /// <summary>Predicted label; bound to the column named "PredictedLabel".</summary>
    [ColumnName("PredictedLabel")] public bool Prediction { get; set; }

    /// <summary>Value of the "Probability" output column (presumably the calibrated probability - verify).</summary>
    public float Probability { get; set; }

    /// <summary>Value of the "Score" output column (presumably the raw model score - verify).</summary>
    public float Score { get; set; }
}

In [None]:
// Location of the training CSV on the local machine.
string trainingCsvPath = @"D:\3bStudio\Sandbox\3bs-spark-training\resources\sentient-train.csv";

// Reader options: the file has a header line, and column types are inferred by Spark.
DataFrameReader csvReader = spark.Read().Option("header", true).Option("inferSchema", true);
DataFrame df = csvReader.Csv(trainingCsvPath);

// Print a preview of the rows, followed by the schema of the loaded DataFrame.
df.Show();
df.PrintSchema();

In [None]:
// Materialize every row of the Spark DataFrame as a local list so it can be handed to ML.NET.
var rows = df.Collect().ToList();

// Convert each Spark row into a SentimentIssue:
//   column 0 -> Text, column 1 -> Label (the label is positive when the cell prints as "1").
// Rows with a missing text or label cell are skipped: calling ToString() on a null cell
// would throw a NullReferenceException, and such rows carry no usable training signal.
var dataset = rows
    .Where(row => row[0] != null && row[1] != null)
    .Select(row => new SentimentIssue
    {
        Label = row[1].ToString() == "1",
        Text = row[0].ToString(),
    })
    .ToList();

In [None]:
// STEP 1: Create the ML.NET context and wrap the in-memory examples in an IDataView
MLContext mlContext = new MLContext();
IDataView dataView = mlContext.Data.LoadFromEnumerable(dataset);

// Split the data: testFraction 0.2 goes to the test set, the remainder to the training set.
var dataSplit = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2);
var trainData = dataSplit.TrainSet;
var testData = dataSplit.TestSet;

In [None]:
// STEP 2: Data processing pipeline - featurize the Text column into a "Features" column
var dataProcessPipeline = mlContext.Transforms.Text.FeaturizeText(
    outputColumnName: "Features",
    inputColumnName: nameof(SentimentIssue.Text));

In [None]:
// STEP 3: Choose the training algorithm (SDCA logistic regression) and wire it to the
// label and feature columns.
var trainer = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
    labelColumnName: nameof(SentimentIssue.Label),
    featureColumnName: "Features");

// Full training pipeline: data processing followed by the trainer.
var trainingPipeline = dataProcessPipeline.Append(trainer);

In [None]:
// STEP 4: Fit the training pipeline on the training split to obtain the trained model
ITransformer trainedModel =
    trainingPipeline.Fit(trainData);

In [None]:
// STEP 5: Evaluate the model on the held-out test split and print the quality metrics.
// The trained model is applied to the test data, and the resulting predictions are
// compared against the "Label" column using the "Score" column.
var predictions = trainedModel.Transform(testData);
var metrics = mlContext.BinaryClassification.Evaluate(data: predictions, labelColumnName: "Label", scoreColumnName: "Score");

// Report. The title names the sentiment model (the previous text was a leftover from a
// different sample). Non-percentage values use "0.##" instead of "#.##" so that a value
// of 0 is printed as "0" rather than an empty string and values below 1 keep their
// leading zero.
Console.WriteLine("");
Console.WriteLine("");
Console.WriteLine("************************************************************");
Console.WriteLine("*       Metrics for sentiment analysis binary classification model      ");
Console.WriteLine("*-----------------------------------------------------------");
Console.WriteLine($"*       Accuracy: {metrics.Accuracy:P2}");
Console.WriteLine($"*       Area Under Roc Curve:      {metrics.AreaUnderRocCurve:P2}");
Console.WriteLine($"*       Area Under PrecisionRecall Curve:  {metrics.AreaUnderPrecisionRecallCurve:P2}");
Console.WriteLine($"*       F1Score:  {metrics.F1Score:P2}");
Console.WriteLine($"*       LogLoss:  {metrics.LogLoss:0.##}");
Console.WriteLine($"*       LogLossReduction:  {metrics.LogLossReduction:0.##}");
Console.WriteLine($"*       PositivePrecision:  {metrics.PositivePrecision:0.##}");
Console.WriteLine($"*       PositiveRecall:  {metrics.PositiveRecall:0.##}");
Console.WriteLine($"*       NegativePrecision:  {metrics.NegativePrecision:0.##}");
Console.WriteLine($"*       NegativeRecall:  {metrics.NegativeRecall:P2}");
Console.WriteLine("************************************************************");
Console.WriteLine("");
Console.WriteLine("");