
### This Interactive Notebook was generated by ML.NET Tooling.

The code below demonstrates how to:

1. Define the model input and output schema
1. Load data from a text file into an IDataView
1. Set up the training pipeline with data transforms
1. Choose an algorithm and append it to the pipeline
1. Train the model
1. Evaluate the model
1. Consume the model


## Install the necessary NuGet packages for training an ML.NET model and plotting:

In [1]:

/* ML.NET Model Builder generated Notebook file. Notebook files contain both code snippets and rich text elements.
Use the "run" button in the left margin to execute each code snippet and explore ML.NET. */

#i "nuget:https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet5/nuget/v3/index.json" 
#i "nuget:https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-tools/nuget/v3/index.json" 

#r "nuget:Microsoft.ML,1.7.0"
#r "nuget:Microsoft.Data.Analysis,0.4.0"


In [1]:
using static Microsoft.DotNet.Interactive.Formatting.PocketViewTags;
using Microsoft.DotNet.Interactive.Formatting;
using Microsoft.Data.Analysis;

In [1]:
// Register an HTML formatter for DataFrame so that data frames (such as the metrics table) are displayed as HTML tables

using Microsoft.AspNetCore.Html;
using Microsoft.DotNet.Interactive.Formatting;
using static Microsoft.DotNet.Interactive.Formatting.PocketViewTags;

// Formatter for DataFrame values with the "text/html" MIME type: writes an HTML
// table with an "index" column followed by one column per DataFrame column.
// At most the first 20 rows are written.
Formatter.Register<DataFrame>((df, writer) =>
{
    const int maxRows = 20;

    // Header row: the "index" label, then every column name.
    var headerCells = new List<IHtmlContent>();
    headerCells.Add(th(i("index")));
    headerCells.AddRange(df.Columns.Select(column => (IHtmlContent) th(column.Name)));

    // Body rows: the row number first, then each value of that row.
    var rowLimit = Math.Min(maxRows, df.Rows.Count);
    var bodyRows = new List<List<IHtmlContent>>();
    for (var rowIndex = 0; rowIndex < rowLimit; rowIndex++)
    {
        var rowCells = new List<IHtmlContent>();
        rowCells.Add(td(rowIndex));
        foreach (var value in df.Rows[rowIndex])
        {
            rowCells.Add(td(value));
        }
        bodyRows.Add(rowCells);
    }

    var htmlTable = table(
        thead(headerCells),
        tbody(bodyRows.Select(cells => tr(cells))));

    writer.Write(htmlTable);
}, "text/html");

In [1]:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

## Define the model input and output schemas:

In [1]:
// Define the model input schema (which columns you will be loading in for training)
/// <summary>
/// Model input schema: one property per column that is loaded for training.
/// Each property carries the data-view column name and the index of the file
/// column it is loaded from.
/// </summary>
public class ModelInput
{
    /// <summary>Column <c>species</c>, loaded from file column 0.</summary>
    [LoadColumn(0)]
    [ColumnName("species")]
    public string Species { get; set; }

    /// <summary>Column <c>island</c>, loaded from file column 1.</summary>
    [LoadColumn(1)]
    [ColumnName("island")]
    public string Island { get; set; }

    /// <summary>Column <c>bill_length_mm</c>, loaded from file column 2.</summary>
    [LoadColumn(2)]
    [ColumnName("bill_length_mm")]
    public float Bill_length_mm { get; set; }

    /// <summary>Column <c>bill_depth_mm</c>, loaded from file column 3.</summary>
    [LoadColumn(3)]
    [ColumnName("bill_depth_mm")]
    public float Bill_depth_mm { get; set; }

    /// <summary>Column <c>flipper_length_mm</c>, loaded from file column 4.</summary>
    [LoadColumn(4)]
    [ColumnName("flipper_length_mm")]
    public float Flipper_length_mm { get; set; }

    /// <summary>Column <c>body_mass_g</c>, loaded from file column 5.</summary>
    [LoadColumn(5)]
    [ColumnName("body_mass_g")]
    public float Body_mass_g { get; set; }

    /// <summary>Column <c>sex</c>, loaded from file column 6.</summary>
    [LoadColumn(6)]
    [ColumnName("sex")]
    public string Sex { get; set; }

    /// <summary>Column <c>year</c>, loaded from file column 7 (stored as a float).</summary>
    [LoadColumn(7)]
    [ColumnName("year")]
    public float Year { get; set; }
}


In [1]:
﻿// Define the model output schema (what the model will return)
/// <summary>
/// Model output schema: the values read back from a prediction.
/// </summary>
public class ModelOutput
{
    /// <summary>Value of the <c>PredictedLabel</c> column.</summary>
    [ColumnName(@"PredictedLabel")]
    public string PredictedLabel { get; set; }

    /// <summary>
    /// Score values returned with the prediction. No explicit column name is
    /// given; presumably bound to the column with the same name as the property.
    /// </summary>
    public float[] Score { get; set; }
}




## Create MLContext and load training data:

In [1]:
// Define path to training data.
// NOTE(review): this is an absolute path inside one user's profile directory;
// point it at the local copy of penguins.csv before running the notebook elsewhere.
string trainDataPath = @"C:\Users\TomaszCekało\source\repos\PalmerPenguins\PalmerPenguins\penguins.csv";


In [1]:
// The MLContext is the object every ML.NET call in this notebook goes through.
var mlContext = new MLContext();

// Read the training file into an IDataView, using ModelInput to describe the
// columns. The file is comma-separated, has a header row, may contain quoted
// fields, and is not in sparse format.
var trainData = mlContext.Data.LoadFromTextFile<ModelInput>(
    path: trainDataPath,
    hasHeader: true,
    separatorChar: ',',
    allowQuoting: true,
    allowSparse: false);

// Show the schema of the loaded data
display(trainData.Schema);



In [1]:
// Return the first `numberOfRows` rows of a data view as a list of ModelInput
// objects (4 rows when the caller does not pass a count).
public static List<ModelInput> Head(MLContext mlContext, IDataView dataView, int numberOfRows = 4)
{
    // reuseRowObject: false requests a separate ModelInput instance per row.
    IEnumerable<ModelInput> allRows =
        mlContext.Data.CreateEnumerable<ModelInput>(dataView, reuseRowObject: false);

    return allRows.Take(numberOfRows).ToList();
}

// Heading first, then the first five rows of the training data.
display(h4("Showing 5 rows from training DataView:"));

var fewRows = Head(mlContext, trainData, numberOfRows: 5);
display(fewRows);

## Create the training pipeline, choose an algorithm, and train the model:

In [1]:
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using Microsoft.ML;


In [1]:
// Build the training pipeline: data transforms first, then the trainer, then
// the conversion of the predicted key back to its original value.
var pipeline = mlContext.Transforms.Categorical.OneHotEncoding(new[]
        {
            new InputOutputColumnPair("island", "island"),
            new InputOutputColumnPair("sex", "sex"),
        })
    // Replace missing values in the numeric columns (in place).
    .Append(mlContext.Transforms.ReplaceMissingValues(new[]
        {
            new InputOutputColumnPair("bill_length_mm", "bill_length_mm"),
            new InputOutputColumnPair("bill_depth_mm", "bill_depth_mm"),
            new InputOutputColumnPair("flipper_length_mm", "flipper_length_mm"),
            new InputOutputColumnPair("body_mass_g", "body_mass_g"),
            new InputOutputColumnPair("year", "year"),
        }))
    // Combine all input columns into the single "Features" column.
    .Append(mlContext.Transforms.Concatenate("Features", new[]
        {
            "island",
            "sex",
            "bill_length_mm",
            "bill_depth_mm",
            "flipper_length_mm",
            "body_mass_g",
            "year",
        }))
    // The label column is converted to a key in place.
    .Append(mlContext.Transforms.Conversion.MapValueToKey(
        outputColumnName: "species",
        inputColumnName: "species"))
    .Append(mlContext.Transforms.NormalizeMinMax("Features", "Features"))
    // One-versus-all multiclass trainer wrapping a binary L-BFGS logistic regression.
    .Append(mlContext.MulticlassClassification.Trainers.OneVersusAll(
        binaryEstimator: mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(
            new LbfgsLogisticRegressionBinaryTrainer.Options
            {
                L1Regularization = 0.397552F,
                L2Regularization = 0.03125F,
                LabelColumnName = "species",
                FeatureColumnName = "Features",
            }),
        labelColumnName: "species"))
    .Append(mlContext.Transforms.Conversion.MapKeyToValue(
        outputColumnName: "PredictedLabel",
        inputColumnName: "PredictedLabel"));

// Train the model (fit the pipeline to the training data)
var model = pipeline.Fit(trainData);



## Consume the model

In [1]:
﻿ // Define sample model input
var sampleData = new ModelInput()
{
    Island = @"Torgersen",
    Bill_length_mm = 39.5F,
    Bill_depth_mm = 17.4F,
    Flipper_length_mm = 186F,
    Body_mass_g = 3800F,
    Sex = @"female",
    Year = 2007F,
};

// Known species of the sample row. It is only printed for comparison and is
// deliberately not set on sampleData, so the model input stays as it was.
var actualSpecies = @"Adelie";

// Create a Prediction Engine (used to make single predictions)
var predEngine = mlContext.Model.CreatePredictionEngine<ModelInput, ModelOutput>(model);
// Use the model and Prediction Engine to predict on new sample data
var predictionResult = predEngine.Predict(sampleData);
Console.WriteLine("Using model to make single prediction -- Comparing actual Species with predicted Species from sample data...\n\n");

// Print the values straight from sampleData so the output always matches the
// input that was actually scored, even after the sample above is edited.
Console.WriteLine($"Species: {actualSpecies}");
Console.WriteLine($"Island: {sampleData.Island}");
Console.WriteLine($"Bill_length_mm: {sampleData.Bill_length_mm}");
Console.WriteLine($"Bill_depth_mm: {sampleData.Bill_depth_mm}");
Console.WriteLine($"Flipper_length_mm: {sampleData.Flipper_length_mm}");
Console.WriteLine($"Body_mass_g: {sampleData.Body_mass_g}");
Console.WriteLine($"Sex: {sampleData.Sex}");
Console.WriteLine($"Year: {sampleData.Year}");

Console.WriteLine($"\n\nPredicted Species: {predictionResult.PredictedLabel}\n\n");


## Evaluate the model:

In [1]:
// Evaluate the model using the cross validation method
// Learn more about cross validation at https://aka.ms/mlnet-cross-validation

var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(trainData, pipeline, numberOfFolds: 5, labelColumnName:"species");

// Each sequence below is read several times (average, standard deviation,
// confidence interval), so materialize it once instead of re-running the query.
var metricsInMultipleFolds = crossValidationResults.Select(r => r.Metrics).ToList();

var microAccuracyValues = metricsInMultipleFolds.Select(m => m.MicroAccuracy).ToList();
var microAccuracyAverage = microAccuracyValues.Average();
var microAccuraciesStdDeviation = CalculateStandardDeviation(microAccuracyValues);
var microAccuraciesConfidenceInterval95 = CalculateConfidenceInterval95(microAccuracyValues);

var macroAccuracyValues = metricsInMultipleFolds.Select(m => m.MacroAccuracy).ToList();
var macroAccuracyAverage = macroAccuracyValues.Average();
var macroAccuraciesStdDeviation = CalculateStandardDeviation(macroAccuracyValues);
var macroAccuraciesConfidenceInterval95 = CalculateConfidenceInterval95(macroAccuracyValues);

var logLossValues = metricsInMultipleFolds.Select(m => m.LogLoss).ToList();
var logLossAverage = logLossValues.Average();
var logLossStdDeviation = CalculateStandardDeviation(logLossValues);
var logLossConfidenceInterval95 = CalculateConfidenceInterval95(logLossValues);

var logLossReductionValues = metricsInMultipleFolds.Select(m => m.LogLossReduction).ToList();
var logLossReductionAverage = logLossReductionValues.Average();
var logLossReductionStdDeviation = CalculateStandardDeviation(logLossReductionValues);
var logLossReductionConfidenceInterval95 = CalculateConfidenceInterval95(logLossReductionValues);

// Build the metrics table (one row per metric).
// The "0.###" format keeps the leading zero: with "#.###" a value such as 0.95
// is shown as ".95" and an exact 0 is shown as an empty string.
var metricNames = new StringDataFrameColumn("Metric Name", new[] {"Average MicroAccuracy", "Average MacroAccuracy", "Average LogLoss", "Average LogLossReduction"});
var metricValues = new StringDataFrameColumn("Value", new[] {$"{microAccuracyAverage:0.###}", $"{macroAccuracyAverage:0.###}", $"{logLossAverage:0.###}", $"{logLossReductionAverage:0.###}"});
var standardDeviationValues = new StringDataFrameColumn("Standard deviation", new[] {$"{microAccuraciesStdDeviation:0.###}", $"{macroAccuraciesStdDeviation:0.###}", $"{logLossStdDeviation:0.###}", $"{logLossReductionStdDeviation:0.###}"});
var intervalValues = new StringDataFrameColumn("Confidence Interval 95%", new[] {$"{microAccuraciesConfidenceInterval95:0.###}", $"{macroAccuraciesConfidenceInterval95:0.###}", $"{logLossConfidenceInterval95:0.###}", $"{logLossReductionConfidenceInterval95:0.###}"});

var stats = new DataFrame(metricNames, metricValues, standardDeviationValues, intervalValues);

/// <summary>
/// Computes the sample standard deviation of <paramref name="values"/>, i.e. the
/// square root of the sum of squared deviations from the mean divided by (n - 1).
/// </summary>
/// <param name="values">The observations. Enumerated at most once.</param>
/// <returns>
/// The sample standard deviation. A single observation yields
/// <see cref="double.NaN"/>, because the result is 0 divided by 0.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="values"/> is null.</exception>
/// <exception cref="InvalidOperationException"><paramref name="values"/> is empty.</exception>
public static double CalculateStandardDeviation(IEnumerable<double> values)
{
    // The data is needed three times (mean, squared deviations, count).
    // Materialize a lazy sequence once so it is not re-evaluated for each pass.
    var samples = values as IReadOnlyCollection<double> ?? values.ToList();

    double average = samples.Average();
    double sumOfSquaresOfDifferences = samples.Select(val => (val - average) * (val - average)).Sum();
    double standardDeviation = Math.Sqrt(sumOfSquaresOfDifferences / (samples.Count - 1));
    return standardDeviation;
}

/// <summary>
/// Computes the half-width of a 95% confidence interval for the mean of
/// <paramref name="values"/>: 1.96 times the standard error of the mean.
/// </summary>
/// <param name="values">The observations. Enumerated at most once.</param>
/// <returns>1.96 * s / sqrt(n), where s is the sample standard deviation and n the number of observations.</returns>
/// <remarks>
/// NOTE(review): 1.96 is the normal-distribution critical value; for a handful of
/// observations a Student's t critical value would give a wider interval.
/// </remarks>
public static double CalculateConfidenceInterval95(IEnumerable<double> values)
{
    // Materialize once: the data is used for both the deviation and the count.
    var samples = values as IReadOnlyCollection<double> ?? values.ToList();

    // The standard error of the mean is s / sqrt(n). CalculateStandardDeviation
    // already divides by (n - 1), so dividing by sqrt(n - 1) here would apply
    // the small-sample correction twice and overstate the interval.
    double standardError = CalculateStandardDeviation(samples) / Math.Sqrt(samples.Count);
    double confidenceInterval95 = 1.96 * standardError;
    return confidenceInterval95;
}

// Trailing expression of the cell: evaluates to the metrics DataFrame built above
// (presumably so the notebook displays it as the cell output).
stats

