# Using ML.NET and XPlot in Jupyter notebooks
## Scenario: Regression model for Taxi fares

Regression is a ML task type of supervised machine learning algorithms. 
A regression ML model predicts continuous value outputs (such as numbers). 
For instance, predicting the fare of a Taxi trip or predicting the price of a car is a regression problem.

# Install the NuGet packages you use in the notebook

In [1]:
// ML.NET Nuget packages installation
#r "nuget:Microsoft.ML,1.4.0"
    
//Install XPlot package
#r "nuget:XPlot.Plotly,2.0.0"

using Microsoft.ML;
using Microsoft.ML.Data;
using XPlot.Plotly;

Installed package XPlot.Plotly version 2.0.0

Installed package Microsoft.ML version 1.4.0

# Declare data-classes for input data and predictions

In [2]:
display(h4("Declaring data-classes to use across the code in this notebook."));

public class TaxiTrip
{
    [LoadColumn(0)]
    public string VendorId;

    [LoadColumn(1)]
    public string RateCode;

    [LoadColumn(2)]
    public float PassengerCount;

    [LoadColumn(3)]
    public float TripTime;

    [LoadColumn(4)]
    public float TripDistance;

    [LoadColumn(5)]
    public string PaymentType;

    [LoadColumn(6)]
    public float FareAmount;
}

public class TaxiTripFarePrediction
{
    [ColumnName("Score")]
    public float Score;
}












# Load datasets into IDataView and display the schema 

In [3]:
display(h1("Code for loading the data into IDataViews: training dataset and test dataset"));

MLContext mlContext = new MLContext(seed: 0);

string TrainDataPath = "./taxi-fare-train.csv";
string TestDataPath = "./taxi-fare-test.csv";

IDataView trainDataView = mlContext.Data.LoadFromTextFile<TaxiTrip>(TrainDataPath, hasHeader: true, separatorChar: ',');
IDataView testDataView = mlContext.Data.LoadFromTextFile<TaxiTrip>(TestDataPath, hasHeader: true, separatorChar: ',');

display(h4("Schema of training DataView:"));
display(trainDataView.Schema);


index,Name,Index,IsHidden,Type,Annotations
0,VendorId,0,False,String,
1,RateCode,1,False,String,
2,PassengerCount,2,False,Single,
3,TripTime,3,False,Single,
4,TripDistance,4,False,Single,
5,PaymentType,5,False,String,
6,FareAmount,6,False,Single,


## Show a few rows of loaded data 

In [4]:
//Util class to preview loaded data in IDataView

public static List<TaxiTrip> Head(MLContext mlContext, IDataView dataView, int numberOfRows = 4)
{
    string msg = string.Format("DataView: Showing {0} rows with the columns", numberOfRows.ToString());
    display(msg);
          
    var rows = mlContext.Data.CreateEnumerable<TaxiTrip>(dataView, reuseRowObject: false)
                    .Take(numberOfRows)
                    .ToList();
    
    return rows;
}

display(h4("Showing a few rows from training DataView:"));

var fewRows = Head(mlContext, trainDataView, 5);
display(fewRows);

DataView: Showing 5 rows with the columns

index,VendorId,RateCode,PassengerCount,TripTime,TripDistance,PaymentType,FareAmount
0,CMT,1,1,1271,3.8,CRD,17.5
1,CMT,1,1,474,1.5,CRD,8.0
2,CMT,1,1,637,1.4,CRD,8.5
3,CMT,1,1,181,0.6,CSH,4.5
4,CMT,1,1,661,1.1,CRD,8.5


## Extract important input variables as arrays to be used for plotting

In [5]:
//Extract some data into arrays for plotting:

int numberOfRows = 1000;
float[] fares = trainDataView.GetColumn<float>("FareAmount").Take(numberOfRows).ToArray();
float[] distances = trainDataView.GetColumn<float>("TripDistance").Take(numberOfRows).ToArray();
float[] times = trainDataView.GetColumn<float>("TripTime").Take(numberOfRows).ToArray();
float[] passengerCounts = trainDataView.GetColumn<float>("PassengerCount").Take(numberOfRows).ToArray();

## Show a histogram: Distribution of taxi trips per fare cost 

In [6]:
// Distribution of taxi trips per cost
//XPlot Histogram reference: http://tpetricek.github.io/XPlot/reference/xplot-plotly-graph-histogram.html

var faresHistogram = Chart.Plot(new Graph.Histogram(){x = fares, autobinx = false, nbinsx = 20});
var layout = new Layout.Layout(){title="Distribution of taxi trips per cost"};
faresHistogram.WithLayout(layout);
faresHistogram.WithXTitle("Fare ranges");
faresHistogram.WithYTitle("Number of trips");
display(faresHistogram);


##  Plot Time vs. Distance with different color on Fares cost-frame

In [7]:
// Plot Time vs. Distance with different color on Fares cost-frame

var chart = Chart.Plot(
    new Graph.Scatter()
    {
        x = times,
        y = distances,
        mode = "markers",
        marker = new Graph.Marker()
        {
            color = fares,
            colorscale = "Jet"
        }
    }
);

var layout = new Layout.Layout(){title="Plot Time vs. Distance & color scale on Fares"};
chart.WithLayout(layout);
chart.Width = 500;
chart.Height = 500;
chart.WithXTitle("Time");
chart.WithYTitle("Distance");
chart.WithLegend(false);

display(chart);


## Plot Fares depending on trip's Time 

In [8]:
// Plot Fare depending on Time

var chartFareVsTime = Chart.Plot(
    new Graph.Scatter()
    {
        x = times,
        y = fares,
        mode = "markers",
        marker = new Graph.Marker()
        {
            color = distances,
            colorscale = "Jet"
        }
    }
);

var layout = new Layout.Layout(){title="Plot Fare depending on Time"};
chartFareVsTime.WithLayout(layout);
chartFareVsTime.Width = 500;
chartFareVsTime.Height = 500;
chartFareVsTime.WithXTitle("Time");
chartFareVsTime.WithYTitle("Fares");
chartFareVsTime.WithLegend(false);

display(chartFareVsTime);

# Plot Fares depending on trip's Distance 

In [9]:
// Plot Fare depending on Distance 
var chartFareVsDist = Chart.Plot(
    new Graph.Scatter()
    {
        x = distances,
        y = fares,
        mode = "markers",
        marker = new Graph.Marker()
        {
            color = times,
            colorscale = "Jet"
        }
    }
);
var layout = new Layout.Layout(){title="Plot Fare depending on Distance"};
chartFareVsDist.WithLayout(layout);
chartFareVsDist.Width = 500;
chartFareVsDist.Height = 500;
chartFareVsDist.WithXTitle("Distance");
chartFareVsDist.WithYTitle("Fares");
chartFareVsDist.WithLegend(false);
display(chartFareVsDist);

# Plot Fares depending on trip's passengers 

In [10]:
// Plot Fare depending on Passengers

int numberOfRows = 2000;
float[] fares = trainDataView.GetColumn<float>("FareAmount").Take(numberOfRows).ToArray();
float[] passengerCounts = trainDataView.GetColumn<float>("PassengerCount").Take(numberOfRows).ToArray();

float[] distances = trainDataView.GetColumn<float>("TripDistance").Take(numberOfRows).ToArray();
float[] times = trainDataView.GetColumn<float>("TripTime").Take(numberOfRows).ToArray();

var chartFareVsPassengers = Chart.Plot(
    new Graph.Scatter()
    {
        x = passengerCounts,
        y = fares,
        mode = "markers",
    }
);

var layout = new Layout.Layout(){title="Plot Fare depending on Passengers"};
chartFareVsPassengers.WithLayout(layout);
chartFareVsPassengers.Width = 500;
chartFareVsPassengers.Height = 500;
chartFareVsPassengers.WithXTitle("Passengers");
chartFareVsPassengers.WithYTitle("Fares");
chartFareVsPassengers.WithLegend(false);

display(chartFareVsPassengers);

### Implementation of PeekTransformedData() function

In [11]:
// Util method to show transformed data in IDataView
public static void PeekTransformedData(MLContext mlContext, IDataView dataView, IEstimator<ITransformer> pipeline, int numberOfRows = 4)
{
    string msg = string.Format("Peek data in DataView: Showing {0} rows with the columns", numberOfRows.ToString());
    display(msg);
    
    var transformer = pipeline.Fit(dataView);
    var transformedData = transformer.Transform(dataView);
    
    // 'transformedData' is a 'promise' of data, lazy-loading. call Preview  
    //and iterate through the returned collection from preview.
    var preViewTransformedData = transformedData.Preview(maxRows: numberOfRows);
    
    //display(preViewTransformedData.RowView);
                
    foreach (var row in preViewTransformedData.RowView)
    {
        var ColumnCollection = row.Values;
        string lineToPrint = "Row--> ";
        foreach (KeyValuePair<string, object> column in ColumnCollection)
        {
            lineToPrint += $"| {column.Key}:{column.Value}";
        }
        display(lineToPrint + "\n");
    }   
}






# Data transformations pipeline for ML.NET model

In [13]:
display(h1("Apply Data Transformations pipeline"));

// STEP 2: Common data process configuration with pipeline data transformations
var dataProcessPipeline = mlContext.Transforms.Categorical.OneHotEncoding(outputColumnName: "VendorIdEncoded", inputColumnName: nameof(TaxiTrip.VendorId))
                  .Append(mlContext.Transforms.Categorical.OneHotEncoding(outputColumnName: "RateCodeEncoded", inputColumnName: nameof(TaxiTrip.RateCode)))
                  .Append(mlContext.Transforms.Categorical.OneHotEncoding(outputColumnName: "PaymentTypeEncoded",inputColumnName: nameof(TaxiTrip.PaymentType)))
                  .Append(mlContext.Transforms.NormalizeMeanVariance(outputColumnName: nameof(TaxiTrip.PassengerCount)))
                  .Append(mlContext.Transforms.NormalizeMeanVariance(outputColumnName: nameof(TaxiTrip.TripTime)))
                  .Append(mlContext.Transforms.NormalizeMeanVariance(outputColumnName: nameof(TaxiTrip.TripDistance)))
                  .Append(mlContext.Transforms.Concatenate("Features", "VendorIdEncoded", "RateCodeEncoded", "PaymentTypeEncoded", 
                                                           nameof(TaxiTrip.PassengerCount), nameof(TaxiTrip.TripTime), nameof(TaxiTrip.TripDistance)));

display(h3("Show transformed data..."));

PeekTransformedData(mlContext, trainDataView, dataProcessPipeline, 5);


Peek data in DataView: Showing 5 rows with the columns

Row--> | VendorId:CMT| RateCode:1| PassengerCount:1| PassengerCount:0.42661828| TripTime:1271| TripTime:1.514717| TripDistance:3.8| TripDistance:0.87563246| PaymentType:CRD| FareAmount:17.5| VendorIdEncoded:1| VendorIdEncoded:Sparse vector of size 2, 1 explicit values| RateCodeEncoded:1| RateCodeEncoded:Sparse vector of size 6, 1 explicit values| PaymentTypeEncoded:1| PaymentTypeEncoded:Sparse vector of size 5, 1 explicit values| Features:Sparse vector of size 16, 6 explicit values


Row--> | VendorId:CMT| RateCode:1| PassengerCount:1| PassengerCount:0.42661828| TripTime:474| TripTime:0.5648905| TripDistance:1.5| TripDistance:0.3456444| PaymentType:CRD| FareAmount:8| VendorIdEncoded:1| VendorIdEncoded:Sparse vector of size 2, 1 explicit values| RateCodeEncoded:1| RateCodeEncoded:Sparse vector of size 6, 1 explicit values| PaymentTypeEncoded:1| PaymentTypeEncoded:Sparse vector of size 5, 1 explicit values| Features:Sparse vector of size 16, 6 explicit values


Row--> | VendorId:CMT| RateCode:1| PassengerCount:1| PassengerCount:0.42661828| TripTime:637| TripTime:0.7591461| TripDistance:1.4| TripDistance:0.32260144| PaymentType:CRD| FareAmount:8.5| VendorIdEncoded:1| VendorIdEncoded:Sparse vector of size 2, 1 explicit values| RateCodeEncoded:1| RateCodeEncoded:Sparse vector of size 6, 1 explicit values| PaymentTypeEncoded:1| PaymentTypeEncoded:Sparse vector of size 5, 1 explicit values| Features:Sparse vector of size 16, 6 explicit values


Row--> | VendorId:CMT| RateCode:1| PassengerCount:1| PassengerCount:0.42661828| TripTime:181| TripTime:0.21570714| TripDistance:0.6| TripDistance:0.13825777| PaymentType:CSH| FareAmount:4.5| VendorIdEncoded:1| VendorIdEncoded:Sparse vector of size 2, 1 explicit values| RateCodeEncoded:1| RateCodeEncoded:Sparse vector of size 6, 1 explicit values| PaymentTypeEncoded:2| PaymentTypeEncoded:Sparse vector of size 5, 1 explicit values| Features:Sparse vector of size 16, 6 explicit values


Row--> | VendorId:CMT| RateCode:1| PassengerCount:1| PassengerCount:0.42661828| TripTime:661| TripTime:0.78774816| TripDistance:1.1| TripDistance:0.25347257| PaymentType:CRD| FareAmount:8.5| VendorIdEncoded:1| VendorIdEncoded:Sparse vector of size 2, 1 explicit values| RateCodeEncoded:1| RateCodeEncoded:Sparse vector of size 6, 1 explicit values| PaymentTypeEncoded:1| PaymentTypeEncoded:Sparse vector of size 5, 1 explicit values| Features:Sparse vector of size 16, 6 explicit values






# Append the trainer/algorithm to pipeline and train the model

In [14]:
%%time
display(h1("Build Training Pipeline and Train the model"));
display(h4("Creating the Training Pipeline with trainer/algorithm"));

// STEP 3: Set the training algorithm - Selected Trainer (SDCA Regression algorithm)                            
var trainer = mlContext.Regression.Trainers.Sdca(labelColumnName: "FareAmount", featureColumnName: "Features");
var trainingPipeline = dataProcessPipeline.Append(trainer);

// STEP 4: Train the model fitting to the DataSet
//The pipeline is trained on the dataset that has been loaded and transformed.
display("=============== Training the model ===============");
var trainedModel = trainingPipeline.Fit(trainDataView);



(1,1): error CS1525: El término de expresión '%' no es válido

(1,2): error CS1525: El término de expresión '%' no es válido

(1,7): error CS1002: Se esperaba ;



Cell not executed: compilation error

## Make predictions in bulk from the TestDataset to be used for the metrics

In [30]:
// Make predictions to plot agaist actual values
display(h3("===== Making predictions in bulk for the whole Test Dataset ====="));
// Make predictions in bulk (Transformed IDataView will have the predictions plus the actual/true values)
IDataView predictionsDataView = trainedModel.Transform(testDataView);


## Display the metrics (Model quality evaluation)

In [31]:
display(h3("===== Evaluating Model's accuracy with Test dataset ====="));

var metrics = mlContext.Regression.Evaluate(predictionsDataView, labelColumnName: "FareAmount", scoreColumnName: "Score");
display(metrics);

MeanAbsoluteError,MeanSquaredError,RootMeanSquaredError,LossFunction,RSquared
0.7697523625564575,35.35930367724503,5.946368948967515,35.35930408053133,0.6980621928339114


## Bar chart showing 'Actual fares vs. Predicted fares Comparison' 

In [15]:
// Number of rows to use for Bar chart
int totalNumberForBarChart  = 20;

float[] actualFares = predictionsDataView.GetColumn<float>("FareAmount").Take(totalNumberForBarChart).ToArray();
float[] predictionFares = predictionsDataView.GetColumn<float>("Score").Take(totalNumberForBarChart).ToArray();
int[] elements = Enumerable.Range(0, totalNumberForBarChart).ToArray();

// Define group for Actual values 
var ActualValuesGroupBarGraph = new Graph.Bar()
{
    x = elements,
    y = actualFares,
    name = "Actual"
};

// Define group for Prediction values 
var PredictionValuesGroupBarGraph = new Graph.Bar()
{
    x = elements,
    y = predictionFares,
    name = "Predicted"
};

var chart = Chart.Plot(new[] {ActualValuesGroupBarGraph, PredictionValuesGroupBarGraph});

var layout = new Layout.Layout(){barmode = "group", title="Actual fares vs. Predicted fares Comparison"};
chart.WithLayout(layout);
chart.WithXTitle("Cases");
chart.WithYTitle("Fare");
chart.WithLegend(true);
chart.Width = 700;
chart.Height = 400;

display(chart);


(4,23): error CS0103: El nombre 'predictionsDataView' no existe en el contexto actual

(5,27): error CS0103: El nombre 'predictionsDataView' no existe en el contexto actual



Cell not executed: compilation error

### Implementation of CalculateRegressionLine() function

In [40]:
// Function to calculate the regression line 
// (This function could be substituted by a pre-built Math function from a NuGet such as Math.NET)

public static (double[], double[]) CalculateRegressionLine(float[] actualFares, float[] predictionFares, int totalNumber)
{   
    // Regression Line calculation explanation:
    // https://www.khanacademy.org/math/statistics-probability/describing-relationships-quantitative-data/more-on-regression/v/regression-line-example
    // Generic function for Y for the regression line
    // y = (m * x) + b;
    // Similar code: https://gist.github.com/tansey/1375526 

    double yTotal = 0;
    double xTotal = 0;
    double xyMultiTotal = 0;
    double xSquareTotal = 0;

    for (int i = 0; i < (actualFares.Length); i++)
    {
        var x = actualFares[i];
        var y = predictionFares[i];

        xTotal += x;
        yTotal += y;

        double multi = x * y;
        xyMultiTotal += multi;

        double xSquare = x * x;
        xSquareTotal += xSquare;

        double ySquare = y * y;

        //display($"-------------------------------------------------");
        //display($"Predicted : {y}");
        //display($"Actual:    {x}");
        //display($"-------------------------------------------------");
    }

    double minY = yTotal / totalNumber;
    double minX = xTotal / totalNumber;
    double minXY = xyMultiTotal / totalNumber;
    double minXsquare = xSquareTotal / totalNumber;

    double m = ((minX * minY) - minXY) / ((minX * minX) - minXsquare);

    double b = minY - (m * minX);

    //Generic function for Y for the regression line
    // y = (m * x) + b;

    // Start x on 0
    double x1 = 0;
    //Function for Y1 in the line
    double y1 = (m * x1) + b;

    // Get the max val of X or Y for our X in the line so the line is long enough for outliers
    var maxValueForX = Math.Max(actualFares.Max(), predictionFares.Max());

    double x2 = maxValueForX;
    //Function for Y2 in the line
    double y2 = (m * x2) + b;

    // Extract/create two simple arrays for the line coordinates
    var xArray = new double[2];
    var yArray = new double[2];
    xArray[0] = x1;
    yArray[0] = y1;
    xArray[1] = x2;
    yArray[1] = y2;
            
    return(xArray, yArray);
}


# Plotting Regression line and Predictions vs. Actual values

In [41]:
using XPlot.Plotly;

// Number of rows to use for Plotting the Regression chart
int totalNumber  = 500;

float[] actualFares = predictionsDataView.GetColumn<float>("FareAmount").Take(totalNumber).ToArray();
float[] predictionFares = predictionsDataView.GetColumn<float>("Score").Take(totalNumber).ToArray();

// Display the Best Bit Regression Line 

// Define scatter plot grapgh (dots) 
var ActualVsPredictedGraph = new Graph.Scatter()
{
    x = actualFares,
    y = predictionFares,
    mode = "markers",
    marker = new Graph.Marker() { color = "purple"} //"rgb(142, 124, 195)"             
};

// Calculate Regression line
// Get a touple with the two X and two Y values determining the regression line
(double[] xArray, double[] yArray) = CalculateRegressionLine(actualFares, predictionFares, totalNumber);

//display("Display values defining the regression line");
//display(xArray);
//display(yArray);

// Define grapgh for the line 
var regressionLine = new Graph.Scatter()
{
    x = xArray,
    y = yArray,
    mode = "lines"
};


// 'Perfect' line, 45 degrees (Predicted values equal to actual values)
var maximumValue = Math.Max(actualFares.Max(), predictionFares.Max());

var perfectLine = new Graph.Scatter()
{
    x = new[] {0, maximumValue},
    y = new[] {0, maximumValue},
    mode = "lines",
    line = new Graph.Line(){color = "grey"}
};
//////

// XPlot Charp samples: https://fslab.org/XPlot/chart/plotly-line-scatter-plots.html 
//Display the chart's figures
var chart = Chart.Plot(new[] {ActualVsPredictedGraph, regressionLine, perfectLine });
chart.WithXTitle("Actual Values");
chart.WithYTitle("Predicted Values");
chart.WithLegend(true);
chart.WithLabels(new[]{"Prediction vs. Actual", "Regression Line", "Perfect Regression Line"});
chart.Width = 700;
chart.Height = 600;

display(chart);



# Save the ML model as a file

In [42]:
display(h1("Saving the ML.NET Model as a file..."));

string modelFilePath = "./MLRegressionModel.zip";

// GetAbsolutePath(modelRelativePath)
mlContext.Model.Save(trainedModel, trainDataView.Schema, modelFilePath);

display(h3($"The model was saved to: {modelFilePath}"));
