In [1]:
#r "nuget:Microsoft.ML"
#r "nuget:CSTNet"

Installed package CSTNet version 1.0.0

Installed package Microsoft.ML version 1.5.2

In [1]:
using System;
using System.IO;
using System.Collections.Generic;
using CSTNet;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;

In [1]:
/// <summary>
/// Input row for the text pipeline: one document per instance.
/// </summary>
class Input 
{
    /// <summary>
    /// Raw text of the document. The pipeline in this notebook reads it
    /// through the input column named "Text".
    /// </summary>
    public string Text { get; set; }
}

/// <summary>
/// Prediction result type used with the prediction engine in this notebook.
/// </summary>
class Output
{
    /// <summary>
    /// Bag-of-words vector produced by the pipeline's "BagOfWords" output
    /// column; one value per extracted n-gram slot.
    /// </summary>
    public float[] BagOfWords { get; set; }
}

In [1]:
// Shared ML.NET context; every catalog used by the later cells hangs off it.
MLContext ctx = new MLContext();

In [1]:
// Folder named "data" directly under the current working directory.
string path = Path.Combine(Environment.CurrentDirectory, "data");

// Read the whole translations file into memory as a single string.
string translations = File.ReadAllText(Path.Combine(path, "translations.cst"));

// Single-row dataset: the text CaretSeparatedText.Parse returns for the
// "qbfakina" argument (presumably a lookup key — confirm against CSTNet docs).
var data = new List<Input>();
data.Add(new Input { Text = CaretSeparatedText.Parse(translations, "qbfakina") });

In [1]:
// Wrap the in-memory rows in an IDataView so ML.NET can consume them.
IDataView dataView = ctx.Data.LoadFromEnumerable(data);

// Word-bag estimator: reads the "Text" column and writes "BagOfWords".
// ngramLength 1 with useAllLengths false requests single-word n-grams only,
// weighted with the Tf criterion.
var pipeline = ctx.Transforms.Text.ProduceWordBags(
    outputColumnName: "BagOfWords",
    inputColumnName: "Text",
    ngramLength: 1,
    useAllLengths: false,
    weighting: NgramExtractingEstimator.WeightingCriteria.Tf);

// Fit on the data, then apply the fitted transformer to the same data.
var fit = pipeline.Fit(dataView);
IDataView transform = fit.Transform(dataView);

// Single-row prediction path: map the first Input straight to an Output.
PredictionEngine<Input, Output> engine = ctx.Model.CreatePredictionEngine<Input, Output>(fit);
Output predict = engine.Predict(data[0]);

In [1]:
// Buffer that receives the slot names of the "BagOfWords" column.
var slotNames = default(VBuffer<ReadOnlyMemory<char>>);
transform.Schema["BagOfWords"].GetSlotNames(ref slotNames);

// Enumerable over the "BagOfWords" vector of each row in the transformed data.
IEnumerable<VBuffer<float>> bagOfWordsCol =
    transform.GetColumn<VBuffer<float>>(transform.Schema["BagOfWords"]);

In [1]:
// Print the n-gram behind every entry of each row's bag-of-words vector.
// The header is written with WriteLine so the first n-gram does not end up
// on the header's line (consistent with the "Word counts" cell).
Console.WriteLine("NGrams: ");
foreach (var row in bagOfWordsCol)
{
    foreach (var item in row.Items())
    {
        // item.Key is a logical slot index. GetItemOrDefault resolves it for
        // both dense and sparse slot-name buffers, whereas indexing
        // GetValues() directly is only valid when the buffer is dense.
        var ngram = slotNames.GetItemOrDefault(item.Key);
        Console.WriteLine($"{ngram}");
    }
}

NGrams: 

ti


cufa


bopá


tuti


jinatu


otá


nali


rohó.


In [1]:
// Print every value of the predicted bag-of-words vector, in slot order,
// formatted with four decimal places.
Console.WriteLine("Word counts: ");
foreach (float count in predict.BagOfWords)
{
    Console.WriteLine($"{count:F4}");
}

Word counts: 


1.0000


1.0000


1.0000


1.0000


1.0000


1.0000


1.0000


1.0000
