From ad665c44447f2920e79fc31d8c8e32deb4e19abc Mon Sep 17 00:00:00 2001 From: Piotr Stachaczynski Date: Mon, 21 Apr 2025 22:09:25 +0200 Subject: [PATCH 1/4] feat: improve KM performance --- Examples/Examples.SimpleConsole/Program.cs | 23 +- Releases/0.1.5.md | 3 + src/MaIN.Core/.nuspec | 2 +- src/MaIN.Core/Hub/Contexts/ChatContext.cs | 16 +- .../Constants/ServiceConstants.cs | 1 + .../Services/LLMService/ChatMemoryOptions.cs | 11 + .../LLMService/Memory/DocumentProcessor.cs | 841 ++++++++++++++++++ .../LLMService/Memory/MemoryService.cs | 25 +- .../Services/LLMService/OpenAiService.cs | 10 - .../Services/LLMService/Utils/ChatHelper.cs | 5 +- 10 files changed, 917 insertions(+), 20 deletions(-) create mode 100644 Releases/0.1.5.md create mode 100644 src/MaIN.Services/Services/LLMService/ChatMemoryOptions.cs create mode 100644 src/MaIN.Services/Services/LLMService/Memory/DocumentProcessor.cs diff --git a/Examples/Examples.SimpleConsole/Program.cs b/Examples/Examples.SimpleConsole/Program.cs index d1c17f64..0e164459 100644 --- a/Examples/Examples.SimpleConsole/Program.cs +++ b/Examples/Examples.SimpleConsole/Program.cs @@ -1,10 +1,25 @@ using MaIN.Core; using MaIN.Core.Hub; +using MaIN.Domain.Entities; MaINBootstrapper.Initialize(); -await AIHub.Chat() - .WithModel("gemma2:2b") - .WithMessage("Hello, World!") - .CompleteAsync(interactive: true); +// await AIHub.Chat() +// .WithModel("gemma2:2b") +// .WithMessage("Hello, World!") +// .CompleteAsync(interactive: true); + +var result = AIHub.Chat() + .WithModel("llama3.2:3b") + .WithMessage( + "Write this invoice as JSON.") + .WithMemoryParams(new MemoryParams() + { + AnswerTokens = 2000, + ContextSize = 4500, + }) + .WithFiles(["3.pdf"], preProcess: true); + +var chatResult = await result.CompleteAsync(); +Console.WriteLine(chatResult.Message.Content); diff --git a/Releases/0.1.5.md b/Releases/0.1.5.md new file mode 100644 index 00000000..5e6aa22f --- /dev/null +++ b/Releases/0.1.5.md @@ -0,0 +1,3 @@ +# 0.1.5 release + +- Enable pre processing of documents, it can greatly improve KM performance on small models \ No newline at end of file diff --git a/src/MaIN.Core/.nuspec b/src/MaIN.Core/.nuspec index 5d207368..3934eb82 100644 --- a/src/MaIN.Core/.nuspec +++ b/src/MaIN.Core/.nuspec @@ -2,7 +2,7 @@ MaIN.NET - 0.1.4 + 0.1.5 Wisedev Wisedev favicon.png diff --git a/src/MaIN.Core/Hub/Contexts/ChatContext.cs b/src/MaIN.Core/Hub/Contexts/ChatContext.cs index a61d05a0..480b4889 100644 --- a/src/MaIN.Core/Hub/Contexts/ChatContext.cs +++ b/src/MaIN.Core/Hub/Contexts/ChatContext.cs @@ -1,5 +1,6 @@ using MaIN.Domain.Entities; using MaIN.Domain.Models; +using MaIN.Services.Constants; using MaIN.Services.Dtos; using MaIN.Services.Services.Abstract; using MaIN.Services.Services.Models; @@ -10,6 +11,7 @@ namespace MaIN.Core.Hub.Contexts; public class ChatContext { private readonly IChatService _chatService; + private bool _preProcess; private Chat _chat { get; set; } private List _files { get; set; } @@ -84,7 +86,7 @@ public ChatContext WithSystemPrompt(string systemPrompt) return this; } - public ChatContext WithFiles(List fileStreams) + public ChatContext WithFiles(List fileStreams, bool preProcess = false) { var files = fileStreams.Select(p => new FileInfo() { @@ -94,17 +96,19 @@ public ChatContext WithFiles(List fileStreams) StreamContent = p }).ToList(); + _preProcess = preProcess; _files = files; return this; } - public ChatContext WithFiles(List files) + public ChatContext WithFiles(List files, bool preProcess = false) { _files = files; + _preProcess = preProcess; return this; } - public ChatContext WithFiles(List filePaths) + public ChatContext WithFiles(List filePaths, bool preProcess = false) { var files = filePaths.Select(p => new FileInfo() { @@ -113,6 +117,7 @@ public ChatContext WithFiles(List filePaths) Extension = Path.GetExtension(p) }).ToList(); + _preProcess = preProcess; _files = files; return this; } @@ -135,6 +140,11 @@ public async Task CompleteAsync( throw new InvalidOperationException("Chat has no messages."); //TODO good candidate for domain exception } _chat.Messages.Last().Files = _files; + if(_preProcess) + { + _chat.Messages.Last().Properties.Add(ServiceConstants.Messages.PreProcessProperty, string.Empty); + } + if (!await ChatExists(_chat.Id)) { await _chatService.Create(_chat); diff --git a/src/MaIN.Services/Constants/ServiceConstants.cs b/src/MaIN.Services/Constants/ServiceConstants.cs index 65670d70..385c4ca8 100644 --- a/src/MaIN.Services/Constants/ServiceConstants.cs +++ b/src/MaIN.Services/Constants/ServiceConstants.cs @@ -20,6 +20,7 @@ public static class ApiUrls public static class Messages { public const string GeneratedImageContent = "Generated Image:"; + public const string PreProcessProperty = "Pre_Process"; } public static class Defaults diff --git a/src/MaIN.Services/Services/LLMService/ChatMemoryOptions.cs b/src/MaIN.Services/Services/LLMService/ChatMemoryOptions.cs new file mode 100644 index 00000000..da0b3610 --- /dev/null +++ b/src/MaIN.Services/Services/LLMService/ChatMemoryOptions.cs @@ -0,0 +1,11 @@ +namespace MaIN.Services.Services.LLMService; + +public class ChatMemoryOptions +{ + public Dictionary? TextData { get; set; } + public Dictionary? FileData { get; set; } + public Dictionary? StreamData { get; set; } + public List? WebUrls { get; set; } + public List? Memory { get; set; } + public bool PreProcess { get; set; } = false; +} \ No newline at end of file diff --git a/src/MaIN.Services/Services/LLMService/Memory/DocumentProcessor.cs b/src/MaIN.Services/Services/LLMService/Memory/DocumentProcessor.cs new file mode 100644 index 00000000..da9075f2 --- /dev/null +++ b/src/MaIN.Services/Services/LLMService/Memory/DocumentProcessor.cs @@ -0,0 +1,841 @@ +using System.Text; +using System.Text.RegularExpressions; +using DocumentFormat.OpenXml.Packaging; +using DocumentFormat.OpenXml.Spreadsheet; +using DocumentFormat.OpenXml.Wordprocessing; +using Tesseract; +using UglyToad.PdfPig; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Core; +using UglyToad.PdfPig.Geometry; +using Text = DocumentFormat.OpenXml.Wordprocessing.Text; + +namespace MaIN.Services.Services.LLMService.Memory; + +public static class DocumentProcessor +{ + public static string ProcessDocument(string filePath) + { + string extension = Path.GetExtension(filePath).ToLower(); + + return extension switch + { + ".pdf" => ProcessPdf(filePath), + ".docx" => ProcessDocx(filePath), + ".xlsx" or ".xls" => ProcessExcel(filePath), + ".jpg" or ".jpeg" or ".png" or ".tiff" or ".bmp" => ProcessImage(filePath), + ".txt" => ProcessTextFile(filePath), + ".rtf" => ProcessRtf(filePath), + ".html" or ".htm" => ProcessHtml(filePath), + _ => throw new NotSupportedException($"Format {extension} not supported") + }; + } + + private static string ProcessPdf(string pdfPath) + { + var result = new StringBuilder(); + + using var document = PdfDocument.Open(pdfPath); + foreach (var page in document.GetPages()) + { + var words = page.GetWords().ToList(); + var tableRegions = FindTableRegions(words); + + var nonTableWords = words.Where(w => + !tableRegions.Any(r => r.Contains(w.BoundingBox))).ToList(); + + if (nonTableWords.Any()) + { + var textContent = ProcessTextContent(nonTableWords); + result.AppendLine(textContent); + } + + foreach (var region in tableRegions) + { + var tableWords = words.Where(w => region.Contains(w.BoundingBox)).ToList(); + var tableMarkdown = CreateMarkdownTable(tableWords); + result.AppendLine(tableMarkdown); + result.AppendLine(); + } + + result.AppendLine("---"); + } + + return result.ToString(); + } + + private static List FindTableRegions(List words) + { + var regions = new List(); + var rows = words + .GroupBy(w => Math.Round(w.BoundingBox.Bottom, 1)) + .OrderByDescending(g => g.Key) + .ToList(); + + for (int i = 0; i < rows.Count - 2; i++) + { + var row1 = rows[i].OrderBy(w => w.BoundingBox.Left).ToList(); + var row2 = rows[i + 1].OrderBy(w => w.BoundingBox.Left).ToList(); + var row3 = rows[i + 2].OrderBy(w => w.BoundingBox.Left).ToList(); + + if (HasColumnAlignment(row1, row2, row3)) + { + int startRow = i; + int endRow = i + 2; + + for (int j = i + 3; j < rows.Count; j++) + { + var nextRow = rows[j].OrderBy(w => w.BoundingBox.Left).ToList(); + if (HasColumnAlignment(row3, rows[j - 1].OrderBy(w => w.BoundingBox.Left).ToList(), nextRow)) + { + endRow = j; + } + else + { + break; + } + } + + var tableWords = new List(); + for (int r = startRow; r <= endRow; r++) + { + tableWords.AddRange(rows[r]); + } + + double left = tableWords.Min(w => w.BoundingBox.Left) - 5; + double bottom = tableWords.Min(w => w.BoundingBox.Bottom) - 5; + double right = tableWords.Max(w => w.BoundingBox.Right) + 5; + double top = tableWords.Max(w => w.BoundingBox.Top) + 5; + + regions.Add(new PdfRectangle(left, bottom, right, top)); + i = endRow; + } + } + + return regions; + } + + private static bool HasColumnAlignment(List row1, List row2, List row3) + { + var xPos1 = row1.Select(w => w.BoundingBox.Left).ToList(); + var xPos2 = row2.Select(w => w.BoundingBox.Left).ToList(); + var xPos3 = row3.Select(w => w.BoundingBox.Left).ToList(); + + double tolerance = 10.0; + int alignedCount = 0; + + foreach (var x1 in xPos1) + { + bool aligned2 = xPos2.Any(x2 => Math.Abs(x1 - x2) < tolerance); + bool aligned3 = xPos3.Any(x3 => Math.Abs(x1 - x3) < tolerance); + + if (aligned2 && aligned3) + { + alignedCount++; + } + } + + return alignedCount >= 2 && row1.Count >= 2 && row2.Count >= 2 && row3.Count >= 2; + } + + private static string CreateMarkdownTable(List tableWords) + { + var rows = tableWords + .GroupBy(w => Math.Round(w.BoundingBox.Bottom, 1)) + .OrderByDescending(g => g.Key) + .ToList(); + + if (!rows.Any()) return string.Empty; + + var columnPositions = GetColumnPositions(rows); + int columnCount = columnPositions.Count; + + if (columnCount < 2) + { + columnCount = 2; + columnPositions = new List { tableWords.Max(w => w.BoundingBox.Left) / 2 }; + } + + var sb = new StringBuilder(); + sb.Append("|"); + var headerRow = rows.First().OrderBy(w => w.BoundingBox.Left).ToList(); + + for (int i = 0; i < columnCount; i++) + { + double start = i == 0 ? 0 : columnPositions[i - 1]; + double end = i < columnPositions.Count - 1 ? columnPositions[i] : double.MaxValue; + + var cellWords = headerRow + .Where(w => w.BoundingBox.Left >= start && w.BoundingBox.Left < end) + .OrderBy(w => w.BoundingBox.Left) + .ToList(); + + string cellText = string.Join(" ", cellWords.Select(w => w.Text)); + sb.Append($" {CleanCellText(cellText)} ;"); + } + + sb.AppendLine(); + sb.Append("|"); + + for (int i = 0; i < columnCount; i++) + { + sb.Append(" --- |"); + } + + sb.AppendLine(); + + foreach (var row in rows.Skip(1)) + { + sb.Append("|"); + var rowWords = row.OrderBy(w => w.BoundingBox.Left).ToList(); + + for (int i = 0; i < columnCount; i++) + { + double start = i == 0 ? 0 : columnPositions[i - 1]; + double end = i < columnPositions.Count - 1 ? columnPositions[i] : double.MaxValue; + + var cellWords = rowWords + .Where(w => w.BoundingBox.Left >= start && w.BoundingBox.Left < end) + .OrderBy(w => w.BoundingBox.Left) + .ToList(); + + string cellText = string.Join(" ", cellWords.Select(w => w.Text)); + sb.Append($" {CleanCellText(cellText)} |"); + } + + sb.AppendLine(); + } + + return sb.ToString(); + } + + private static List GetColumnPositions(List> rows) + { + var allPositions = new List(); + foreach (var row in rows) + { + allPositions.AddRange(row.Select(w => w.BoundingBox.Left)); + } + + var tolerance = 10.0; + var clusters = new List>(); + + foreach (var pos in allPositions.OrderBy(p => p)) + { + bool added = false; + foreach (var cluster in clusters) + { + if (Math.Abs(cluster.Average() - pos) < tolerance) + { + cluster.Add(pos); + added = true; + break; + } + } + + if (!added) + { + clusters.Add(new List { pos }); + } + } + + return clusters + .Where(c => c.Count > Math.Max(2, rows.Count / 4)) + .OrderBy(c => c.Average()) + .Select(c => c.Average()) + .ToList(); + } + + private static string ProcessTextContent(List words) + { + var sb = new StringBuilder(); + var rows = words + .GroupBy(w => Math.Round(w.BoundingBox.Bottom, 1)) + .OrderByDescending(g => g.Key); + + double prevRowHeight = 0; + + foreach (var row in rows) + { + string line = string.Join(" ", row.OrderBy(w => w.BoundingBox.Left).Select(w => w.Text)); + + if (string.IsNullOrWhiteSpace(line)) continue; + + var firstWord = row.OrderBy(w => w.BoundingBox.Left).FirstOrDefault(); + double wordHeight = 0; + + if (firstWord != null) + { + wordHeight = firstWord.BoundingBox.Height; + var letters = firstWord.Letters.ToList(); + + if (letters.Any()) + { + double estimatedFontSize = letters.First().FontSize; + if (estimatedFontSize > 0) + { + wordHeight = estimatedFontSize; + } + } + } + + if (wordHeight > prevRowHeight * 1.2 && wordHeight > 10) + { + if (wordHeight > 14) + { + sb.AppendLine($"## {line}"); + } + else + { + sb.AppendLine($"### {line}"); + } + } + else if (line.TrimStart().StartsWith("•") || line.TrimStart().StartsWith("-") || + Regex.IsMatch(line.TrimStart(), @"^\d+\.")) + { + int index = Math.Max(1, line.TrimStart().IndexOfAny(['•', '-', '.'])); + if (index < line.TrimStart().Length - 1) + { + sb.AppendLine($"* {line.TrimStart().Substring(index + 1).Trim()}"); + } + else + { + sb.AppendLine($"* {line.TrimStart()}"); + } + } + else + { + sb.AppendLine(line); + sb.AppendLine(); + } + + prevRowHeight = wordHeight; + } + + return sb.ToString(); + } + + private static string CleanCellText(string text) + { + if (string.IsNullOrWhiteSpace(text)) return ""; + + text = text.Trim(); + text = Regex.Replace(text, @"\s+", " "); + text = text.Replace("|", "\\|"); + + return text; + } + + private static bool IsLikelyHeader(IEnumerable row, IEnumerable nextRow) + { + var enumerable = row.ToList(); + var words = nextRow as Word[] ?? nextRow.ToArray(); + if (!enumerable.Any() || !words.Any()) + return false; + + bool hasLettersFontInfo = enumerable.Any(w => w.Letters.Any()); + bool nextHasLettersFontInfo = words.Any(w => w.Letters.Any()); + + if (hasLettersFontInfo && nextHasLettersFontInfo) + { + double rowAvgFontSize = enumerable + .SelectMany(w => w.Letters) + .Where(l => l.FontSize > 0) + .Select(l => l.FontSize) + .DefaultIfEmpty(0) + .Average(); + + double nextRowAvgFontSize = words + .SelectMany(w => w.Letters) + .Where(l => l.FontSize > 0) + .Select(l => l.FontSize) + .DefaultIfEmpty(0) + .Average(); + + return rowAvgFontSize > nextRowAvgFontSize * 1.2; + } + + double rowAvgHeight = enumerable.Average(w => w.BoundingBox.Height); + double nextRowAvgHeight = words.Average(w => w.BoundingBox.Height); + + return rowAvgHeight > nextRowAvgHeight * 1.2; + } + + private static string ProcessExcel(string filePath) + { + var structuredContent = new StringBuilder(); + structuredContent.AppendLine("[SPREADSHEET_START]"); + + using var document = SpreadsheetDocument.Open(filePath, false); + var workbookPart = document.WorkbookPart; + var sheets = workbookPart!.Workbook.Descendants(); + var sharedStringTable = workbookPart.SharedStringTablePart?.SharedStringTable; + + foreach (var sheet in sheets) + { + var worksheetPart = (WorksheetPart)workbookPart!.GetPartById(sheet.Id!); + var sheetData = worksheetPart.Worksheet.Elements().First(); + + structuredContent.AppendLine($"[SHEET:{sheet.Name}]"); + + if (!sheetData.Elements().Any()) + { + structuredContent.AppendLine("[EMPTY_SHEET]"); + structuredContent.AppendLine($"[/SHEET:{sheet.Name}]"); + continue; + } + + bool firstRow = true; + int maxColumns = 0; + + foreach (var row in sheetData.Elements()) + { + int cellCount = row.Elements().Count(); + maxColumns = Math.Max(maxColumns, cellCount); + } + + foreach (var row in sheetData.Elements()) + { + var rowContent = new StringBuilder("|"); + var cells = row.Elements().ToList(); + + for (int i = 0; i < maxColumns; i++) + { + string cellValue = ""; + var cell = cells.FirstOrDefault(c => GetColumnIndex(GetColumnId(c.CellReference!)) == i); + + if (cell != null) + { + cellValue = GetCellValue(cell, sharedStringTable!); + } + + rowContent.Append($" {cellValue} |"); + } + + structuredContent.AppendLine(rowContent.ToString()); + + if (firstRow && maxColumns > 0) + { + var separatorRow = new StringBuilder("|"); + for (int i = 0; i < maxColumns; i++) + { + separatorRow.Append(" --- |"); + } + + structuredContent.AppendLine(separatorRow.ToString()); + firstRow = false; + } + } + + structuredContent.AppendLine($"[/SHEET:{sheet.Name}]"); + } + + structuredContent.AppendLine("[SPREADSHEET_END]"); + return structuredContent.ToString(); + } + + private static string GetColumnId(string cellReference) + { + if (string.IsNullOrEmpty(cellReference)) + return ""; + + return new string(cellReference.TakeWhile(char.IsLetter).ToArray()); + } + + private static int GetColumnIndex(string columnId) + { + int index = 0; + foreach (char c in columnId) + { + index = (index * 26) + (c - 'A' + 1); + } + + return index - 1; + } + + private static string GetCellValue(Cell cell, SharedStringTable sharedStringTable) + { + if (cell.CellValue == null) + return string.Empty; + + string value = cell.CellValue.Text; + + if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString && sharedStringTable != null) + { + if (int.TryParse(value, out int ssid) && ssid >= 0 && ssid < sharedStringTable.Count()) + { + return sharedStringTable.ElementAt(ssid).InnerText; + } + } + + return value; + } + + private static string ProcessDocx(string filePath) + { + var structuredContent = new StringBuilder(); + structuredContent.AppendLine("[DOCUMENT_START]"); + + using var document = WordprocessingDocument.Open(filePath, false); + var body = document.MainDocumentPart?.Document.Body; + + structuredContent.AppendLine("[METADATA_START]"); + if (!string.IsNullOrEmpty(document.PackageProperties.Title)) + structuredContent.AppendLine($"Title: {document.PackageProperties.Title}"); + if (!string.IsNullOrEmpty(document.PackageProperties.Creator)) + structuredContent.AppendLine($"Author: {document.PackageProperties.Creator}"); + if (!string.IsNullOrEmpty(document.PackageProperties.Subject)) + structuredContent.AppendLine($"Subject: {document.PackageProperties.Subject}"); + structuredContent.AppendLine("[METADATA_END]"); + + foreach (var element in body?.Elements()!) + { + if (element is Paragraph paragraph) + { + string text = ExtractTextFromParagraph(paragraph); + if (string.IsNullOrWhiteSpace(text)) + continue; + + if (IsParagraphHeading(paragraph)) + { + structuredContent.AppendLine($"[HEADING]{text}[/HEADING]"); + } + else + { + structuredContent.AppendLine(text); + } + } + else if (element is DocumentFormat.OpenXml.Wordprocessing.Table table) + { + structuredContent.AppendLine("[TABLE_START]"); + FormatWordTableAsMarkdown(table, structuredContent); + structuredContent.AppendLine("[TABLE_END]"); + } + } + + structuredContent.AppendLine("[DOCUMENT_END]"); + return structuredContent.ToString(); + } + + private static string ExtractTextFromParagraph(Paragraph paragraph) + { + return string.Join(" ", paragraph.Descendants().Select(t => t.Text)); + } + + private static bool IsParagraphHeading(Paragraph paragraph) + { + var styleId = paragraph.ParagraphProperties?.ParagraphStyleId?.Val?.Value; + return styleId != null && (styleId.StartsWith("Heading") || styleId.StartsWith("Title")); + } + + private static void FormatWordTableAsMarkdown(DocumentFormat.OpenXml.Wordprocessing.Table table, + StringBuilder output) + { + bool isFirstRow = true; + + foreach (var row in table.Elements()) + { + StringBuilder rowBuilder = new StringBuilder("|"); + + foreach (var cell in row.Elements()) + { + string cellText = string.Join(" ", cell.Descendants().Select(t => t.Text)); + rowBuilder.Append($" {cellText.Trim()} |"); + } + + output.AppendLine(rowBuilder.ToString()); + + if (isFirstRow) + { + isFirstRow = false; + int cellCount = row.Elements().Count(); + StringBuilder separatorBuilder = new StringBuilder("|"); + + for (int i = 0; i < cellCount; i++) + { + separatorBuilder.Append(" --- |"); + } + + output.AppendLine(separatorBuilder.ToString()); + } + } + } + + private static string ProcessImage(string filePath) + { + var structuredContent = new StringBuilder(); + structuredContent.AppendLine("[IMAGE_DOCUMENT_START]"); + + try + { + using var engine = new TesseractEngine("./tessdata", "eng", EngineMode.Default); + using var img = Pix.LoadFromFile(filePath); + using var page = engine.Process(img); + + string text = page.GetText(); + var lines = text.Split('\n'); + bool inTable = false; + + foreach (var line in lines) + { + bool looksLikeTableRow = line.Contains('\t') || line.Contains(" "); + + if (looksLikeTableRow && !inTable) + { + structuredContent.AppendLine("[TABLE_START]"); + inTable = true; + } + else if (!looksLikeTableRow && inTable) + { + structuredContent.AppendLine("[TABLE_END]"); + inTable = false; + } + + if (!string.IsNullOrWhiteSpace(line)) + { + if (inTable) + { + string formattedLine = line.Trim(); + formattedLine = Regex.Replace(formattedLine, @"\s{3,}", " | "); + formattedLine = formattedLine.Replace('\t', '|'); + structuredContent.AppendLine($"|{formattedLine}|"); + } + else + { + structuredContent.AppendLine(line); + } + } + } + + if (inTable) + { + structuredContent.AppendLine("[TABLE_END]"); + } + } + catch (Exception ex) + { + structuredContent.AppendLine($"[OCR_ERROR: {ex.Message}]"); + } + + structuredContent.AppendLine("[IMAGE_DOCUMENT_END]"); + return structuredContent.ToString(); + } + + private static string ProcessTextFile(string filePath) + { + var structuredContent = new StringBuilder(); + structuredContent.AppendLine("[TEXT_DOCUMENT_START]"); + + string[] lines = File.ReadAllLines(filePath); + bool inTable = false; + + foreach (var line in lines) + { + bool looksLikeTableRow = IsLikelyTableRow(line); + + if (looksLikeTableRow && !inTable) + { + structuredContent.AppendLine("[TABLE_START]"); + inTable = true; + } + else if (!looksLikeTableRow && inTable) + { + structuredContent.AppendLine("[TABLE_END]"); + inTable = false; + } + + if (!string.IsNullOrWhiteSpace(line)) + { + if (inTable) + { + string formattedLine = FormatPlainTextTableRow(line); + structuredContent.AppendLine(formattedLine); + } + else + { + structuredContent.AppendLine(line); + } + } + } + + if (inTable) + { + structuredContent.AppendLine("[TABLE_END]"); + } + + structuredContent.AppendLine("[TEXT_DOCUMENT_END]"); + return structuredContent.ToString(); + } + + private static bool IsLikelyTableRow(string line) + { + if (line.Count(c => c == '\t') >= 2) + return true; + + var spaces = new List(); + int currentSpace = 0; + + foreach (var t in line) + { + if (t == ' ') + { + currentSpace++; + } + else + { + if (currentSpace >= 3) + { + spaces.Add(currentSpace); + } + + currentSpace = 0; + } + } + + if (currentSpace >= 3) + { + spaces.Add(currentSpace); + } + + return spaces.Count >= 2; + } + + private static string FormatPlainTextTableRow(string line) + { + string formatted = line.Replace('\t', '|'); + formatted = Regex.Replace(formatted, @"\s{3,}", "|"); + + if (!formatted.StartsWith("|")) + formatted = "|" + formatted; + + if (!formatted.EndsWith("|")) + formatted = formatted + "|"; + + return formatted; + } + + private static string ProcessRtf(string filePath) + { + var structuredContent = new StringBuilder(); + structuredContent.AppendLine("[RTF_DOCUMENT_START]"); + + try + { + string rtfText = File.ReadAllText(filePath); + string plainText = ConvertRtfToPlainText(rtfText); + string[] lines = plainText.Split(['\r', '\n'], StringSplitOptions.RemoveEmptyEntries); + + foreach (var line in lines) + { + if (!string.IsNullOrWhiteSpace(line)) + { + structuredContent.AppendLine(line.Trim()); + } + } + } + catch (Exception ex) + { + structuredContent.AppendLine($"[RTF_PROCESSING_ERROR: {ex.Message}]"); + } + + structuredContent.AppendLine("[RTF_DOCUMENT_END]"); + return structuredContent.ToString(); + } + + private static string ConvertRtfToPlainText(string rtfText) + { + string plainText = rtfText; + int headerEnd = plainText.IndexOf("\\viewkind4", StringComparison.Ordinal); + + if (headerEnd > 0) + { + plainText = plainText.Substring(headerEnd); + } + + plainText = Regex.Replace(plainText, @"\\[a-zA-Z]+[0-9]*", " "); + plainText = plainText.Replace("{", "").Replace("}", ""); + plainText = plainText.Replace("\\", ""); + plainText = Regex.Replace(plainText, @"\s+", " "); + + return plainText.Trim(); + } + + private static string ProcessHtml(string filePath) + { + var structuredContent = new StringBuilder(); + structuredContent.AppendLine("[HTML_DOCUMENT_START]"); + + try + { + string htmlText = File.ReadAllText(filePath); + string plainText = StripHtmlTags(htmlText); + ExtractTablesFromHtml(htmlText, structuredContent); + + var lines = plainText.Split(['\r', '\n'], StringSplitOptions.RemoveEmptyEntries); + foreach (var line in lines) + { + if (!string.IsNullOrWhiteSpace(line)) + { + structuredContent.AppendLine(line.Trim()); + } + } + } + catch (Exception ex) + { + structuredContent.AppendLine($"[HTML_PROCESSING_ERROR: {ex.Message}]"); + } + + structuredContent.AppendLine("[HTML_DOCUMENT_END]"); + return structuredContent.ToString(); + } + + private static string StripHtmlTags(string html) + { + return Regex.Replace(html, @"<[^>]+>", " "); + } + + private static void ExtractTablesFromHtml(string html, StringBuilder output) + { + var tableMatches = Regex.Matches(html, @"]*>(.*?)", RegexOptions.Singleline); + + foreach (Match tableMatch in tableMatches) + { + string tableHtml = tableMatch.Groups[1].Value; + output.AppendLine("[TABLE_START]"); + var rowMatches = Regex.Matches(tableHtml, @"]*>(.*?)", RegexOptions.Singleline); + bool isFirstRow = true; + + foreach (Match rowMatch in rowMatches) + { + string rowHtml = rowMatch.Groups[1].Value; + StringBuilder rowBuilder = new StringBuilder("|"); + var cellMatches = Regex.Matches(rowHtml, @"<(td|th)[^>]*>(.*?)", RegexOptions.Singleline); + + foreach (Match cellMatch in cellMatches) + { + string cellContent = cellMatch.Groups[2].Value; + cellContent = Regex.Replace(cellContent, @"<[^>]+>", ""); + rowBuilder.Append($" {cellContent.Trim()} |"); + } + + output.AppendLine(rowBuilder.ToString()); + + if (isFirstRow) + { + int cellCount = cellMatches.Count; + StringBuilder separatorBuilder = new StringBuilder("|"); + + for (int i = 0; i < cellCount; i++) + { + separatorBuilder.Append(" --- |"); + } + + output.AppendLine(separatorBuilder.ToString()); + isFirstRow = false; + } + } + + output.AppendLine("[TABLE_END]"); + } + } +} \ No newline at end of file diff --git a/src/MaIN.Services/Services/LLMService/Memory/MemoryService.cs b/src/MaIN.Services/Services/LLMService/Memory/MemoryService.cs index ba9a3d6a..c5880071 100644 --- a/src/MaIN.Services/Services/LLMService/Memory/MemoryService.cs +++ b/src/MaIN.Services/Services/LLMService/Memory/MemoryService.cs @@ -9,13 +9,17 @@ public async Task ImportDataToMemory( ChatMemoryOptions options, CancellationToken cancellationToken) { + if (options.PreProcess) + { + await PreprocessAvailableDocuments(options, cancellationToken); + } await ImportTextData(memory, options.TextData, cancellationToken); await ImportFileData(memory, options.FileData, cancellationToken); await ImportStreamData(memory, options.StreamData, cancellationToken); await ImportWebUrls(memory, options.WebUrls, cancellationToken); await ImportMemoryItems(memory, options.Memory, cancellationToken); } - + public string CleanResponseText(string text) { return text @@ -85,4 +89,23 @@ await memory.ImportTextAsync( cancellationToken: cancellationToken); } } + + private static async Task PreprocessAvailableDocuments(ChatMemoryOptions options, CancellationToken cancellationToken) + { + foreach (var file in options.FileData!) + { + options.TextData!.Add(file.Key ,DocumentProcessor.ProcessDocument(file.Value)); + options.FileData = []; + } + + foreach (var stream in options.StreamData!) + { + var fileStream = new FileStream(Path.GetTempPath()+$".{stream.Key}", FileMode.Create, FileAccess.Write); + await stream.Value.CopyToAsync(fileStream, cancellationToken); + await fileStream.DisposeAsync(); + options.TextData!.Add(stream.Key, DocumentProcessor.ProcessDocument(Path.GetTempPath()+$".{stream.Key}")); + options.StreamData = []; + } + } + } \ No newline at end of file diff --git a/src/MaIN.Services/Services/LLMService/OpenAiService.cs b/src/MaIN.Services/Services/LLMService/OpenAiService.cs index 9a06ce4a..881fc990 100644 --- a/src/MaIN.Services/Services/LLMService/OpenAiService.cs +++ b/src/MaIN.Services/Services/LLMService/OpenAiService.cs @@ -344,16 +344,6 @@ public class ChatRequestOptions public Func? TokenCallback { get; set; } } - -public class ChatMemoryOptions -{ - public Dictionary? TextData { get; set; } - public Dictionary? FileData { get; set; } - public Dictionary? StreamData { get; set; } - public List? WebUrls { get; set; } - public List? Memory { get; set; } -} - internal class ChatMessage(string role, string content) { public string Role { get; set; } = role; diff --git a/src/MaIN.Services/Services/LLMService/Utils/ChatHelper.cs b/src/MaIN.Services/Services/LLMService/Utils/ChatHelper.cs index 33ae77aa..b3ac6bca 100644 --- a/src/MaIN.Services/Services/LLMService/Utils/ChatHelper.cs +++ b/src/MaIN.Services/Services/LLMService/Utils/ChatHelper.cs @@ -67,12 +67,15 @@ public static ChatMemoryOptions ExtractMemoryOptions(Message message) var streamData = message.Files .Where(x => x.StreamContent != null) .ToDictionary(x => x.Name, x => x.StreamContent!); + + var preProcess = message.Properties.ContainsKey(Constants.ServiceConstants.Messages.PreProcessProperty); return new ChatMemoryOptions { TextData = textData, FileData = fileData, - StreamData = streamData + StreamData = streamData, + PreProcess = preProcess }; } } \ No newline at end of file From b075cdf9ee2d272bb4478e1418e4c09f86c02000 Mon Sep 17 00:00:00 2001 From: Piotr Stachaczynski Date: Tue, 22 Apr 2025 14:10:15 +0200 Subject: [PATCH 2/4] feat: cd with KM refactor --- Examples/Examples.SimpleConsole/Program.cs | 9 +- .../Services/LLMService/LLMService.cs | 6 +- .../LLMService/Memory/DocumentProcessor.cs | 363 +++++------------- .../LLMService/Memory/MemoryFactory.cs | 5 +- 4 files changed, 109 insertions(+), 274 deletions(-) diff --git a/Examples/Examples.SimpleConsole/Program.cs b/Examples/Examples.SimpleConsole/Program.cs index 0e164459..a3aa9bfd 100644 --- a/Examples/Examples.SimpleConsole/Program.cs +++ b/Examples/Examples.SimpleConsole/Program.cs @@ -10,13 +10,12 @@ // .CompleteAsync(interactive: true); var result = AIHub.Chat() - .WithModel("llama3.2:3b") - .WithMessage( - "Write this invoice as JSON.") + .WithModel("llama3.1:8b") + .WithMessage("Output this invoice as JSON") .WithMemoryParams(new MemoryParams() { - AnswerTokens = 2000, - ContextSize = 4500, + AnswerTokens = 1024, + ContextSize = 4096 }) .WithFiles(["3.pdf"], preProcess: true); diff --git a/src/MaIN.Services/Services/LLMService/LLMService.cs b/src/MaIN.Services/Services/LLMService/LLMService.cs index 1fac4ed9..f784da71 100644 --- a/src/MaIN.Services/Services/LLMService/LLMService.cs +++ b/src/MaIN.Services/Services/LLMService/LLMService.cs @@ -100,7 +100,11 @@ public Task CleanSessionCache(string? id) CancellationToken cancellationToken = default) { var model = KnownModels.GetModel(chat.Model); - var parameters = new ModelParams(Path.Combine(modelsPath, model.FileName)); + var parameters = new ModelParams(Path.Combine(modelsPath, model.FileName)) + { + GpuLayerCount = chat.MemoryParams.GpuLayerCount, + ContextSize = (uint)chat.MemoryParams.ContextSize + }; var llmModel = await LLamaWeights.LoadFromFileAsync(parameters, cancellationToken); var kernelMemory = memoryFactory.CreateMemoryWithModel( modelsPath, diff --git a/src/MaIN.Services/Services/LLMService/Memory/DocumentProcessor.cs b/src/MaIN.Services/Services/LLMService/Memory/DocumentProcessor.cs index da9075f2..9f433c29 100644 --- a/src/MaIN.Services/Services/LLMService/Memory/DocumentProcessor.cs +++ b/src/MaIN.Services/Services/LLMService/Memory/DocumentProcessor.cs @@ -6,8 +6,7 @@ using Tesseract; using UglyToad.PdfPig; using UglyToad.PdfPig.Content; -using UglyToad.PdfPig.Core; -using UglyToad.PdfPig.Geometry; +using Page = UglyToad.PdfPig.Content.Page; using Text = DocumentFormat.OpenXml.Wordprocessing.Text; namespace MaIN.Services.Services.LLMService.Memory; @@ -35,331 +34,163 @@ private static string ProcessPdf(string pdfPath) { var result = new StringBuilder(); - using var document = PdfDocument.Open(pdfPath); - foreach (var page in document.GetPages()) + using (var document = PdfDocument.Open(pdfPath)) { - var words = page.GetWords().ToList(); - var tableRegions = FindTableRegions(words); - - var nonTableWords = words.Where(w => - !tableRegions.Any(r => r.Contains(w.BoundingBox))).ToList(); - - if (nonTableWords.Any()) - { - var textContent = ProcessTextContent(nonTableWords); - result.AppendLine(textContent); - } - - foreach (var region in tableRegions) + foreach (var page in document.GetPages()) { - var tableWords = words.Where(w => region.Contains(w.BoundingBox)).ToList(); - var tableMarkdown = CreateMarkdownTable(tableWords); - result.AppendLine(tableMarkdown); - result.AppendLine(); + var pageText = ExtractPageText(page); + result.Append(pageText); } - - result.AppendLine("---"); } return result.ToString(); } - private static List FindTableRegions(List words) + private static string ExtractPageText(Page page) { - var regions = new List(); + var words = page.GetWords().ToList(); + var sb = new StringBuilder(); + var rows = words .GroupBy(w => Math.Round(w.BoundingBox.Bottom, 1)) .OrderByDescending(g => g.Key) .ToList(); - for (int i = 0; i < rows.Count - 2; i++) + foreach (var row in rows) { - var row1 = rows[i].OrderBy(w => w.BoundingBox.Left).ToList(); - var row2 = rows[i + 1].OrderBy(w => w.BoundingBox.Left).ToList(); - var row3 = rows[i + 2].OrderBy(w => w.BoundingBox.Left).ToList(); - - if (HasColumnAlignment(row1, row2, row3)) - { - int startRow = i; - int endRow = i + 2; - - for (int j = i + 3; j < rows.Count; j++) - { - var nextRow = rows[j].OrderBy(w => w.BoundingBox.Left).ToList(); - if (HasColumnAlignment(row3, rows[j - 1].OrderBy(w => w.BoundingBox.Left).ToList(), nextRow)) - { - endRow = j; - } - else - { - break; - } - } - - var tableWords = new List(); - for (int r = startRow; r <= endRow; r++) - { - tableWords.AddRange(rows[r]); - } + var lineWords = row.OrderBy(w => w.BoundingBox.Left).ToList(); + string line = string.Join(" ", lineWords.Select(w => w.Text)).Trim(); - double left = tableWords.Min(w => w.BoundingBox.Left) - 5; - double bottom = tableWords.Min(w => w.BoundingBox.Bottom) - 5; - double right = tableWords.Max(w => w.BoundingBox.Right) + 5; - double top = tableWords.Max(w => w.BoundingBox.Top) + 5; + if (string.IsNullOrWhiteSpace(line)) continue; - regions.Add(new PdfRectangle(left, bottom, right, top)); - i = endRow; + if (IsPotentialHeader(lineWords)) + { + sb.AppendLine($"# {line}"); } - } - - return regions; - } - - private static bool HasColumnAlignment(List row1, List row2, List row3) - { - var xPos1 = row1.Select(w => w.BoundingBox.Left).ToList(); - var xPos2 = row2.Select(w => w.BoundingBox.Left).ToList(); - var xPos3 = row3.Select(w => w.BoundingBox.Left).ToList(); - - double tolerance = 10.0; - int alignedCount = 0; - - foreach (var x1 in xPos1) - { - bool aligned2 = xPos2.Any(x2 => Math.Abs(x1 - x2) < tolerance); - bool aligned3 = xPos3.Any(x3 => Math.Abs(x1 - x3) < tolerance); - - if (aligned2 && aligned3) + else if (IsLabelValuePair(line)) + { + var parts = SplitLabelValue(line); + sb.AppendLine($"{parts.Item1}: {parts.Item2}"); + } + else if (IsListItem(line)) + { + sb.AppendLine($"- {line}"); + } + else if (IsDataRow(line)) + { + sb.AppendLine(FormatDataRowConcise(line)); + } + else { - alignedCount++; + sb.AppendLine(line); } } - return alignedCount >= 2 && row1.Count >= 2 && row2.Count >= 2 && row3.Count >= 2; + return sb.ToString(); } - private static string CreateMarkdownTable(List tableWords) + private static bool IsPotentialHeader(List words) { - var rows = tableWords - .GroupBy(w => Math.Round(w.BoundingBox.Bottom, 1)) - .OrderByDescending(g => g.Key) - .ToList(); - - if (!rows.Any()) return string.Empty; - - var columnPositions = GetColumnPositions(rows); - int columnCount = columnPositions.Count; - - if (columnCount < 2) - { - columnCount = 2; - columnPositions = new List { tableWords.Max(w => w.BoundingBox.Left) / 2 }; - } + if (!words.Any()) return false; - var sb = new StringBuilder(); - sb.Append("|"); - var headerRow = rows.First().OrderBy(w => w.BoundingBox.Left).ToList(); + var firstWord = words.First(); + double fontSize = 0; - for (int i = 0; i < columnCount; i++) + if (firstWord.Letters.Any()) { - double start = i == 0 ? 0 : columnPositions[i - 1]; - double end = i < columnPositions.Count - 1 ? columnPositions[i] : double.MaxValue; - - var cellWords = headerRow - .Where(w => w.BoundingBox.Left >= start && w.BoundingBox.Left < end) - .OrderBy(w => w.BoundingBox.Left) - .ToList(); - - string cellText = string.Join(" ", cellWords.Select(w => w.Text)); - sb.Append($" {CleanCellText(cellText)} ;"); + fontSize = firstWord.Letters.First().FontSize; } - - sb.AppendLine(); - sb.Append("|"); - - for (int i = 0; i < columnCount; i++) + else { - sb.Append(" --- |"); + fontSize = firstWord.BoundingBox.Height; } - sb.AppendLine(); - - foreach (var row in rows.Skip(1)) - { - sb.Append("|"); - var rowWords = row.OrderBy(w => w.BoundingBox.Left).ToList(); - - for (int i = 0; i < columnCount; i++) - { - double start = i == 0 ? 0 : columnPositions[i - 1]; - double end = i < columnPositions.Count - 1 ? columnPositions[i] : double.MaxValue; - - var cellWords = rowWords - .Where(w => w.BoundingBox.Left >= start && w.BoundingBox.Left < end) - .OrderBy(w => w.BoundingBox.Left) - .ToList(); - - string cellText = string.Join(" ", cellWords.Select(w => w.Text)); - sb.Append($" {CleanCellText(cellText)} |"); - } + bool isBold = firstWord.Letters.Any() && + firstWord.Letters.First().FontName!.ToLower().Contains("bold"); - sb.AppendLine(); - } + return fontSize > 12 || isBold; + } - return sb.ToString(); + private static bool IsLabelValuePair(string line) + { + return Regex.IsMatch(line, @"^.+:.+$"); } - private static List GetColumnPositions(List> rows) + private static Tuple SplitLabelValue(string line) { - var allPositions = new List(); - foreach (var row in rows) + var parts = line.Split([':'], 2); + + if (parts.Length == 2) { - allPositions.AddRange(row.Select(w => w.BoundingBox.Left)); + return new Tuple(parts[0].Trim(), parts[1].Trim()); } - var tolerance = 10.0; - var clusters = new List>(); - - foreach (var pos in allPositions.OrderBy(p => p)) - { - bool added = false; - foreach (var cluster in clusters) - { - if (Math.Abs(cluster.Average() - pos) < tolerance) - { - cluster.Add(pos); - added = true; - break; - } - } + return new Tuple(line, ""); + } - if (!added) - { - clusters.Add(new List { pos }); - } - } + private static bool IsListItem(string line) + { + return line.TrimStart().StartsWith("•") || + line.TrimStart().StartsWith("-") || + line.TrimStart().StartsWith("*") || + Regex.IsMatch(line.TrimStart(), @"^\d+\.\s"); + } - return clusters - .Where(c => c.Count > Math.Max(2, rows.Count / 4)) - .OrderBy(c => c.Average()) - .Select(c => c.Average()) - .ToList(); + private static bool IsDataRow(string line) + { + return ContainsNumberWithUnit(line) && Regex.Matches(line, @"\b\d+([.,]\d+)?\b").Count >= 2; } - private static string ProcessTextContent(List words) + private static bool ContainsNumberWithUnit(string line) { - var sb = new StringBuilder(); - var rows = words - .GroupBy(w => Math.Round(w.BoundingBox.Bottom, 1)) - .OrderByDescending(g => g.Key); + return Regex.IsMatch(line, @"\b\d+\s*[a-zA-Z]{1,3}\b"); + } - double prevRowHeight = 0; + private static string FormatDataRowConcise(string line) + { + var textMatch = Regex.Match(line, @"^(.*?)\s*\d"); + string descriptorText = textMatch.Success ? textMatch.Groups[1].Value.Trim() : ""; - foreach (var row in rows) - { - string line = string.Join(" ", row.OrderBy(w => w.BoundingBox.Left).Select(w => w.Text)); + var numUnitMatches = Regex.Matches(line, @"\b(\d+)\s*([a-zA-Z]{1,3})\b"); - if (string.IsNullOrWhiteSpace(line)) continue; + var numberMatches = Regex.Matches(line, @"\b(\d+([.,]\d+)?)\b"); - var firstWord = row.OrderBy(w => w.BoundingBox.Left).FirstOrDefault(); - double wordHeight = 0; + var sb = new StringBuilder(); + sb.Append("- "); + sb.Append(descriptorText); - if (firstWord != null) + if (numUnitMatches.Count > 0) + { + foreach (Match m in numUnitMatches) { - wordHeight = firstWord.BoundingBox.Height; - var letters = firstWord.Letters.ToList(); - - if (letters.Any()) - { - double estimatedFontSize = letters.First().FontSize; - if (estimatedFontSize > 0) - { - wordHeight = estimatedFontSize; - } - } + sb.Append($" | {m.Groups[1].Value} {m.Groups[2].Value}"); } + } - if (wordHeight > prevRowHeight * 1.2 && wordHeight > 10) - { - if (wordHeight > 14) - { - sb.AppendLine($"## {line}"); - } - else - { - sb.AppendLine($"### {line}"); - } - } - else if (line.TrimStart().StartsWith("•") || line.TrimStart().StartsWith("-") || - Regex.IsMatch(line.TrimStart(), @"^\d+\.")) + var processedIndices = new HashSet(); + foreach (Match numUnitMatch in numUnitMatches) + { + foreach (Match numMatch in numberMatches) { - int index = Math.Max(1, line.TrimStart().IndexOfAny(['•', '-', '.'])); - if (index < line.TrimStart().Length - 1) - { - sb.AppendLine($"* {line.TrimStart().Substring(index + 1).Trim()}"); - } - else + if (numMatch.Index >= numUnitMatch.Index && + numMatch.Index < numUnitMatch.Index + numUnitMatch.Length) { - sb.AppendLine($"* {line.TrimStart()}"); + processedIndices.Add(numMatch.Index); } } - else + } + + foreach (Match numMatch in numberMatches) + { + if (!processedIndices.Contains(numMatch.Index)) { - sb.AppendLine(line); - sb.AppendLine(); + sb.Append($" | {numMatch.Value}"); } - - prevRowHeight = wordHeight; } return sb.ToString(); } - private static string CleanCellText(string text) - { - if (string.IsNullOrWhiteSpace(text)) return ""; - - text = text.Trim(); - text = Regex.Replace(text, @"\s+", " "); - text = text.Replace("|", "\\|"); - - return text; - } - - private static bool IsLikelyHeader(IEnumerable row, IEnumerable nextRow) - { - var enumerable = row.ToList(); - var words = nextRow as Word[] ?? nextRow.ToArray(); - if (!enumerable.Any() || !words.Any()) - return false; - - bool hasLettersFontInfo = enumerable.Any(w => w.Letters.Any()); - bool nextHasLettersFontInfo = words.Any(w => w.Letters.Any()); - - if (hasLettersFontInfo && nextHasLettersFontInfo) - { - double rowAvgFontSize = enumerable - .SelectMany(w => w.Letters) - .Where(l => l.FontSize > 0) - .Select(l => l.FontSize) - .DefaultIfEmpty(0) - .Average(); - - double nextRowAvgFontSize = words - .SelectMany(w => w.Letters) - .Where(l => l.FontSize > 0) - .Select(l => l.FontSize) - .DefaultIfEmpty(0) - .Average(); - - return rowAvgFontSize > nextRowAvgFontSize * 1.2; - } - - double rowAvgHeight = enumerable.Average(w => w.BoundingBox.Height); - double nextRowAvgHeight = words.Average(w => w.BoundingBox.Height); - - return rowAvgHeight > nextRowAvgHeight * 1.2; - } private static string ProcessExcel(string filePath) { @@ -403,7 +234,7 @@ private static string ProcessExcel(string filePath) { string cellValue = ""; var cell = cells.FirstOrDefault(c => GetColumnIndex(GetColumnId(c.CellReference!)) == i); - + if (cell != null) { cellValue = GetCellValue(cell, sharedStringTable!); @@ -724,7 +555,7 @@ private static string ProcessRtf(string filePath) string rtfText = File.ReadAllText(filePath); string plainText = ConvertRtfToPlainText(rtfText); string[] lines = plainText.Split(['\r', '\n'], StringSplitOptions.RemoveEmptyEntries); - + foreach (var line in lines) { if (!string.IsNullOrWhiteSpace(line)) @@ -746,7 +577,7 @@ private static string ConvertRtfToPlainText(string rtfText) { string plainText = rtfText; int headerEnd = plainText.IndexOf("\\viewkind4", StringComparison.Ordinal); - + if (headerEnd > 0) { plainText = plainText.Substring(headerEnd); diff --git a/src/MaIN.Services/Services/LLMService/Memory/MemoryFactory.cs b/src/MaIN.Services/Services/LLMService/Memory/MemoryFactory.cs index 10e1f2e4..2d43a65b 100644 --- a/src/MaIN.Services/Services/LLMService/Memory/MemoryFactory.cs +++ b/src/MaIN.Services/Services/LLMService/Memory/MemoryFactory.cs @@ -21,7 +21,7 @@ public IKernelMemory CreateMemory(string modelsPath, string modelName) MaxMatchesCount = 5, FrequencyPenalty = 1, Temperature = 0.6f, - AnswerTokens = 500 + AnswerTokens = 1024 }); } @@ -87,7 +87,8 @@ private static LLamaSharpTextEmbeddingGenerator ConfigureGeneratorOptions(string var config = new LLamaSharpConfig(embeddingModelPath) { - DefaultInferenceParams = inferenceParams + DefaultInferenceParams = inferenceParams, + GpuLayerCount = 20, }; var parameters = new ModelParams(config.ModelPath) From 4a4519cdb5db62e5d14b8c786493db65a18994f7 Mon Sep 17 00:00:00 2001 From: Piotr Stachaczynski Date: Tue, 22 Apr 2025 21:32:22 +0200 Subject: [PATCH 3/4] feat: fix & cleanup --- Examples/Examples.SimpleConsole/Program.cs | 21 +++++-------------- src/MaIN.Core/Hub/Contexts/ChatContext.cs | 2 +- .../AgentSource/AgentFileSourceDetails.cs | 1 + .../Steps/Commands/FetchCommandHandler.cs | 3 ++- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/Examples/Examples.SimpleConsole/Program.cs b/Examples/Examples.SimpleConsole/Program.cs index a3aa9bfd..dafa5be1 100644 --- a/Examples/Examples.SimpleConsole/Program.cs +++ b/Examples/Examples.SimpleConsole/Program.cs @@ -4,21 +4,10 @@ MaINBootstrapper.Initialize(); -// await AIHub.Chat() -// .WithModel("gemma2:2b") -// .WithMessage("Hello, World!") -// .CompleteAsync(interactive: true); +await AIHub.Chat() + .WithModel("gemma2:2b") + .WithMessage("Hello, World!") + .CompleteAsync(interactive: true); + -var result = AIHub.Chat() - .WithModel("llama3.1:8b") - .WithMessage("Output this invoice as JSON") - .WithMemoryParams(new MemoryParams() - { - AnswerTokens = 1024, - ContextSize = 4096 - }) - .WithFiles(["3.pdf"], preProcess: true); - -var chatResult = await result.CompleteAsync(); -Console.WriteLine(chatResult.Message.Content); diff --git a/src/MaIN.Core/Hub/Contexts/ChatContext.cs b/src/MaIN.Core/Hub/Contexts/ChatContext.cs index 480b4889..ab10f1e9 100644 --- a/src/MaIN.Core/Hub/Contexts/ChatContext.cs +++ b/src/MaIN.Core/Hub/Contexts/ChatContext.cs @@ -13,7 +13,7 @@ public class ChatContext private readonly IChatService _chatService; private bool _preProcess; private Chat _chat { get; set; } - private List _files { get; set; } + private List _files { get; set; } = []; internal ChatContext(IChatService chatService) { diff --git a/src/MaIN.Domain/Entities/Agents/AgentSource/AgentFileSourceDetails.cs b/src/MaIN.Domain/Entities/Agents/AgentSource/AgentFileSourceDetails.cs index 8207dd0c..14cc53ca 100644 --- a/src/MaIN.Domain/Entities/Agents/AgentSource/AgentFileSourceDetails.cs +++ b/src/MaIN.Domain/Entities/Agents/AgentSource/AgentFileSourceDetails.cs @@ -4,4 +4,5 @@ public class AgentFileSourceDetails : AgentSourceDetailsBase, IAgentSource { public required string Path { get; init; } public required string Name { get; init; } + public bool PreProcess { get; init; } = false; } \ No newline at end of file diff --git a/src/MaIN.Services/Services/Steps/Commands/FetchCommandHandler.cs b/src/MaIN.Services/Services/Steps/Commands/FetchCommandHandler.cs index fad9c197..482371b4 100644 --- a/src/MaIN.Services/Services/Steps/Commands/FetchCommandHandler.cs +++ b/src/MaIN.Services/Services/Steps/Commands/FetchCommandHandler.cs @@ -87,7 +87,8 @@ private async Task HandleFileSource(FetchCommand command, Dictionary { { fileData!.Name, fileData.Path } } + FileData = new Dictionary { { fileData!.Name, fileData.Path } }, + PreProcess = fileData.PreProcess } ); result!.Message.Role = command.ResponseType == FetchResponseType.AS_System ? "System" : "Assistant"; From abc5db36e02c02568e080073a87828d908866ef8 Mon Sep 17 00:00:00 2001 From: Piotr Stachaczynski Date: Wed, 23 Apr 2025 15:12:41 +0200 Subject: [PATCH 4/4] feat: more KM improv --- Examples/Examples/Chat/ChatWithFilesExample.cs | 2 +- Examples/Examples/Chat/ChatWithFilesFromStreamExample.cs | 2 +- src/MaIN.Core/MaIN.Core.csproj | 6 +++--- src/MaIN.Services/MaIN.Services.csproj | 8 +++----- src/MaIN.Services/Services/LLMService/LLMService.cs | 3 ++- .../Services/LLMService/Memory/MemoryFactory.cs | 3 +-- 6 files changed, 11 insertions(+), 13 deletions(-) diff --git a/Examples/Examples/Chat/ChatWithFilesExample.cs b/Examples/Examples/Chat/ChatWithFilesExample.cs index 75b7e247..0a0338ac 100644 --- a/Examples/Examples/Chat/ChatWithFilesExample.cs +++ b/Examples/Examples/Chat/ChatWithFilesExample.cs @@ -11,7 +11,7 @@ public async Task Start() List files = ["./Files/Nicolaus_Copernicus.pdf", "./Files/Galileo_Galilei.pdf"]; var result = await AIHub.Chat() - .WithModel("gemma2:2b") + .WithModel("gemma3:4b") .WithMessage("You have 2 documents in memory. Whats the difference of work between Galileo and Copernicus?. Give answer based on the documents.") .WithFiles(files) .CompleteAsync(); diff --git a/Examples/Examples/Chat/ChatWithFilesFromStreamExample.cs b/Examples/Examples/Chat/ChatWithFilesFromStreamExample.cs index 73b63a32..da850eb5 100644 --- a/Examples/Examples/Chat/ChatWithFilesFromStreamExample.cs +++ b/Examples/Examples/Chat/ChatWithFilesFromStreamExample.cs @@ -34,7 +34,7 @@ public async Task Start() } var result = await AIHub.Chat() - .WithModel("gemma2:2b") + .WithModel("qwen2.5:0.5b") .WithMessage("You have 2 documents in memory. Whats the difference of work between Galileo and Copernicus?. Give answer based on the documents.") .WithFiles(fileStreams) .CompleteAsync(); diff --git a/src/MaIN.Core/MaIN.Core.csproj b/src/MaIN.Core/MaIN.Core.csproj index 83bab8ad..8697f533 100644 --- a/src/MaIN.Core/MaIN.Core.csproj +++ b/src/MaIN.Core/MaIN.Core.csproj @@ -8,12 +8,12 @@ - + - - + + diff --git a/src/MaIN.Services/MaIN.Services.csproj b/src/MaIN.Services/MaIN.Services.csproj index 9660e372..adfd53bb 100644 --- a/src/MaIN.Services/MaIN.Services.csproj +++ b/src/MaIN.Services/MaIN.Services.csproj @@ -14,14 +14,12 @@ - + - - - - + + diff --git a/src/MaIN.Services/Services/LLMService/LLMService.cs b/src/MaIN.Services/Services/LLMService/LLMService.cs index f784da71..96cd103f 100644 --- a/src/MaIN.Services/Services/LLMService/LLMService.cs +++ b/src/MaIN.Services/Services/LLMService/LLMService.cs @@ -103,7 +103,8 @@ public Task CleanSessionCache(string? id) var parameters = new ModelParams(Path.Combine(modelsPath, model.FileName)) { GpuLayerCount = chat.MemoryParams.GpuLayerCount, - ContextSize = (uint)chat.MemoryParams.ContextSize + ContextSize = (uint)chat.MemoryParams.ContextSize, + Embeddings = true }; var llmModel = await LLamaWeights.LoadFromFileAsync(parameters, cancellationToken); var kernelMemory = memoryFactory.CreateMemoryWithModel( diff --git a/src/MaIN.Services/Services/LLMService/Memory/MemoryFactory.cs b/src/MaIN.Services/Services/LLMService/Memory/MemoryFactory.cs index 2d43a65b..968a85eb 100644 --- a/src/MaIN.Services/Services/LLMService/Memory/MemoryFactory.cs +++ b/src/MaIN.Services/Services/LLMService/Memory/MemoryFactory.cs @@ -50,7 +50,7 @@ public IKernelMemory CreateMemoryWithModel(string modelsPath, .With(parsingOptions) .Build(); } - + public IKernelMemory CreateMemoryWithOpenAi(string openAiKey, MemoryParams memoryParams) { var searchOptions = ConfigureSearchOptions(memoryParams); @@ -118,7 +118,6 @@ private static TextPartitioningOptions ConfigureParsingOptions() return new TextPartitioningOptions { MaxTokensPerParagraph = 300, - MaxTokensPerLine = 100, }; }