diff --git a/Examples/Examples.SimpleConsole/Program.cs b/Examples/Examples.SimpleConsole/Program.cs index d1c17f64..dafa5be1 100644 --- a/Examples/Examples.SimpleConsole/Program.cs +++ b/Examples/Examples.SimpleConsole/Program.cs @@ -1,5 +1,6 @@ using MaIN.Core; using MaIN.Core.Hub; +using MaIN.Domain.Entities; MaINBootstrapper.Initialize(); @@ -8,3 +9,5 @@ await AIHub.Chat() .WithMessage("Hello, World!") .CompleteAsync(interactive: true); + + diff --git a/Examples/Examples/Chat/ChatWithFilesExample.cs b/Examples/Examples/Chat/ChatWithFilesExample.cs index 75b7e247..0a0338ac 100644 --- a/Examples/Examples/Chat/ChatWithFilesExample.cs +++ b/Examples/Examples/Chat/ChatWithFilesExample.cs @@ -11,7 +11,7 @@ public async Task Start() List files = ["./Files/Nicolaus_Copernicus.pdf", "./Files/Galileo_Galilei.pdf"]; var result = await AIHub.Chat() - .WithModel("gemma2:2b") + .WithModel("gemma3:4b") .WithMessage("You have 2 documents in memory. Whats the difference of work between Galileo and Copernicus?. Give answer based on the documents.") .WithFiles(files) .CompleteAsync(); diff --git a/Examples/Examples/Chat/ChatWithFilesFromStreamExample.cs b/Examples/Examples/Chat/ChatWithFilesFromStreamExample.cs index 73b63a32..da850eb5 100644 --- a/Examples/Examples/Chat/ChatWithFilesFromStreamExample.cs +++ b/Examples/Examples/Chat/ChatWithFilesFromStreamExample.cs @@ -34,7 +34,7 @@ public async Task Start() } var result = await AIHub.Chat() - .WithModel("gemma2:2b") + .WithModel("qwen2.5:0.5b") .WithMessage("You have 2 documents in memory. Whats the difference of work between Galileo and Copernicus?. 
Give answer based on the documents.") .WithFiles(fileStreams) .CompleteAsync(); diff --git a/Releases/0.1.5.md b/Releases/0.1.5.md new file mode 100644 index 00000000..5e6aa22f --- /dev/null +++ b/Releases/0.1.5.md @@ -0,0 +1,3 @@ +# 0.1.5 release + +- Enable pre processing of documents, it can greatly improve KM performance on small models \ No newline at end of file diff --git a/src/MaIN.Core/.nuspec b/src/MaIN.Core/.nuspec index 5d207368..3934eb82 100644 --- a/src/MaIN.Core/.nuspec +++ b/src/MaIN.Core/.nuspec @@ -2,7 +2,7 @@ MaIN.NET - 0.1.4 + 0.1.5 Wisedev Wisedev favicon.png diff --git a/src/MaIN.Core/Hub/Contexts/ChatContext.cs b/src/MaIN.Core/Hub/Contexts/ChatContext.cs index a61d05a0..ab10f1e9 100644 --- a/src/MaIN.Core/Hub/Contexts/ChatContext.cs +++ b/src/MaIN.Core/Hub/Contexts/ChatContext.cs @@ -1,5 +1,6 @@ using MaIN.Domain.Entities; using MaIN.Domain.Models; +using MaIN.Services.Constants; using MaIN.Services.Dtos; using MaIN.Services.Services.Abstract; using MaIN.Services.Services.Models; @@ -10,8 +11,9 @@ namespace MaIN.Core.Hub.Contexts; public class ChatContext { private readonly IChatService _chatService; + private bool _preProcess; private Chat _chat { get; set; } - private List _files { get; set; } + private List _files { get; set; } = []; internal ChatContext(IChatService chatService) { @@ -84,7 +86,7 @@ public ChatContext WithSystemPrompt(string systemPrompt) return this; } - public ChatContext WithFiles(List fileStreams) + public ChatContext WithFiles(List fileStreams, bool preProcess = false) { var files = fileStreams.Select(p => new FileInfo() { @@ -94,17 +96,19 @@ public ChatContext WithFiles(List fileStreams) StreamContent = p }).ToList(); + _preProcess = preProcess; _files = files; return this; } - public ChatContext WithFiles(List files) + public ChatContext WithFiles(List files, bool preProcess = false) { _files = files; + _preProcess = preProcess; return this; } - public ChatContext WithFiles(List filePaths) + public ChatContext 
WithFiles(List filePaths, bool preProcess = false) { var files = filePaths.Select(p => new FileInfo() { @@ -113,6 +117,7 @@ public ChatContext WithFiles(List filePaths) Extension = Path.GetExtension(p) }).ToList(); + _preProcess = preProcess; _files = files; return this; } @@ -135,6 +140,11 @@ public async Task CompleteAsync( throw new InvalidOperationException("Chat has no messages."); //TODO good candidate for domain exception } _chat.Messages.Last().Files = _files; + if(_preProcess) + { + _chat.Messages.Last().Properties.Add(ServiceConstants.Messages.PreProcessProperty, string.Empty); + } + if (!await ChatExists(_chat.Id)) { await _chatService.Create(_chat); diff --git a/src/MaIN.Core/MaIN.Core.csproj b/src/MaIN.Core/MaIN.Core.csproj index 83bab8ad..8697f533 100644 --- a/src/MaIN.Core/MaIN.Core.csproj +++ b/src/MaIN.Core/MaIN.Core.csproj @@ -8,12 +8,12 @@ - + - - + + diff --git a/src/MaIN.Domain/Entities/Agents/AgentSource/AgentFileSourceDetails.cs b/src/MaIN.Domain/Entities/Agents/AgentSource/AgentFileSourceDetails.cs index 8207dd0c..14cc53ca 100644 --- a/src/MaIN.Domain/Entities/Agents/AgentSource/AgentFileSourceDetails.cs +++ b/src/MaIN.Domain/Entities/Agents/AgentSource/AgentFileSourceDetails.cs @@ -4,4 +4,5 @@ public class AgentFileSourceDetails : AgentSourceDetailsBase, IAgentSource { public required string Path { get; init; } public required string Name { get; init; } + public bool PreProcess { get; init; } = false; } \ No newline at end of file diff --git a/src/MaIN.Services/Constants/ServiceConstants.cs b/src/MaIN.Services/Constants/ServiceConstants.cs index 65670d70..385c4ca8 100644 --- a/src/MaIN.Services/Constants/ServiceConstants.cs +++ b/src/MaIN.Services/Constants/ServiceConstants.cs @@ -20,6 +20,7 @@ public static class ApiUrls public static class Messages { public const string GeneratedImageContent = "Generated Image:"; + public const string PreProcessProperty = "Pre_Process"; } public static class Defaults diff --git 
namespace MaIN.Services.Services.LLMService;

/// <summary>
/// Bundle of everything that should be imported into kernel memory for one chat request.
/// Each collection is optional; a <c>null</c> collection means "nothing of that kind".
/// </summary>
public class ChatMemoryOptions
{
    /// <summary>Plain-text entries keyed by name. Pre-processing adds extracted document text here.</summary>
    public Dictionary<string, string>? TextData { get; set; }

    /// <summary>Files on disk, keyed by name, with the file path as value.</summary>
    public Dictionary<string, string>? FileData { get; set; }

    /// <summary>Files supplied as open streams, keyed by file name.</summary>
    // NOTE(review): value type reconstructed as FileStream from the WithFiles(fileStreams) usage - confirm.
    public Dictionary<string, FileStream>? StreamData { get; set; }

    /// <summary>Web pages to import.</summary>
    public List<string>? WebUrls { get; set; }

    /// <summary>Additional in-memory items to import.</summary>
    public List<string>? Memory { get; set; }

    /// <summary>
    /// When <c>true</c>, file and stream documents are converted to structured text
    /// before being imported. Defaults to <c>false</c>.
    /// </summary>
    public bool PreProcess { get; set; }
}
using System.Text;
using System.Text.RegularExpressions;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Spreadsheet;
using DocumentFormat.OpenXml.Wordprocessing;
using Tesseract;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using Page = UglyToad.PdfPig.Content.Page;
using Text = DocumentFormat.OpenXml.Wordprocessing.Text;
using WordTable = DocumentFormat.OpenXml.Wordprocessing.Table;
using WordTableCell = DocumentFormat.OpenXml.Wordprocessing.TableCell;
using WordTableRow = DocumentFormat.OpenXml.Wordprocessing.TableRow;

namespace MaIN.Services.Services.LLMService.Memory;

/// <summary>
/// Converts a document on disk into structured plain text (markdown-like tables,
/// bracketed section markers) so that it can be imported into memory as text.
/// The format is chosen from the file extension.
/// </summary>
public static class DocumentProcessor
{
    // Location and language of the Tesseract training data used for OCR.
    private const string TessDataPath = "./tessdata";
    private const string OcrLanguage = "eng";

    // A run of this many spaces is treated as a column gap; matches the \s{3,} splitter below.
    private static readonly string ColumnGap = new(' ', 3);

    // Placeholder for RTF paragraph breaks while the remaining whitespace is being normalised.
    private const char RtfParagraphMark = '\u0001';

    private static readonly Regex LabelValueRegex = new(@"^.+:.+$", RegexOptions.Compiled);
    private static readonly Regex NumberedItemRegex = new(@"^\d+\.\s", RegexOptions.Compiled);
    private static readonly Regex NumberRegex = new(@"\b(\d+([.,]\d+)?)\b", RegexOptions.Compiled);

    // Accepts decimals so that "1.5 kg" is one quantity instead of "5 kg" plus a stray "1.5".
    private static readonly Regex NumberWithUnitRegex =
        new(@"\b(\d+(?:[.,]\d+)?)\s*([a-zA-Z]{1,3})\b", RegexOptions.Compiled);

    private static readonly Regex LeadingTextRegex = new(@"^(.*?)\s*\d", RegexOptions.Compiled);
    private static readonly Regex WideWhitespaceRegex = new(@"\s{3,}", RegexOptions.Compiled);
    private static readonly Regex SpaceGapRegex = new(" {3,}", RegexOptions.Compiled);
    private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);

    // \b keeps "\pard" from being mistaken for "\par".
    private static readonly Regex RtfParagraphRegex = new(@"\\(?:par|line)\b ?", RegexOptions.Compiled);

    // The optional signed parameter covers control words such as "\li-720".
    private static readonly Regex RtfControlWordRegex =
        new(@"\\[a-zA-Z]+(?:-?[0-9]+)?", RegexOptions.Compiled);

    private const RegexOptions HtmlOptions =
        RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled;

    private static readonly Regex HtmlTagRegex = new(@"<[^>]+>", RegexOptions.Compiled);
    private static readonly Regex HtmlNonContentRegex = new(@"<(script|style)\b[^>]*>.*?</\1\s*>", HtmlOptions);
    private static readonly Regex HtmlTableRegex = new(@"<table\b[^>]*>(.*?)</table\s*>", HtmlOptions);
    private static readonly Regex HtmlRowRegex = new(@"<tr\b[^>]*>(.*?)</tr\s*>", HtmlOptions);
    private static readonly Regex HtmlCellRegex = new(@"<(td|th)\b[^>]*>(.*?)</\1\s*>", HtmlOptions);

    /// <summary>
    /// Extracts structured text from the file at <paramref name="filePath"/>.
    /// </summary>
    /// <param name="filePath">Path of the document; its extension selects the parser.</param>
    /// <returns>The extracted text, wrapped in format-specific markers.</returns>
    /// <exception cref="ArgumentException"><paramref name="filePath"/> is null or empty.</exception>
    /// <exception cref="NotSupportedException">The extension is not one of the handled formats.</exception>
    public static string ProcessDocument(string filePath)
    {
        ArgumentException.ThrowIfNullOrEmpty(filePath);

        // Invariant lowering: a culture-sensitive ToLower() maps ".TIFF" to ".tıff" under tr-TR.
        string extension = Path.GetExtension(filePath).ToLowerInvariant();

        return extension switch
        {
            ".pdf" => ProcessPdf(filePath),
            ".docx" => ProcessDocx(filePath),
            // NOTE(review): ".xls" is routed to the Open XML reader, which reads the XML-based
            // format; legacy binary workbooks will presumably fail to open - confirm intent.
            ".xlsx" or ".xls" => ProcessExcel(filePath),
            ".jpg" or ".jpeg" or ".png" or ".tiff" or ".bmp" => ProcessImage(filePath),
            ".txt" => ProcessTextFile(filePath),
            ".rtf" => ProcessRtf(filePath),
            ".html" or ".htm" => ProcessHtml(filePath),
            _ => throw new NotSupportedException($"Format {extension} not supported")
        };
    }

    /// <summary>Concatenates the structured text of every page of a PDF.</summary>
    private static string ProcessPdf(string pdfPath)
    {
        var result = new StringBuilder();

        using var document = PdfDocument.Open(pdfPath);
        foreach (var page in document.GetPages())
        {
            result.Append(ExtractPageText(page));
        }

        return result.ToString();
    }

    /// <summary>
    /// Rebuilds text lines from the words of a page (grouped by baseline, top to bottom)
    /// and tags each line as header, label/value pair, list item, data row or plain text.
    /// </summary>
    private static string ExtractPageText(Page page)
    {
        var sb = new StringBuilder();

        var rows = page.GetWords()
            .GroupBy(w => Math.Round(w.BoundingBox.Bottom, 1))
            .OrderByDescending(g => g.Key);

        foreach (var row in rows)
        {
            var lineWords = row.OrderBy(w => w.BoundingBox.Left).ToList();
            string line = string.Join(" ", lineWords.Select(w => w.Text)).Trim();

            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            if (IsPotentialHeader(lineWords))
            {
                sb.AppendLine($"# {line}");
            }
            else if (IsLabelValuePair(line))
            {
                var (label, value) = SplitLabelValue(line);
                sb.AppendLine($"{label}: {value}");
            }
            else if (IsListItem(line))
            {
                sb.AppendLine($"- {line}");
            }
            else if (IsDataRow(line))
            {
                sb.AppendLine(FormatDataRowConcise(line));
            }
            else
            {
                sb.AppendLine(line);
            }
        }

        return sb.ToString();
    }

    /// <summary>A line is a header candidate when its first word is larger than 12pt or uses a bold font.</summary>
    private static bool IsPotentialHeader(List<Word> words)
    {
        if (words.Count == 0)
        {
            return false;
        }

        var firstWord = words[0];
        double fontSize = firstWord.BoundingBox.Height;
        bool isBold = false;

        if (firstWord.Letters.Count > 0)
        {
            var firstLetter = firstWord.Letters[0];
            fontSize = firstLetter.FontSize;
            // The font name may be missing; treat that as "not bold" instead of throwing.
            isBold = firstLetter.FontName?.Contains("bold", StringComparison.OrdinalIgnoreCase) == true;
        }

        return fontSize > 12 || isBold;
    }

    private static bool IsLabelValuePair(string line) => LabelValueRegex.IsMatch(line);

    /// <summary>Splits at the first colon; the value is empty when there is no colon.</summary>
    private static (string Label, string Value) SplitLabelValue(string line)
    {
        var parts = line.Split(':', 2);
        return parts.Length == 2 ? (parts[0].Trim(), parts[1].Trim()) : (line, "");
    }

    private static bool IsListItem(string line)
    {
        string trimmed = line.TrimStart();
        return trimmed.StartsWith('•') ||
               trimmed.StartsWith('-') ||
               trimmed.StartsWith('*') ||
               NumberedItemRegex.IsMatch(trimmed);
    }

    /// <summary>A data row has at least one number with a short unit and at least two numbers overall.</summary>
    private static bool IsDataRow(string line)
    {
        return ContainsNumberWithUnit(line) && NumberRegex.Matches(line).Count >= 2;
    }

    private static bool ContainsNumberWithUnit(string line) => NumberWithUnitRegex.IsMatch(line);

    /// <summary>
    /// Renders a data row as "- descriptor | qty unit | ... | bare number | ...":
    /// first every number-with-unit, then every number that is not part of one.
    /// </summary>
    private static string FormatDataRowConcise(string line)
    {
        var leadingText = LeadingTextRegex.Match(line);
        string descriptor = leadingText.Success ? leadingText.Groups[1].Value.Trim() : "";

        var quantities = NumberWithUnitRegex.Matches(line);

        var sb = new StringBuilder("- ");
        sb.Append(descriptor);

        foreach (Match quantity in quantities)
        {
            sb.Append($" | {quantity.Groups[1].Value} {quantity.Groups[2].Value}");
        }

        foreach (Match number in NumberRegex.Matches(line))
        {
            bool partOfQuantity = quantities.Any(q =>
                number.Index >= q.Index && number.Index < q.Index + q.Length);

            if (!partOfQuantity)
            {
                sb.Append($" | {number.Value}");
            }
        }

        return sb.ToString();
    }

    /// <summary>Renders every sheet of a workbook as a markdown-style table between sheet markers.</summary>
    /// <exception cref="InvalidDataException">The package has no workbook part.</exception>
    private static string ProcessExcel(string filePath)
    {
        var structuredContent = new StringBuilder();
        structuredContent.AppendLine("[SPREADSHEET_START]");

        using var document = SpreadsheetDocument.Open(filePath, false);
        var workbookPart = document.WorkbookPart
            ?? throw new InvalidDataException($"Spreadsheet '{filePath}' has no workbook part.");

        // Materialise once: Count()/ElementAt() on the table walk its children on every call.
        var sharedStrings = workbookPart.SharedStringTablePart?.SharedStringTable?
            .Elements<SharedStringItem>()
            .Select(item => item.InnerText)
            .ToList() ?? new List<string>();

        foreach (var sheet in workbookPart.Workbook.Descendants<Sheet>())
        {
            AppendSheet(structuredContent, workbookPart, sheet, sharedStrings);
        }

        structuredContent.AppendLine("[SPREADSHEET_END]");
        return structuredContent.ToString();
    }

    /// <summary>Appends one sheet; sheets without row data (including chart sheets) are marked empty.</summary>
    private static void AppendSheet(
        StringBuilder output,
        WorkbookPart workbookPart,
        Sheet sheet,
        IReadOnlyList<string> sharedStrings)
    {
        output.AppendLine($"[SHEET:{sheet.Name}]");

        var sheetData = sheet.Id?.Value is { } relationshipId &&
                        workbookPart.GetPartById(relationshipId) is WorksheetPart worksheetPart
            ? worksheetPart.Worksheet.Elements<SheetData>().FirstOrDefault()
            : null;

        var rows = sheetData?.Elements<Row>()
            .Select(row => MapCellsByColumn(row, sharedStrings))
            .ToList() ?? new List<Dictionary<int, string>>();

        if (rows.Count == 0)
        {
            output.AppendLine("[EMPTY_SHEET]");
            output.AppendLine($"[/SHEET:{sheet.Name}]");
            return;
        }

        // Width is the highest used column index + 1. Counting cells would drop
        // values from sparse rows (e.g. a row that only fills A and D).
        int columnCount = rows.Max(r => r.Count == 0 ? 0 : r.Keys.Max() + 1);
        bool firstRow = true;

        foreach (var row in rows)
        {
            var rowContent = new StringBuilder("|");
            for (int column = 0; column < columnCount; column++)
            {
                string cellValue = row.TryGetValue(column, out var value) ? value : "";
                rowContent.Append($" {cellValue} |");
            }

            output.AppendLine(rowContent.ToString());

            if (firstRow && columnCount > 0)
            {
                AppendSeparatorRow(output, columnCount);
                firstRow = false;
            }
        }

        output.AppendLine($"[/SHEET:{sheet.Name}]");
    }

    /// <summary>Maps zero-based column index to cell text; the first cell wins on duplicate references.</summary>
    private static Dictionary<int, string> MapCellsByColumn(Row row, IReadOnlyList<string> sharedStrings)
    {
        var cellsByColumn = new Dictionary<int, string>();
        int nextColumn = 0;

        foreach (var cell in row.Elements<Cell>())
        {
            int column = GetColumnIndex(GetColumnId(cell.CellReference?.Value ?? ""));
            if (column < 0)
            {
                // No usable reference: fall back to the position after the previous cell.
                column = nextColumn;
            }

            cellsByColumn.TryAdd(column, GetCellValue(cell, sharedStrings));
            nextColumn = column + 1;
        }

        return cellsByColumn;
    }

    /// <summary>Appends a markdown header separator ("| --- | --- |") with the given width.</summary>
    private static void AppendSeparatorRow(StringBuilder output, int columnCount)
    {
        var separator = new StringBuilder("|");
        for (int i = 0; i < columnCount; i++)
        {
            separator.Append(" --- |");
        }

        output.AppendLine(separator.ToString());
    }

    /// <summary>Returns the leading letters of a cell reference ("BC12" gives "BC").</summary>
    private static string GetColumnId(string cellReference)
    {
        if (string.IsNullOrEmpty(cellReference))
        {
            return "";
        }

        return new string(cellReference.TakeWhile(char.IsLetter).ToArray());
    }

    /// <summary>Converts column letters to a zero-based index ("A" is 0, "AA" is 26); -1 for an empty id.</summary>
    private static int GetColumnIndex(string columnId)
    {
        int index = 0;
        foreach (char c in columnId)
        {
            index = (index * 26) + (char.ToUpperInvariant(c) - 'A' + 1);
        }

        return index - 1;
    }

    /// <summary>Resolves the display text of a cell, following shared-string indices and inline strings.</summary>
    private static string GetCellValue(Cell cell, IReadOnlyList<string> sharedStrings)
    {
        if (cell.CellValue is null)
        {
            // Inline strings carry their text in an <is> child instead of <v>.
            return cell.InlineString?.InnerText ?? string.Empty;
        }

        string value = cell.CellValue.Text;

        if (cell.DataType is { } dataType && dataType.Value == CellValues.SharedString &&
            int.TryParse(value, out int index) && index >= 0 && index < sharedStrings.Count)
        {
            return sharedStrings[index];
        }

        return value;
    }

    /// <summary>Renders a Word document: metadata block, headings, paragraphs and tables in body order.</summary>
    private static string ProcessDocx(string filePath)
    {
        var structuredContent = new StringBuilder();
        structuredContent.AppendLine("[DOCUMENT_START]");

        using var document = WordprocessingDocument.Open(filePath, false);
        var body = document.MainDocumentPart?.Document?.Body;

        structuredContent.AppendLine("[METADATA_START]");
        AppendMetadata(structuredContent, "Title", document.PackageProperties.Title);
        AppendMetadata(structuredContent, "Author", document.PackageProperties.Creator);
        AppendMetadata(structuredContent, "Subject", document.PackageProperties.Subject);
        structuredContent.AppendLine("[METADATA_END]");

        // A package without a main body yields metadata only instead of a NullReferenceException.
        if (body is not null)
        {
            foreach (var element in body.Elements())
            {
                switch (element)
                {
                    case Paragraph paragraph:
                        AppendParagraph(structuredContent, paragraph);
                        break;
                    case WordTable table:
                        structuredContent.AppendLine("[TABLE_START]");
                        FormatWordTableAsMarkdown(table, structuredContent);
                        structuredContent.AppendLine("[TABLE_END]");
                        break;
                }
            }
        }

        structuredContent.AppendLine("[DOCUMENT_END]");
        return structuredContent.ToString();
    }

    private static void AppendMetadata(StringBuilder output, string label, string? value)
    {
        if (!string.IsNullOrEmpty(value))
        {
            output.AppendLine($"{label}: {value}");
        }
    }

    private static void AppendParagraph(StringBuilder output, Paragraph paragraph)
    {
        string text = ExtractTextFromParagraph(paragraph);
        if (string.IsNullOrWhiteSpace(text))
        {
            return;
        }

        output.AppendLine(IsParagraphHeading(paragraph) ? $"[HEADING]{text}[/HEADING]" : text);
    }

    /// <summary>
    /// Concatenates the text runs of a paragraph. Runs are joined without a separator:
    /// Word splits a single word across runs, and spaces are already part of the run text.
    /// </summary>
    private static string ExtractTextFromParagraph(Paragraph paragraph)
    {
        return string.Concat(paragraph.Descendants<Text>().Select(t => t.Text));
    }

    private static bool IsParagraphHeading(Paragraph paragraph)
    {
        var styleId = paragraph.ParagraphProperties?.ParagraphStyleId?.Val?.Value;
        return styleId != null &&
               (styleId.StartsWith("Heading", StringComparison.Ordinal) ||
                styleId.StartsWith("Title", StringComparison.Ordinal));
    }

    /// <summary>Writes a Word table as markdown rows, with a separator after the first row.</summary>
    private static void FormatWordTableAsMarkdown(WordTable table, StringBuilder output)
    {
        bool isFirstRow = true;

        foreach (var row in table.Elements<WordTableRow>())
        {
            var cells = row.Elements<WordTableCell>().ToList();
            var rowBuilder = new StringBuilder("|");

            foreach (var cell in cells)
            {
                // Paragraphs inside a cell are separated by a space, runs inside a paragraph are not.
                string cellText = string.Join(
                    " ",
                    cell.Descendants<Paragraph>().Select(ExtractTextFromParagraph).Where(t => t.Length > 0));
                rowBuilder.Append($" {cellText.Trim()} |");
            }

            output.AppendLine(rowBuilder.ToString());

            if (isFirstRow)
            {
                isFirstRow = false;
                AppendSeparatorRow(output, cells.Count);
            }
        }
    }

    /// <summary>
    /// Runs OCR on an image and emits the recognised lines; lines containing a tab or a
    /// wide gap are wrapped in table markers. OCR failures are reported inline as
    /// an [OCR_ERROR: ...] line rather than thrown.
    /// </summary>
    private static string ProcessImage(string filePath)
    {
        var structuredContent = new StringBuilder();
        structuredContent.AppendLine("[IMAGE_DOCUMENT_START]");

        try
        {
            using var engine = new TesseractEngine(TessDataPath, OcrLanguage, EngineMode.Default);
            using var img = Pix.LoadFromFile(filePath);
            using var page = engine.Process(img);

            bool inTable = false;

            foreach (var line in page.GetText().Split('\n'))
            {
                bool looksLikeTableRow =
                    line.Contains('\t') || line.Contains(ColumnGap, StringComparison.Ordinal);

                if (looksLikeTableRow && !inTable)
                {
                    structuredContent.AppendLine("[TABLE_START]");
                    inTable = true;
                }
                else if (!looksLikeTableRow && inTable)
                {
                    structuredContent.AppendLine("[TABLE_END]");
                    inTable = false;
                }

                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                if (inTable)
                {
                    string formattedLine = WideWhitespaceRegex.Replace(line.Trim(), " | ");
                    formattedLine = formattedLine.Replace('\t', '|');
                    structuredContent.AppendLine($"|{formattedLine}|");
                }
                else
                {
                    structuredContent.AppendLine(line);
                }
            }

            if (inTable)
            {
                structuredContent.AppendLine("[TABLE_END]");
            }
        }
        catch (Exception ex)
        {
            // Best effort by design: a broken image must not abort the whole import.
            structuredContent.AppendLine($"[OCR_ERROR: {ex.Message}]");
        }

        structuredContent.AppendLine("[IMAGE_DOCUMENT_END]");
        return structuredContent.ToString();
    }

    /// <summary>Emits the non-blank lines of a text file, wrapping runs of column-like lines in table markers.</summary>
    private static string ProcessTextFile(string filePath)
    {
        var structuredContent = new StringBuilder();
        structuredContent.AppendLine("[TEXT_DOCUMENT_START]");

        bool inTable = false;

        foreach (var line in File.ReadAllLines(filePath))
        {
            bool looksLikeTableRow = IsLikelyTableRow(line);

            if (looksLikeTableRow && !inTable)
            {
                structuredContent.AppendLine("[TABLE_START]");
                inTable = true;
            }
            else if (!looksLikeTableRow && inTable)
            {
                structuredContent.AppendLine("[TABLE_END]");
                inTable = false;
            }

            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            structuredContent.AppendLine(inTable ? FormatPlainTextTableRow(line) : line);
        }

        if (inTable)
        {
            structuredContent.AppendLine("[TABLE_END]");
        }

        structuredContent.AppendLine("[TEXT_DOCUMENT_END]");
        return structuredContent.ToString();
    }

    /// <summary>A line looks tabular when it has at least two tabs or at least two gaps of 3+ spaces.</summary>
    private static bool IsLikelyTableRow(string line)
    {
        if (line.Count(c => c == '\t') >= 2)
        {
            return true;
        }

        return SpaceGapRegex.Matches(line).Count >= 2;
    }

    /// <summary>Replaces tabs and wide whitespace gaps with pipes and ensures leading and trailing pipes.</summary>
    private static string FormatPlainTextTableRow(string line)
    {
        string formatted = WideWhitespaceRegex.Replace(line.Replace('\t', '|'), "|");

        if (!formatted.StartsWith('|'))
        {
            formatted = "|" + formatted;
        }

        if (!formatted.EndsWith('|'))
        {
            formatted += "|";
        }

        return formatted;
    }

    /// <summary>
    /// Emits one trimmed line per RTF paragraph. Failures are reported inline as
    /// an [RTF_PROCESSING_ERROR: ...] line rather than thrown.
    /// </summary>
    private static string ProcessRtf(string filePath)
    {
        var structuredContent = new StringBuilder();
        structuredContent.AppendLine("[RTF_DOCUMENT_START]");

        try
        {
            string plainText = ConvertRtfToPlainText(File.ReadAllText(filePath));
            var lines = plainText.Split(['\r', '\n'], StringSplitOptions.RemoveEmptyEntries);

            foreach (var line in lines)
            {
                if (!string.IsNullOrWhiteSpace(line))
                {
                    structuredContent.AppendLine(line.Trim());
                }
            }
        }
        catch (Exception ex)
        {
            structuredContent.AppendLine($"[RTF_PROCESSING_ERROR: {ex.Message}]");
        }

        structuredContent.AppendLine("[RTF_DOCUMENT_END]");
        return structuredContent.ToString();
    }

    /// <summary>
    /// Heuristic RTF-to-text conversion: drops everything before "\viewkind4" when present,
    /// turns \par and \line into line breaks, strips the remaining control words, braces and
    /// backslashes, and collapses other whitespace. Hex escapes (\'hh) are not decoded.
    /// </summary>
    private static string ConvertRtfToPlainText(string rtfText)
    {
        string plainText = rtfText;
        int headerEnd = plainText.IndexOf("\\viewkind4", StringComparison.Ordinal);

        if (headerEnd > 0)
        {
            plainText = plainText.Substring(headerEnd);
        }

        // Mark paragraph breaks first so they survive the whitespace collapse below;
        // raw line breaks in the RTF source are not content and are folded into spaces.
        plainText = RtfParagraphRegex.Replace(plainText, RtfParagraphMark.ToString());
        plainText = RtfControlWordRegex.Replace(plainText, " ");
        plainText = plainText.Replace("{", "").Replace("}", "");
        plainText = plainText.Replace("\\", "");
        plainText = WhitespaceRegex.Replace(plainText, " ");
        plainText = plainText.Replace(RtfParagraphMark, '\n');

        return plainText.Trim();
    }

    /// <summary>
    /// Emits the tables of an HTML file as markdown, followed by the tag-stripped text of the
    /// whole page. Failures are reported inline as an [HTML_PROCESSING_ERROR: ...] line.
    /// </summary>
    private static string ProcessHtml(string filePath)
    {
        var structuredContent = new StringBuilder();
        structuredContent.AppendLine("[HTML_DOCUMENT_START]");

        try
        {
            string htmlText = File.ReadAllText(filePath);
            string plainText = StripHtmlTags(htmlText);
            ExtractTablesFromHtml(htmlText, structuredContent);

            var lines = plainText.Split(['\r', '\n'], StringSplitOptions.RemoveEmptyEntries);
            foreach (var line in lines)
            {
                if (!string.IsNullOrWhiteSpace(line))
                {
                    structuredContent.AppendLine(line.Trim());
                }
            }
        }
        catch (Exception ex)
        {
            structuredContent.AppendLine($"[HTML_PROCESSING_ERROR: {ex.Message}]");
        }

        structuredContent.AppendLine("[HTML_DOCUMENT_END]");
        return structuredContent.ToString();
    }

    /// <summary>Removes script/style blocks and tags, then decodes entities such as &amp;amp;.</summary>
    private static string StripHtmlTags(string html)
    {
        string withoutCode = HtmlNonContentRegex.Replace(html, " ");
        string withoutTags = HtmlTagRegex.Replace(withoutCode, " ");
        return System.Net.WebUtility.HtmlDecode(withoutTags);
    }

    /// <summary>Appends every table element as a markdown table between table markers.</summary>
    private static void ExtractTablesFromHtml(string html, StringBuilder output)
    {
        foreach (Match tableMatch in HtmlTableRegex.Matches(html))
        {
            output.AppendLine("[TABLE_START]");
            bool isFirstRow = true;

            foreach (Match rowMatch in HtmlRowRegex.Matches(tableMatch.Groups[1].Value))
            {
                var cellMatches = HtmlCellRegex.Matches(rowMatch.Groups[1].Value);
                var rowBuilder = new StringBuilder("|");

                foreach (Match cellMatch in cellMatches)
                {
                    string cellContent = HtmlTagRegex.Replace(cellMatch.Groups[2].Value, "");
                    cellContent = System.Net.WebUtility.HtmlDecode(cellContent);
                    rowBuilder.Append($" {cellContent.Trim()} |");
                }

                output.AppendLine(rowBuilder.ToString());

                if (isFirstRow)
                {
                    AppendSeparatorRow(output, cellMatches.Count);
                    isFirstRow = false;
                }
            }

            output.AppendLine("[TABLE_END]");
        }
    }
}
/// <summary>
/// Converts every file and stream document in <paramref name="options"/> to structured text,
/// stores the text in <c>TextData</c> under the document's name, and then empties
/// <c>FileData</c> / <c>StreamData</c> so the documents are not imported a second time.
/// </summary>
/// <param name="options">Memory options to rewrite in place; any of its collections may be null.</param>
/// <param name="cancellationToken">Cancels between documents and during stream copies.</param>
/// <exception cref="ArgumentException">A document name is already present in <c>TextData</c>.</exception>
/// <exception cref="NotSupportedException">A document has an extension the processor does not handle.</exception>
private static async Task PreprocessAvailableDocuments(ChatMemoryOptions options, CancellationToken cancellationToken)
{
    // Callers may populate only one collection (e.g. FileData alone), so none can be assumed non-null.
    if (options.TextData is null)
    {
        options.TextData = [];
    }

    var textData = options.TextData;

    if (options.FileData is { Count: > 0 } files)
    {
        foreach (var (name, path) in files)
        {
            cancellationToken.ThrowIfCancellationRequested();
            textData.Add(name, DocumentProcessor.ProcessDocument(path));
        }

        options.FileData = [];
    }

    if (options.StreamData is { Count: > 0 } streams)
    {
        foreach (var (name, content) in streams)
        {
            cancellationToken.ThrowIfCancellationRequested();

            // Unique name so concurrent requests with the same file name cannot clash;
            // the extension is kept because the processor dispatches on it.
            string tempPath = Path.Combine(
                Path.GetTempPath(),
                $"{Guid.NewGuid():N}{Path.GetExtension(name)}");

            try
            {
                await using (var tempFile = new FileStream(tempPath, FileMode.Create, FileAccess.Write))
                {
                    await content.CopyToAsync(tempFile, cancellationToken);
                }

                textData.Add(name, DocumentProcessor.ProcessDocument(tempPath));
            }
            finally
            {
                // Never leave document copies behind in the temp directory.
                File.Delete(tempPath);
            }
        }

        options.StreamData = [];
    }
}
Memory { get; set; } -} - internal class ChatMessage(string role, string content) { public string Role { get; set; } = role; diff --git a/src/MaIN.Services/Services/LLMService/Utils/ChatHelper.cs b/src/MaIN.Services/Services/LLMService/Utils/ChatHelper.cs index 33ae77aa..b3ac6bca 100644 --- a/src/MaIN.Services/Services/LLMService/Utils/ChatHelper.cs +++ b/src/MaIN.Services/Services/LLMService/Utils/ChatHelper.cs @@ -67,12 +67,15 @@ public static ChatMemoryOptions ExtractMemoryOptions(Message message) var streamData = message.Files .Where(x => x.StreamContent != null) .ToDictionary(x => x.Name, x => x.StreamContent!); + + var preProcess = message.Properties.ContainsKey(Constants.ServiceConstants.Messages.PreProcessProperty); return new ChatMemoryOptions { TextData = textData, FileData = fileData, - StreamData = streamData + StreamData = streamData, + PreProcess = preProcess }; } } \ No newline at end of file diff --git a/src/MaIN.Services/Services/Steps/Commands/FetchCommandHandler.cs b/src/MaIN.Services/Services/Steps/Commands/FetchCommandHandler.cs index fad9c197..482371b4 100644 --- a/src/MaIN.Services/Services/Steps/Commands/FetchCommandHandler.cs +++ b/src/MaIN.Services/Services/Steps/Commands/FetchCommandHandler.cs @@ -87,7 +87,8 @@ private async Task HandleFileSource(FetchCommand command, Dictionary { { fileData!.Name, fileData.Path } } + FileData = new Dictionary { { fileData!.Name, fileData.Path } }, + PreProcess = fileData.PreProcess } ); result!.Message.Role = command.ResponseType == FetchResponseType.AS_System ? "System" : "Assistant";