diff --git a/apps/api/src/questionnaire/questionnaire.controller.spec.ts b/apps/api/src/questionnaire/questionnaire.controller.spec.ts index 4e6ea5a3e1..080526fec3 100644 --- a/apps/api/src/questionnaire/questionnaire.controller.spec.ts +++ b/apps/api/src/questionnaire/questionnaire.controller.spec.ts @@ -321,13 +321,16 @@ describe('QuestionnaireController', () => { source: 'internal', }; mockService.uploadAndParse.mockResolvedValue({ - questionnaireId: 'q1', - totalQuestions: 10, + runId: 'run_123', + publicAccessToken: 'token_123', }); const result = await controller.uploadAndParse(dto as any, 'org_1'); - expect(result).toEqual({ questionnaireId: 'q1', totalQuestions: 10 }); + expect(result).toEqual({ + runId: 'run_123', + publicAccessToken: 'token_123', + }); }); it('should override body organizationId with auth-derived org', async () => { @@ -339,8 +342,8 @@ describe('QuestionnaireController', () => { source: 'internal', }; mockService.uploadAndParse.mockResolvedValue({ - questionnaireId: 'q1', - totalQuestions: 10, + runId: 'run_123', + publicAccessToken: 'token_123', }); await controller.uploadAndParse(dto as any, 'org_1'); diff --git a/apps/api/src/questionnaire/questionnaire.service.spec.ts b/apps/api/src/questionnaire/questionnaire.service.spec.ts index 21d009943c..2bc89940ed 100644 --- a/apps/api/src/questionnaire/questionnaire.service.spec.ts +++ b/apps/api/src/questionnaire/questionnaire.service.spec.ts @@ -36,9 +36,14 @@ jest.mock('@/trigger/questionnaire/answer-question-helpers', () => ({ generateAnswerWithRAGBatch: jest.fn(), })); +jest.mock('@trigger.dev/sdk', () => ({ + tasks: { + trigger: jest.fn(), + }, +})); + jest.mock('./utils/content-extractor', () => ({ extractContentFromFile: jest.fn(), - extractQuestionsWithAI: jest.fn(), })); jest.mock('./utils/question-parser', () => ({ @@ -59,13 +64,13 @@ jest.mock('./utils/questionnaire-storage', () => ({ import { db } from '@db'; import { syncManualAnswerToVector } from '@/vector-store/lib'; import { answerQuestion } from '@/trigger/questionnaire/answer-question'; +import { tasks } from '@trigger.dev/sdk'; import { updateAnsweredCount, persistQuestionnaireResult, uploadQuestionnaireFile, saveGeneratedAnswer, } from './utils/questionnaire-storage'; -import { extractQuestionsWithAI } from './utils/content-extractor'; import { generateExportFile } from './utils/export-generator'; const mockDb = db as jest.Mocked; @@ -218,7 +223,7 @@ describe('QuestionnaireService', () => { (mockDb.questionnaire.findUnique as jest.Mock).mockResolvedValue(null); await expect(service.deleteById('missing', 'org_1')).rejects.toThrow( - 'Questionnaire not found', + 'Questionnaire with ID missing not found', ); expect(mockDb.questionnaire.delete).not.toHaveBeenCalled(); }); @@ -454,16 +459,15 @@ describe('QuestionnaireService', () => { }); describe('uploadAndParse', () => { - it('should upload file, parse questions, and persist', async () => { + it('should upload file and trigger async parsing', async () => { (uploadQuestionnaireFile as jest.Mock).mockResolvedValue({ s3Key: 'key', fileSize: 1024, }); - (extractQuestionsWithAI as jest.Mock).mockResolvedValue([ - { question: 'Q1?', answer: null }, - { question: 'Q2?', answer: null }, - ]); - (persistQuestionnaireResult as jest.Mock).mockResolvedValue('q1'); + (tasks.trigger as jest.Mock).mockResolvedValue({ + id: 'run_123', + publicAccessToken: 'token_123', + }); const result = await service.uploadAndParse({ organizationId: 'org_1', @@ -473,23 +477,23 @@ describe('QuestionnaireService', () => { source: 'internal', } as any); - expect(result).toEqual({ questionnaireId: 'q1', totalQuestions: 2 }); + expect(result).toEqual({ + runId: 'run_123', + publicAccessToken: 'token_123', + }); expect(uploadQuestionnaireFile).toHaveBeenCalled(); - expect(extractQuestionsWithAI).toHaveBeenCalledWith( - 'base64data', - 'application/pdf', - expect.any(Object), - ); - expect(persistQuestionnaireResult).toHaveBeenCalled(); - }); - - it('should throw when persist returns null', async () => { - (uploadQuestionnaireFile as jest.Mock).mockResolvedValue({ + expect(tasks.trigger).toHaveBeenCalledWith('parse-questionnaire', { + inputType: 's3', + organizationId: 'org_1', s3Key: 'key', + fileName: 'test.pdf', + fileType: 'application/pdf', fileSize: 1024, }); - (extractQuestionsWithAI as jest.Mock).mockResolvedValue([]); - (persistQuestionnaireResult as jest.Mock).mockResolvedValue(null); + }); + + it('should throw when upload fails', async () => { + (uploadQuestionnaireFile as jest.Mock).mockResolvedValue(null); await expect( service.uploadAndParse({ @@ -498,7 +502,7 @@ describe('QuestionnaireService', () => { fileType: 'application/pdf', fileData: 'base64data', } as any), - ).rejects.toThrow('Failed to save questionnaire'); + ).rejects.toThrow('Failed to upload questionnaire file to S3'); }); }); diff --git a/apps/api/src/questionnaire/questionnaire.service.ts b/apps/api/src/questionnaire/questionnaire.service.ts index 6714c441d5..ba0e91dac2 100644 --- a/apps/api/src/questionnaire/questionnaire.service.ts +++ b/apps/api/src/questionnaire/questionnaire.service.ts @@ -5,10 +5,7 @@ import { generateAnswerWithRAGBatch } from '@/trigger/questionnaire/answer-quest import { tasks } from '@trigger.dev/sdk'; import type { parseQuestionnaireTask } from '@/trigger/questionnaire/parse-questionnaire'; import { ParseQuestionnaireDto } from './dto/parse-questionnaire.dto'; -import { - ExportQuestionnaireDto, - type QuestionnaireExportFormat, -} from './dto/export-questionnaire.dto'; +import { ExportQuestionnaireDto } from './dto/export-questionnaire.dto'; import { AnswerSingleQuestionDto } from './dto/answer-single-question.dto'; import { SaveAnswerDto } from './dto/save-answer.dto'; import { DeleteAnswerDto } from './dto/delete-answer.dto'; @@ -24,13 +21,9 @@ import AdmZip from 'adm-zip'; // Import shared utilities import { extractContentFromFile, - extractQuestionsWithAI, type ContentExtractionLogger, } from './utils/content-extractor'; -import { - parseQuestionsAndAnswers, - type QuestionAnswer as ParsedQA, -} from './utils/question-parser'; +import { parseQuestionsAndAnswers } from './utils/question-parser'; import { generateExportFile, type ExportFormat, @@ -85,12 +78,15 @@ export class QuestionnaireService { async parseQuestionnaire( dto: ParseQuestionnaireDto, ): Promise { - // Use faster AI-powered extraction (combines extraction + parsing in one step) - const questionsAndAnswers = await extractQuestionsWithAI( + const extractedContent = await extractContentFromFile( dto.fileData, dto.fileType, this.contentLogger, ); + const questionsAndAnswers = await parseQuestionsAndAnswers( + extractedContent, + this.contentLogger, + ); return { vendorName: dto.vendorName, @@ -122,24 +118,33 @@ export class QuestionnaireService { ); } - console.log(Date.now(), 'Parsing questionnaire'); - // Use faster AI-powered extraction (combines extraction + parsing in one step) - const questionsAndAnswers = await extractQuestionsWithAI( + this.logger.log('Parsing questionnaire for auto-answer export'); + const extractedContent = await extractContentFromFile( dto.fileData, dto.fileType, this.contentLogger, ); - console.log(Date.now(), 'Parsed questionnaire'); + const questionsAndAnswers = await parseQuestionsAndAnswers( + extractedContent, + this.contentLogger, + ); + this.logger.log('Parsed questionnaire for auto-answer export', { + questionCount: questionsAndAnswers.length, + }); - console.log(Date.now(), 'Generating answers for questions'); + this.logger.log('Generating answers for parsed questionnaire', { + questionCount: questionsAndAnswers.length, + }); const answered = await this.generateAnswersForQuestions( questionsAndAnswers.map((qa) => ({ question: qa.question, - answer: qa.answer, + answer: null, })), dto.organizationId, ); - console.log(Date.now(), 'Generated answers for questions'); + this.logger.log('Generated questionnaire answers', { + questionCount: answered.length, + }); const vendorName = dto.vendorName || dto.fileName || 'questionnaire'; @@ -186,7 +191,7 @@ export class QuestionnaireService { // Single format export (default behavior) const exportFile = await generateExportFile( answered.map((a) => ({ question: a.question, answer: a.answer })), - dto.format as ExportFormat, + dto.format, vendorName, ); @@ -432,7 +437,7 @@ export class QuestionnaireService { return await generateExportFile( questionsAndAnswers, - dto.format as ExportFormat, + dto.format, questionnaire.filename, ); } diff --git a/apps/api/src/questionnaire/utils/constants.ts b/apps/api/src/questionnaire/utils/constants.ts index 972d96ea76..8431868f11 100644 --- a/apps/api/src/questionnaire/utils/constants.ts +++ b/apps/api/src/questionnaire/utils/constants.ts @@ -2,10 +2,11 @@ * Shared constants for questionnaire module */ -// Chunk sizes for question-aware parsing -export const MAX_CHUNK_SIZE_CHARS = 80_000; +// Chunk sizes for questionnaire item classification +export const MAX_CHUNK_SIZE_CHARS = 25_000; export const MIN_CHUNK_SIZE_CHARS = 5_000; export const MAX_QUESTIONS_PER_CHUNK = 1; +export const MAX_CLASSIFICATION_CONCURRENCY = 4; // File size limits export const MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024; // 100MB @@ -30,10 +31,10 @@ CRITICAL RULES: 8. Always write in first person plural (we, our, us) as if speaking on behalf of the organization. 9. Keep answers to 1-3 sentences maximum unless the question explicitly requires more detail.`; -export const QUESTION_PARSING_SYSTEM_PROMPT = `You parse vendor questionnaires from Excel, PDF, or document text. Extract all items the respondent is expected to answer (questions, prompts, or compliance statements) and pair each with its answer if one exists. +export const QUESTION_PARSING_SYSTEM_PROMPT = `You parse vendor questionnaires from Excel, PDF, images, CSV, or document text. Your job is to classify content and return only answerable questionnaire items. What counts as an item to extract: -1. Interrogative questions ending with "?" or starting with What/How/Why/When/Where/Is/Are/Do/Does/Can/Will/Should. +1. Interrogative questions in any language. A question mark is helpful but not required. 2. Form fields like "1.1 Vendor Name", "Contact Email", "Company Address" (numbered or labeled fields requesting information). 3. Compliance/requirement statements that the respondent must confirm or describe their compliance with — vendor questionnaires often consist entirely of these. Examples: - "The organization must X" @@ -53,10 +54,13 @@ Input format hints: Rules: 1. Find the column containing the actual item text, not just IDs/numbers (e.g., skip "SQ14.3", keep the full sentence). 2. Extract the FULL text of each item. -3. Match each item to its Response/Answer text from the same row. If empty or missing, set answer = null. -4. Skip pure section headers ("Information Security Program", "General Information") UNLESS they are also items the respondent must answer. -5. Skip metadata rows (Company Name, Date, file headers). -6. NEVER return zero items if the document has any rows of substantive content — extract every row that looks like an item the respondent must address.`; +3. For upload-to-autofill parsing, always set saved answers to null. The user expects us to generate answers later. +4. Never use scoring/options values as answers, e.g. "(Oui : 0, Non : 3)" or "(Yes : 0, No : 1)". +5. Never use placeholders as answers, e.g. "A remplir", "A compléter", "To be completed". +6. Do not treat guidance, instructions, examples, mode opératoire, remediation plans, or calculated score/formula cells as answerable items. +7. Skip pure section headers ("Information Security Program", "General Information") UNLESS the text itself asks the respondent to provide information. +8. Skip metadata rows (Company Name, Date, file headers) unless the field is clearly part of the vendor questionnaire response surface. +9. Be high recall for answerable items, but do not include instructions, examples, scoring, or metadata just to avoid returning zero items.`; // Vision extraction prompt for PDFs and images export const VISION_EXTRACTION_PROMPT = `Transcribe this document into plain text. Output ONLY the document's text content — no summaries, no analysis, no commentary about what the document is or whether it contains questions. diff --git a/apps/api/src/questionnaire/utils/content-extractor.spec.ts b/apps/api/src/questionnaire/utils/content-extractor.spec.ts index 9dbdf62064..ad1fd5a069 100644 --- a/apps/api/src/questionnaire/utils/content-extractor.spec.ts +++ b/apps/api/src/questionnaire/utils/content-extractor.spec.ts @@ -1,5 +1,7 @@ import { extractContentFromFile } from './content-extractor'; import ExcelJS from 'exceljs'; +import { PDFDocument } from 'pdf-lib'; +import { generateText } from 'ai'; // Mock AI dependencies jest.mock('@ai-sdk/openai', () => ({ openai: jest.fn() })); @@ -50,6 +52,31 @@ describe('content-extractor: extractContentFromFile', () => { expect(result).toContain('Rating?'); }); + it('should ignore scoring columns and placeholders in BPCE-style sheets', async () => { + const workbook = new ExcelJS.Workbook(); + const worksheet = workbook.addWorksheet('Risk assessment SSI'); + worksheet.getCell('B10').value = + 'Le prestataire effectue-t-il des revues régulières des comptes à privilèges ?'; + worksheet.getCell('F10').value = { + formula: 'IF(E10="NON",3,0)', + result: 0, + }; + worksheet.getCell('K10').value = '(Oui : 0, Non : 3)'; + worksheet.getCell('M10').value = 'A remplir'; + worksheet.getCell('O10').value = + "La gestion des comptes à privilèges consiste à contrôler l'accès aux comptes."; + + const arrayBuffer = await workbook.xlsx.writeBuffer(); + const base64 = Buffer.from(arrayBuffer).toString('base64'); + const result = await extractContentFromFile(base64, XLSX_MIME); + + expect(result).toContain('[B10 Question]'); + expect(result).toContain('revues régulières des comptes à privilèges'); + expect(result).not.toContain('(Oui : 0, Non : 3)'); + expect(result).not.toContain('A remplir'); + expect(result).not.toContain('[F10'); + }); + it('should extract content from multiple sheets', async () => { const buffer = await createTestExcelBuffer([ { @@ -85,6 +112,32 @@ describe('content-extractor: extractContentFromFile', () => { expect(result).toContain('What is 2+2?,4'); }); + it('should fall back to OpenAI when Claude PDF extraction is overloaded', async () => { + const pdf = await PDFDocument.create(); + pdf.addPage(); + const bytes = await pdf.save(); + const mockGenerateText = generateText as jest.Mock; + mockGenerateText + .mockRejectedValueOnce(new Error('Overloaded')) + .mockResolvedValueOnce({ text: 'Extracted PDF text' }); + + const result = await extractContentFromFile( + Buffer.from(bytes).toString('base64'), + 'application/pdf', + ); + + expect(result).toBe('Extracted PDF text'); + expect(mockGenerateText).toHaveBeenCalledTimes(2); + }); + + it('should reject legacy XLS files with a clear message', async () => { + const base64 = Buffer.from('legacy-binary-xls').toString('base64'); + + await expect( + extractContentFromFile(base64, 'application/vnd.ms-excel'), + ).rejects.toThrow('Legacy Excel files'); + }); + it('should handle plain text files', async () => { const text = 'Some compliance document content'; const base64 = Buffer.from(text).toString('base64'); diff --git a/apps/api/src/questionnaire/utils/content-extractor.ts b/apps/api/src/questionnaire/utils/content-extractor.ts index 201d59b67f..cf5e39a24d 100644 --- a/apps/api/src/questionnaire/utils/content-extractor.ts +++ b/apps/api/src/questionnaire/utils/content-extractor.ts @@ -1,11 +1,12 @@ import { openai } from '@ai-sdk/openai'; import { anthropic } from '@ai-sdk/anthropic'; -import { createGroq } from '@ai-sdk/groq'; -import { generateText, generateObject, jsonSchema } from 'ai'; +import { generateText } from 'ai'; import ExcelJS from 'exceljs'; import AdmZip from 'adm-zip'; import mammoth from 'mammoth'; +import { PDFDocument } from 'pdf-lib'; import { PARSING_MODEL, VISION_EXTRACTION_PROMPT } from './constants'; +import { parseQuestionsAndAnswers } from './question-parser'; /** * Loads an Excel workbook from a buffer. @@ -17,35 +18,6 @@ async function loadWorkbook(data: Uint8Array): Promise { return workbook; } -// Initialize Groq - ultra fast inference -const groq = createGroq(); - -// Schema for question extraction -const questionExtractionSchema = jsonSchema<{ - questions: { question: string; answer: string | null }[]; -}>({ - type: 'object', - properties: { - questions: { - type: 'array', - items: { - type: 'object', - properties: { - question: { type: 'string', description: 'The full question text' }, - answer: { - anyOf: [{ type: 'string' }, { type: 'null' }], - description: 'The answer/response if provided, null if empty', - }, - }, - required: ['question', 'answer'], - additionalProperties: false, - }, - }, - }, - required: ['questions'], - additionalProperties: false, -}); - export interface ContentExtractionLogger { info: (message: string, meta?: Record) => void; warn: (message: string, meta?: Record) => void; @@ -59,6 +31,268 @@ const defaultLogger: ContentExtractionLogger = { error: () => {}, }; +interface ExtractedExcelCell { + address: string; + columnIndex: number; + value: string; + isFormula: boolean; +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null; +} + +function normalizeCellText(value: string): string { + return value.replace(/\s+/g, ' ').trim(); +} + +async function mapWithConcurrency( + items: T[], + concurrency: number, + worker: (item: T) => Promise, +): Promise { + const results: R[] = []; + let nextIndex = 0; + + const runWorker = async (): Promise => { + while (nextIndex < items.length) { + const currentIndex = nextIndex; + nextIndex += 1; + results[currentIndex] = await worker(items[currentIndex]); + } + }; + + await Promise.all( + Array.from({ length: Math.min(concurrency, items.length) }, () => + runWorker(), + ), + ); + + return results; +} + +function extractExcelCellValue(value: unknown): string { + if (value === null || value === undefined) { + return ''; + } + + if ( + typeof value === 'string' || + typeof value === 'number' || + typeof value === 'boolean' + ) { + return String(value); + } + + if (value instanceof Date) { + return value.toISOString(); + } + + if (!isRecord(value)) { + return ''; + } + + if (Array.isArray(value.richText)) { + return value.richText + .map((part) => + isRecord(part) && typeof part.text === 'string' ? part.text : '', + ) + .join(''); + } + + if (value.result !== undefined) { + return extractExcelCellValue(value.result); + } + + if (typeof value.text === 'string') { + return value.text; + } + + return ''; +} + +function getExcelCellText(cell: ExcelJS.Cell): string { + const extracted = normalizeCellText(extractExcelCellValue(cell.value)); + if (extracted) { + return extracted; + } + + try { + return normalizeCellText(cell.text); + } catch { + return ''; + } +} + +function hasFormulaValue(value: unknown): boolean { + return isRecord(value) && typeof value.formula === 'string'; +} + +function columnNameFromIndex(index: number): string { + let current = index; + let name = ''; + + while (current > 0) { + const remainder = (current - 1) % 26; + name = String.fromCharCode(65 + remainder) + name; + current = Math.floor((current - 1) / 26); + } + + return name; +} + +function normalizeForClassification(value: string): string { + return value + .normalize('NFKD') + .replace(/[\u0300-\u036f]/g, '') + .toLowerCase() + .replace(/\s+/g, ' ') + .trim(); +} + +function isPlaceholderCell(value: string): boolean { + const normalized = normalizeForClassification(value); + return /^(?:\d+#\s*-\s*)?a\s+(?:remplir|completer)$/.test(normalized); +} + +function isScoringOptionsCell(value: string): boolean { + const normalized = normalizeForClassification(value); + return /^\((?:oui|yes|non|no|n\/a|na)\s*:\s*-?\d+(?:\s*,\s*(?:oui|yes|non|no|n\/a|na)\s*:\s*-?\d+)*\)$/.test( + normalized, + ); +} + +function headerLooksLike(header: string, keywords: string[]): boolean { + const normalized = normalizeForClassification(header); + return keywords.some((keyword) => normalized.includes(keyword)); +} + +function inferCellLabel(cell: ExtractedExcelCell, header?: string): string { + if ( + header && + headerLooksLike(header, ['question', 'prompt', 'requirement']) + ) { + return 'Question'; + } + + if (header && headerLooksLike(header, ['response', 'answer', 'reply'])) { + return 'Response'; + } + + if (header && headerLooksLike(header, ['comment', 'explanation'])) { + return 'Comment'; + } + + if (header && headerLooksLike(header, ['mode operatoire', 'guidance'])) { + return 'Guidance'; + } + + if (/[??]/.test(cell.value)) { + return 'Question'; + } + + if (normalizeForClassification(cell.value).startsWith('exemple')) { + return 'Example'; + } + + return 'Cell'; +} + +function findHeaderRow(rows: ExtractedExcelCell[][]): { + rowIndex: number; + headersByColumn: Map; +} { + const headerKeywords = [ + 'question', + 'response', + 'answer', + 'comment', + 'commentaires', + 'attachment', + 'reply', + 'mode operatoire', + 'reponse', + ]; + + for (let i = 0; i < Math.min(10, rows.length); i++) { + const row = rows[i]; + const matchCount = headerKeywords.filter((keyword) => + row.some((cell) => + normalizeForClassification(cell.value).includes(keyword), + ), + ).length; + + if (matchCount >= 2) { + return { + rowIndex: i, + headersByColumn: new Map( + row.map((cell) => [cell.columnIndex, cell.value] as const), + ), + }; + } + } + + return { rowIndex: -1, headersByColumn: new Map() }; +} + +function formatExcelSheet( + name: string, + rows: ExtractedExcelCell[][], +): string | null { + if (rows.length === 0) { + return null; + } + + const formattedRows: string[] = []; + const { rowIndex: headerRowIndex, headersByColumn } = findHeaderRow(rows); + + for (let i = 0; i < rows.length; i++) { + const row = rows[i]; + + if (i === headerRowIndex) { + formattedRows.push( + `[COLUMNS: ${row.map((cell) => `${cell.address} ${cell.value}`).join(', ')}]`, + ); + continue; + } + + const parts: string[] = []; + const seenValues = new Set(); + + for (const cell of row) { + if ( + cell.isFormula || + isPlaceholderCell(cell.value) || + isScoringOptionsCell(cell.value) + ) { + continue; + } + + const normalizedValue = normalizeForClassification(cell.value); + if (seenValues.has(normalizedValue)) { + continue; + } + seenValues.add(normalizedValue); + + const header = headersByColumn.get(cell.columnIndex); + const label = inferCellLabel(cell, header); + const headerPrefix = header ? ` ${header}` : ''; + parts.push(`[${cell.address} ${label}${headerPrefix}] ${cell.value}`); + } + + if (parts.length > 0) { + const rowNumber = row[0]?.address.match(/\d+$/)?.[0] ?? String(i + 1); + formattedRows.push(`[ROW ${rowNumber}] ${parts.join(' | ')}`); + } + } + + if (formattedRows.length === 0) { + return null; + } + + return `=== Sheet: ${name} ===\n${formattedRows.join('\n')}`; +} + /** * Extracts content from a file based on its MIME type * Supports: Excel, CSV, text, PDF, and images @@ -70,6 +304,12 @@ export async function extractContentFromFile( ): Promise { const fileBuffer = Buffer.from(fileData, 'base64'); + if (fileType === 'application/vnd.ms-excel') { + throw new Error( + 'Legacy Excel files (.xls) are not reliably supported. Please convert the questionnaire to .xlsx, CSV, PDF, or DOCX.', + ); + } + // Handle Excel files (.xlsx, .xls) if (isExcelFile(fileType)) { return extractFromExcel(fileBuffer, fileType, logger); @@ -125,49 +365,12 @@ export async function extractQuestionsWithAI( const startTime = Date.now(); try { - // For Excel files - use simple library extraction then AI parsing - if (isExcelFile(fileType)) { - const fileBuffer = Buffer.from(fileData, 'base64'); - const rawContent = await extractExcelRawContent(fileBuffer, logger); - - logger.info('Extracted raw Excel content', { - contentLength: rawContent.length, - extractionMs: Date.now() - startTime, - }); - - // Use Groq for ultra-fast AI parsing (~5-10 seconds) - return await parseQuestionsWithGroq(rawContent, logger); - } - - // For CSV - simple text parsing - if (isCsvFile(fileType)) { - const fileBuffer = Buffer.from(fileData, 'base64'); - const content = fileBuffer.toString('utf-8'); - return await parseQuestionsWithGroq(content, logger); - } - - // For Word documents (.docx) - extract text then AI parsing - if (isDocxFile(fileType)) { - const fileBuffer = Buffer.from(fileData, 'base64'); - const result = await mammoth.extractRawText({ buffer: fileBuffer }); - logger.info('Extracted DOCX content', { - contentLength: result.value.length, - extractionMs: Date.now() - startTime, - }); - return await parseQuestionsWithGroq(result.value, logger); - } - - // For PDFs - use Claude's native PDF support - if (isPdfFile(fileType)) { - return await parseQuestionsFromPdf(fileData, logger); - } - - // For images - use OpenAI vision - if (isImageFile(fileType)) { - return await parseQuestionsWithVision(fileData, fileType, logger); - } - - throw new Error(`Unsupported file type for AI extraction: ${fileType}`); + const content = await extractContentFromFile(fileData, fileType, logger); + logger.info('Extracted content for questionnaire classification', { + contentLength: content.length, + extractionMs: Date.now() - startTime, + }); + return parseQuestionsAndAnswers(content, logger); } catch (error) { logger.error('AI extraction failed', { error: error instanceof Error ? error.message : 'Unknown error', @@ -178,367 +381,26 @@ export async function extractQuestionsWithAI( } /** - * Simple raw content extraction - just dump all cell values - * No smart header detection, let AI figure it out + * Structured raw content extraction for Excel files. + * ExcelJS is the primary path; XML is only a fallback for unusual workbooks. */ async function extractExcelRawContent( fileBuffer: Buffer, logger: ContentExtractionLogger, ): Promise { - // Try custom XML parser first (handles rich text) try { - const zip = new AdmZip(fileBuffer); - const sharedStrings = extractSharedStrings(fileBuffer); - const sheetNames = extractSheetNames(zip); - const allContent: string[] = []; - - for (let sheetIdx = 0; sheetIdx < sheetNames.length; sheetIdx++) { - const sheetName = sheetNames[sheetIdx]; - const rows = extractSheetData(zip, sheetIdx, sharedStrings); - - if (rows.length === 0) continue; - - allContent.push(`\n--- ${sheetName} ---`); - - for (const row of rows) { - const nonEmpty = row.filter((cell) => cell.trim()); - if (nonEmpty.length > 0) { - allContent.push(nonEmpty.join(' | ')); - } - } - } - - const result = allContent.join('\n'); - - // If custom parser got content, use it + const result = await extractFromExcelStandard(fileBuffer); if (result.length > 100) { return result; } + logger.info('ExcelJS returned minimal raw content, trying XML fallback'); } catch (error) { - logger.warn('Custom XML parser failed', { - error: error instanceof Error ? error.message : 'Unknown', - }); - } - - // Fallback to exceljs library - const workbook = await loadWorkbook(fileBuffer); - const allContent: string[] = []; - - for (const worksheet of workbook.worksheets) { - allContent.push(`\n--- ${worksheet.name} ---`); - - worksheet.eachRow((row) => { - const cells = row.values as unknown[]; - const nonEmpty = cells - .slice(1) - .map((c) => String(c ?? '').trim()) - .filter((c) => c); - if (nonEmpty.length > 0) { - allContent.push(nonEmpty.join(' | ')); - } - }); - } - - return allContent.join('\n'); -} - -const QUESTION_PROMPT = `Extract all questions/fields and their answers from this questionnaire or form data. - -Rules: -- Extract BOTH traditional questions (ending with "?") AND form fields that request information -- Form fields like "1.1 Vendor Name", "2.3 Company Address", "Contact Email" are valid questions - they request user input -- Numbered items (1.1, 1.2, 2.1, etc.) followed by a label are questions -- Items marked with "*" or "(required)" are definitely questions -- Items with notes like "(Single selection allowed)", "(Allows other)", "(Multiple selections allowed)" are questions -- Match each question/field to its response/answer from the same row or adjacent cell -- If no answer is provided, set answer to null -- Skip pure section headers (like "Section 1: General Information") but keep numbered fields within sections -- Keep the full question/field text including any notes about selection type - -Content: -`; - -/** - * Parse questions using Groq (PRIMARY - ultra fast) - * For large content: chunks and processes in parallel - */ -async function parseQuestionsWithGroq( - content: string, - logger: ContentExtractionLogger, -): Promise<{ question: string; answer: string | null }[]> { - const startTime = Date.now(); - const CHUNK_SIZE = 25000; // Leave room for prompt in 32k context - - try { - // If content fits in one chunk, process directly - if (content.length <= CHUNK_SIZE) { - logger.info('Starting Groq parsing (single chunk)...', { - contentLength: content.length, - }); - return await parseChunkWithGroq(content, logger); - } - - // Split content into chunks for parallel processing - const chunks = splitIntoChunks(content, CHUNK_SIZE); - logger.info('Starting Groq parsing (chunked)...', { - contentLength: content.length, - chunkCount: chunks.length, - }); - - // Process all chunks in parallel - const chunkResults = await Promise.all( - chunks.map((chunk, idx) => { - logger.info(`Processing chunk ${idx + 1}/${chunks.length}...`); - return parseChunkWithGroq(chunk, logger); - }), - ); - - // Merge and deduplicate results - const allQuestions = chunkResults.flat(); - const uniqueQuestions = deduplicateQuestions(allQuestions); - - logger.info('Groq chunked parsing complete', { - totalQuestions: uniqueQuestions.length, - chunksProcessed: chunks.length, - durationMs: Date.now() - startTime, - }); - - return uniqueQuestions; - } catch (error) { - logger.error('Groq parsing failed, trying Claude', { + logger.warn('ExcelJS raw extraction failed, trying XML fallback', { error: error instanceof Error ? error.message : 'Unknown', - durationMs: Date.now() - startTime, }); - - // Fallback to Claude (has 200k context, no chunking needed) - return parseQuestionsWithClaude(content, logger); } -} -/** - * Parse a single chunk with Groq - */ -async function parseChunkWithGroq( - content: string, - logger: ContentExtractionLogger, -): Promise<{ question: string; answer: string | null }[]> { - const { object } = await generateObject({ - model: groq('openai/gpt-oss-120b'), - schema: questionExtractionSchema, - prompt: QUESTION_PROMPT + content, - }); - - const result = object as { - questions: { question: string; answer: string | null }[]; - }; - - return (result.questions || []) - .map((q) => ({ - question: q.question?.trim() || '', - answer: q.answer?.trim() || null, - })) - .filter((q) => q.question); -} - -/** - * Split content into chunks, trying to break at section boundaries - */ -function splitIntoChunks(content: string, maxChunkSize: number): string[] { - const chunks: string[] = []; - const lines = content.split('\n'); - let currentChunk: string[] = []; - let currentSize = 0; - - for (const line of lines) { - const lineSize = line.length + 1; // +1 for newline - - // If adding this line would exceed limit, start new chunk - if (currentSize + lineSize > maxChunkSize && currentChunk.length > 0) { - chunks.push(currentChunk.join('\n')); - currentChunk = []; - currentSize = 0; - } - - currentChunk.push(line); - currentSize += lineSize; - } - - // Don't forget the last chunk - if (currentChunk.length > 0) { - chunks.push(currentChunk.join('\n')); - } - - return chunks; -} - -/** - * Deduplicate questions by comparing normalized text - */ -function deduplicateQuestions( - questions: { question: string; answer: string | null }[], -): { question: string; answer: string | null }[] { - const seen = new Set(); - const unique: { question: string; answer: string | null }[] = []; - - for (const q of questions) { - // Normalize: lowercase, remove extra spaces, remove numbering prefix - const normalized = q.question - .toLowerCase() - .replace(/^\d+\.\d+\s*/, '') // Remove "1.1 " prefix - .replace(/\s+/g, ' ') - .trim(); - - if (!seen.has(normalized)) { - seen.add(normalized); - unique.push(q); - } - } - - return unique; -} - -/** - * Parse questions using Claude (fallback - excellent quality) - */ -async function parseQuestionsWithClaude( - content: string, - logger: ContentExtractionLogger, -): Promise<{ question: string; answer: string | null }[]> { - const startTime = Date.now(); - - try { - logger.info('Starting Claude parsing...', { - contentLength: content.length, - }); - - const { object } = await generateObject({ - model: anthropic('claude-sonnet-4-6'), - schema: questionExtractionSchema, - prompt: QUESTION_PROMPT + content.substring(0, 80000), - }); - - const result = object as { - questions: { question: string; answer: string | null }[]; - }; - - logger.info('Claude parsing complete', { - questionCount: result.questions?.length || 0, - durationMs: Date.now() - startTime, - model: 'claude-sonnet-4-6', - }); - - return (result.questions || []) - .map((q) => ({ - question: q.question?.trim() || '', - answer: q.answer?.trim() || null, - })) - .filter((q) => q.question); - } catch (error) { - logger.error('Claude parsing failed, trying OpenAI', { - error: error instanceof Error ? error.message : 'Unknown', - durationMs: Date.now() - startTime, - }); - - // Fallback to OpenAI - return parseQuestionsWithOpenAI(content, logger); - } -} - -/** - * Fallback: Parse questions using OpenAI - */ -async function parseQuestionsWithOpenAI( - content: string, - logger: ContentExtractionLogger, -): Promise<{ question: string; answer: string | null }[]> { - const startTime = Date.now(); - - const { object } = await generateObject({ - model: openai('gpt-4o-mini'), - schema: questionExtractionSchema, - prompt: `Extract all questions/fields and their answers from this questionnaire or form. - -Include: -- Traditional questions ending with "?" -- Form fields like "1.1 Vendor Name", "Contact Email" that request input -- Numbered items (1.1, 1.2) followed by field labels -- Items marked with "*" or selection notes like "(Single selection allowed)" - -Match each to its response if provided. Set answer to null if empty. - -${content.substring(0, 80000)}`, - }); - - const result = object as { - questions: { question: string; answer: string | null }[]; - }; - - logger.info('OpenAI parsing complete', { - questionCount: result.questions?.length || 0, - durationMs: Date.now() - startTime, - }); - - return (result.questions || []) - .map((q) => ({ - question: q.question?.trim() || '', - answer: q.answer?.trim() || null, - })) - .filter((q) => q.question); -} - -/** - * Parse questions from PDF/image using vision - */ -async function parseQuestionsWithVision( - fileData: string, - fileType: string, - logger: ContentExtractionLogger, -): Promise<{ question: string; answer: string | null }[]> { - const startTime = Date.now(); - - const { object } = await generateObject({ - model: openai('gpt-4o'), - schema: questionExtractionSchema, - messages: [ - { - role: 'user', - content: [ - { - type: 'text', - text: `Extract all questions/fields and their answers from this questionnaire or form document. - -Include: -- Traditional questions ending with "?" -- Form fields like "1.1 Vendor Name", "Contact Email" that request input -- Numbered items (1.1, 1.2, 2.1) followed by field labels -- Items marked with "*" or selection notes like "(Single selection allowed)" - -Match each to its response if provided. Set answer to null if empty.`, - }, - { - type: 'image', - image: `data:${fileType};base64,${fileData}`, - }, - ], - }, - ], - }); - - const result = object as { - questions: { question: string; answer: string | null }[]; - }; - - logger.info('Vision parsing complete', { - questionCount: result.questions?.length || 0, - durationMs: Date.now() - startTime, - }); - - return (result.questions || []) - .map((q) => ({ - question: q.question?.trim() || '', - answer: q.answer?.trim() || null, - })) - .filter((q) => q.question); + return extractFromExcelXml(fileBuffer, logger); } // File type detection helpers @@ -585,7 +447,7 @@ function extractSheetNames(zip: AdmZip): string[] { const content = workbookEntry.getData().toString('utf8'); const names: string[] = []; const sheetPattern = /]+name="([^"]*)"[^>]*\/>/g; - let m; + let m: RegExpExecArray | null; while ((m = sheetPattern.exec(content)) !== null) { names.push(m[1]); @@ -644,16 +506,16 @@ function extractSheetData( zip: AdmZip, sheetIndex: number, sharedStrings: string[], -): string[][] { +): ExtractedExcelCell[][] { const sheetEntry = zip.getEntry(`xl/worksheets/sheet${sheetIndex + 1}.xml`); if (!sheetEntry) return []; const content = sheetEntry.getData().toString('utf8'); - const rows: Map> = new Map(); + const rows: Map = new Map(); - // Match all cell elements: ... - const cellPattern = /]*>[\s\S]*?<\/c>/g; - let match; + // Match normal and self-closing cells without accidentally spanning columns. + const cellPattern = /]*(?:\/>|>[\s\S]*?<\/c>)/g; + let match: RegExpExecArray | null; while ((match = cellPattern.exec(content)) !== null) { const col = match[1]; @@ -667,8 +529,7 @@ function extractSheetData( } colNum -= 1; // 0-indexed - // Only process first 10 columns - if (colNum > 9) continue; + if (colNum > 29) continue; // Check if this is a shared string cell (t="s") const isSharedString = /t="s"/.test(cellXml); @@ -687,114 +548,113 @@ function extractSheetData( } } + const trimmedValue = normalizeCellText(value); + if (!trimmedValue) { + continue; + } + if (!rows.has(rowNum)) { - rows.set(rowNum, new Map()); + rows.set(rowNum, []); } - rows.get(rowNum)!.set(colNum, value.trim()); + + rows.get(rowNum)!.push({ + address: `${col}${rowNum + 1}`, + columnIndex: colNum + 1, + value: trimmedValue, + isFormula: /)/.test(cellXml), + }); } - // Convert to 2D array - const maxRow = Math.max(...Array.from(rows.keys()), 0); - const result: string[][] = []; + const result: ExtractedExcelCell[][] = []; + const maxRow = Math.max(...Array.from(rows.keys()), -1); for (let r = 0; r <= maxRow; r++) { - const row: string[] = []; - const rowData = rows.get(r); - for (let c = 0; c <= 9; c++) { - row.push(rowData?.get(c) || ''); - } - result.push(row); + result.push(rows.get(r) ?? []); } return result; } /** - * Fallback extraction using exceljs library (for simple Excel files) + * Fallback extraction using raw worksheet XML for unusual Excel files. */ -async function extractFromExcelStandard( +function extractFromExcelXml( fileBuffer: Buffer, - _logger: ContentExtractionLogger, -): Promise { + logger: ContentExtractionLogger, +): string { + try { + const zip = new AdmZip(fileBuffer); + const sharedStrings = extractSharedStrings(fileBuffer); + const sheetNames = extractSheetNames(zip); + const sheets: string[] = []; + + logger.info('Trying XML Excel fallback', { + sharedStringCount: sharedStrings.length, + sheetCount: sheetNames.length, + }); + + for (let sheetIdx = 0; sheetIdx < sheetNames.length; sheetIdx++) { + const sheet = formatExcelSheet( + sheetNames[sheetIdx], + extractSheetData(zip, sheetIdx, sharedStrings), + ); + if (sheet) { + sheets.push(sheet); + } + } + + return sheets.join('\n\n'); + } catch (error) { + throw new Error( + `Failed to parse Excel file: ${ + error instanceof Error ? error.message : 'Unknown error' + }`, + ); + } +} + +/** + * Primary extraction using ExcelJS so rows, columns, merged cells, and formulas + * are interpreted by the workbook parser instead of regex over raw XML. + */ +async function extractFromExcelStandard(fileBuffer: Buffer): Promise { const workbook = await loadWorkbook(fileBuffer); const sheets: string[] = []; for (const worksheet of workbook.worksheets) { - const jsonData: string[][] = []; + const rows: ExtractedExcelCell[][] = []; + worksheet.eachRow((row) => { - const cells = row.values as unknown[]; - jsonData.push( - cells - .slice(1) - .map((cell) => - cell !== null && cell !== undefined ? String(cell).trim() : '', - ), - ); - }); + const cells: ExtractedExcelCell[] = []; - if (jsonData.length === 0) continue; - - const formattedRows: string[] = []; - let headerRowIndex = -1; - let columnHeaders: string[] = []; - - // Find header row - for (let i = 0; i < Math.min(10, jsonData.length); i++) { - const row = jsonData[i]; - const rowLower = row.map((cell) => cell.toLowerCase()); - const headerKeywords = [ - 'question', - 'response', - 'answer', - 'comment', - 'attachment', - ]; - const matchCount = headerKeywords.filter((kw) => - rowLower.some((cell) => cell.includes(kw)), - ).length; - - if (matchCount >= 2) { - headerRowIndex = i; - columnHeaders = row; - break; - } - } + for ( + let columnIndex = 1; + columnIndex <= Math.min(row.cellCount, 30); + columnIndex++ + ) { + const cell = row.getCell(columnIndex); + const value = getExcelCellText(cell); - // Process rows - for (let i = 0; i < jsonData.length; i++) { - const cells = jsonData[i]; - const hasContent = cells.some((cell) => cell !== ''); - if (!hasContent) continue; + if (!value) { + continue; + } - if (i === headerRowIndex) { - formattedRows.push(`[COLUMNS: ${cells.filter((c) => c).join(', ')}]`); - continue; + cells.push({ + address: `${columnNameFromIndex(columnIndex)}${row.number}`, + columnIndex, + value, + isFormula: hasFormulaValue(cell.value), + }); } - if (headerRowIndex !== -1 && i > headerRowIndex) { - const parts: string[] = []; - for (let j = 0; j < Math.min(cells.length, 10); j++) { - const header = columnHeaders[j] || `Col${j + 1}`; - const value = cells[j] || ''; - if (value) { - parts.push(`[${header}] ${value}`); - } - } - if (parts.length > 0) { - formattedRows.push(parts.join(' | ')); - } - } else { - const nonEmptyCells = cells.filter((c) => c).slice(0, 10); - if (nonEmptyCells.length > 0) { - formattedRows.push(nonEmptyCells.join(' | ')); - } + if (cells.length > 0) { + rows.push(cells); } - } + }); - if (formattedRows.length > 0) { - sheets.push( - `=== Sheet: ${worksheet.name} ===\n${formattedRows.join('\n')}`, - ); + const sheet = formatExcelSheet(worksheet.name, rows); + if (sheet) { + sheets.push(sheet); } } @@ -812,103 +672,7 @@ async function extractFromExcel( logger.info('Processing Excel file', { fileType, fileSizeMB }); - let result = ''; - - try { - // First try: Custom XML parser (handles rich text with namespace prefixes) - const zip = new AdmZip(fileBuffer); - const sharedStrings = extractSharedStrings(fileBuffer); - - logger.info('Extracted shared strings', { - count: sharedStrings.length, - }); - - const sheetNames = extractSheetNames(zip); - const sheets: string[] = []; - - for (let sheetIdx = 0; sheetIdx < sheetNames.length; sheetIdx++) { - const sheetName = sheetNames[sheetIdx]; - const rows = extractSheetData(zip, sheetIdx, sharedStrings); - - if (rows.length === 0) continue; - - const formattedRows: string[] = []; - let headerRowIndex = -1; - let columnHeaders: string[] = []; - - // Find header row - for (let i = 0; i < Math.min(10, rows.length); i++) { - const rowLower = rows[i].map((cell) => cell.toLowerCase()); - const headerKeywords = [ - 'question', - 'response', - 'answer', - 'comment', - 'attachment', - ]; - const matchCount = headerKeywords.filter((kw) => - rowLower.some((cell) => cell.includes(kw)), - ).length; - - if (matchCount >= 2) { - headerRowIndex = i; - columnHeaders = rows[i]; - break; - } - } - - // Process rows - for (let i = 0; i < rows.length; i++) { - const row = rows[i]; - const hasContent = row.some((cell) => cell !== ''); - if (!hasContent) continue; - - if (i === headerRowIndex) { - formattedRows.push(`[COLUMNS: ${row.filter((c) => c).join(', ')}]`); - continue; - } - - if (headerRowIndex !== -1 && i > headerRowIndex) { - const parts: string[] = []; - for (let j = 0; j < columnHeaders.length; j++) { - const header = columnHeaders[j] || `Col${j + 1}`; - const value = row[j] || ''; - if (value) { - parts.push(`[${header}] ${value}`); - } - } - if (parts.length > 0) { - formattedRows.push(parts.join(' | ')); - } - } else { - const nonEmptyCells = row.filter((c) => c); - if (nonEmptyCells.length > 0) { - formattedRows.push(nonEmptyCells.join(' | ')); - } - } - } - - if (formattedRows.length > 0) { - sheets.push(`=== Sheet: ${sheetName} ===\n${formattedRows.join('\n')}`); - } - } - - result = sheets.join('\n\n'); - - // If custom parser returned empty/minimal content, try standard library - if (result.length < 100) { - logger.info( - 'Custom parser returned minimal content, trying standard library', - ); - result = await extractFromExcelStandard(fileBuffer, logger); - } - } catch (error) { - // Fallback to exceljs library if custom parser fails - logger.warn('Custom Excel parser failed, using standard library', { - error: error instanceof Error ? error.message : 'Unknown error', - }); - result = await extractFromExcelStandard(fileBuffer, logger); - } + const result = await extractExcelRawContent(fileBuffer, logger); const extractionTime = ((Date.now() - excelStartTime) / 1000).toFixed(2); logger.info('Excel file processed', { @@ -935,10 +699,8 @@ async function extractFromPdf( fileData: string, logger: ContentExtractionLogger, ): Promise { - const fileSizeMB = ( - Buffer.from(fileData, 'base64').length / - (1024 * 1024) - ).toFixed(2); + const fileBuffer = Buffer.from(fileData, 'base64'); + const fileSizeMB = (fileBuffer.length / (1024 * 1024)).toFixed(2); logger.info('Extracting content from PDF using Claude', { fileSizeMB, @@ -947,26 +709,22 @@ async function extractFromPdf( const startTime = Date.now(); try { - const { text } = await generateText({ - model: anthropic('claude-sonnet-4-6'), - messages: [ - { - role: 'user', - content: [ - { type: 'text', text: VISION_EXTRACTION_PROMPT }, - { - type: 'file', - data: fileData, - mediaType: 'application/pdf', - }, - ], - }, - ], - }); + const pdf = await PDFDocument.load(fileBuffer); + const pageCount = pdf.getPageCount(); + const shouldSplit = pageCount > 10 || fileBuffer.length > 10 * 1024 * 1024; + const text = shouldSplit + ? await extractPdfByPage({ pdf, pageCount, logger }) + : await extractPdfText({ + fileData, + logger, + label: 'PDF document', + }); const extractionTime = ((Date.now() - startTime) / 1000).toFixed(2); logger.info('Content extracted from PDF', { extractedLength: text.length, + pageCount, + splitByPage: shouldSplit, extractionTimeSeconds: extractionTime, }); @@ -984,37 +742,69 @@ async function extractFromPdf( } } -/** - * Extract questions directly from a PDF using Claude's native multi-page support - */ -async function parseQuestionsFromPdf( - fileData: string, - logger: ContentExtractionLogger, -): Promise<{ question: string; answer: string | null }[]> { - const startTime = Date.now(); +async function extractPdfByPage(params: { + pdf: PDFDocument; + pageCount: number; + logger: ContentExtractionLogger; +}): Promise { + const pageIndexes = Array.from( + { length: params.pageCount }, + (_, index) => index, + ); + const pageTexts = await mapWithConcurrency( + pageIndexes, + 1, + async (pageIndex) => { + const pagePdf = await PDFDocument.create(); + const [page] = await pagePdf.copyPages(params.pdf, [pageIndex]); + pagePdf.addPage(page); + const bytes = await pagePdf.save(); + const text = await extractPdfText({ + fileData: Buffer.from(bytes).toString('base64'), + logger: params.logger, + label: `PDF page ${pageIndex + 1}`, + }); + return `--- PDF Page ${pageIndex + 1} ---\n${text}`; + }, + ); + + return pageTexts.join('\n\n'); +} + +async function extractPdfText(params: { + fileData: string; + logger: ContentExtractionLogger; + label: string; +}): Promise { + try { + return await extractPdfWithClaude(params); + } catch (error) { + params.logger.warn('Claude PDF extraction failed, trying OpenAI fallback', { + label: params.label, + error: error instanceof Error ? error.message : 'Unknown error', + }); + return extractPdfWithOpenAI(params); + } +} - const { object } = await generateObject({ +async function extractPdfWithClaude(params: { + fileData: string; + logger: ContentExtractionLogger; + label: string; +}): Promise { + params.logger.info('Extracting PDF text with Claude', { + label: params.label, + }); + const { text } = await generateText({ model: anthropic('claude-sonnet-4-6'), - schema: questionExtractionSchema, messages: [ { role: 'user', content: [ - { - type: 'text', - text: `Extract all questions/fields and their answers from this questionnaire or form document. - -Include: -- Traditional questions ending with "?" -- Form fields like "1.1 Vendor Name", "Contact Email" that request input -- Numbered items (1.1, 1.2, 2.1) followed by field labels -- Items marked with "*" or selection notes like "(Single selection allowed)" - -Match each to its response if provided. Set answer to null if empty.`, - }, + { type: 'text', text: VISION_EXTRACTION_PROMPT }, { type: 'file', - data: fileData, + data: params.fileData, mediaType: 'application/pdf', }, ], @@ -1022,21 +812,35 @@ Match each to its response if provided. Set answer to null if empty.`, ], }); - const result = object as { - questions: { question: string; answer: string | null }[]; - }; + return text; +} - logger.info('PDF question parsing complete', { - questionCount: result.questions?.length || 0, - durationMs: Date.now() - startTime, +async function extractPdfWithOpenAI(params: { + fileData: string; + logger: ContentExtractionLogger; + label: string; +}): Promise { + params.logger.info('Extracting PDF text with OpenAI fallback', { + label: params.label, + }); + const { text } = await generateText({ + model: openai(PARSING_MODEL), + messages: [ + { + role: 'user', + content: [ + { type: 'text', text: VISION_EXTRACTION_PROMPT }, + { + type: 'file', + data: params.fileData, + mediaType: 'application/pdf', + }, + ], + }, + ], }); - return (result.questions || []) - .map((q) => ({ - question: q.question?.trim() || '', - answer: q.answer?.trim() || null, - })) - .filter((q) => q.question); + return text; } async function extractFromVision( diff --git a/apps/api/src/questionnaire/utils/question-parser.spec.ts b/apps/api/src/questionnaire/utils/question-parser.spec.ts index c930f4b46d..2bf3561042 100644 --- a/apps/api/src/questionnaire/utils/question-parser.spec.ts +++ b/apps/api/src/questionnaire/utils/question-parser.spec.ts @@ -8,12 +8,14 @@ import { looksLikeQuestionLine, buildQuestionAwareChunks, estimateQuestionCount, + sanitizeParsedAnswer, + parseChunkQuestionsAndAnswers, + parseQuestionsAndAnswers, } from './question-parser'; +import { generateObject } from 'ai'; const CHUNK_OPTS = { maxChunkChars: 80_000, - minChunkChars: 5_000, - maxQuestionsPerChunk: 1, }; describe('looksLikeQuestionLine', () => { @@ -44,9 +46,9 @@ describe('looksLikeQuestionLine', () => { expect( looksLikeQuestionLine('What security certifications do you hold'), ).toBe(true); - expect( - looksLikeQuestionLine('How do you handle data breaches'), - ).toBe(true); + expect(looksLikeQuestionLine('How do you handle data breaches')).toBe( + true, + ); expect(looksLikeQuestionLine('Is data encrypted in transit')).toBe(true); expect(looksLikeQuestionLine('Are backups tested regularly')).toBe(true); expect(looksLikeQuestionLine('Does the company have SOC 2')).toBe(true); @@ -154,18 +156,14 @@ describe('looksLikeQuestionLine', () => { }); it('does NOT match ambiguous "We X" lines (could be answers)', () => { - expect( - looksLikeQuestionLine('We retain data for 90 days.'), - ).toBe(false); + expect(looksLikeQuestionLine('We retain data for 90 days.')).toBe(false); expect( looksLikeQuestionLine('We follow our IRP documented in SOC 2.'), ).toBe(false); }); it('does NOT false-positive on section headers and metadata', () => { - expect( - looksLikeQuestionLine('Information Security Program'), - ).toBe(false); + expect(looksLikeQuestionLine('Information Security Program')).toBe(false); expect(looksLikeQuestionLine('General Information')).toBe(false); expect(looksLikeQuestionLine('Section 2: Data Protection')).toBe(false); expect(looksLikeQuestionLine('Acme Corp')).toBe(false); @@ -191,7 +189,7 @@ describe('buildQuestionAwareChunks', () => { expect(buildQuestionAwareChunks(' ', CHUNK_OPTS)).toEqual([]); }); - it('chunks interrogative questions one per chunk', () => { + it('chunks content by size instead of question-mark heuristics', () => { const content = [ 'What is your data retention policy?', 'We retain data for 90 days.', @@ -201,14 +199,12 @@ describe('buildQuestionAwareChunks', () => { 'Yes, AES-256.', ].join('\n'); - const chunks = buildQuestionAwareChunks(content, CHUNK_OPTS); - expect(chunks.length).toBe(3); + const chunks = buildQuestionAwareChunks(content, { maxChunkChars: 80 }); + expect(chunks.length).toBeGreaterThan(1); expect(chunks[0].content).toContain('data retention policy'); - expect(chunks[1].content).toContain('security incidents'); - expect(chunks[2].content).toContain('encrypt data at rest'); }); - it('chunks compliance statements one per chunk', () => { + it('keeps compliance statements available for classifier review', () => { const content = [ 'The organization must determine roles and responsibilities for PII processing.', 'The organization has entered into a contract with the PII data controller.', @@ -216,10 +212,10 @@ describe('buildQuestionAwareChunks', () => { ].join('\n'); const chunks = buildQuestionAwareChunks(content, CHUNK_OPTS); - expect(chunks.length).toBe(3); + expect(chunks.length).toBe(1); expect(chunks[0].content).toContain('roles and responsibilities'); - expect(chunks[1].content).toContain('entered into a contract'); - expect(chunks[2].content).toContain('risk analysis'); + expect(chunks[0].content).toContain('entered into a contract'); + expect(chunks[0].content).toContain('risk analysis'); }); it('handles mixed interrogative + compliance content', () => { @@ -231,10 +227,10 @@ describe('buildQuestionAwareChunks', () => { ].join('\n'); const chunks = buildQuestionAwareChunks(content, CHUNK_OPTS); - expect(chunks.length).toBe(4); + expect(chunks.length).toBe(1); }); - it('keeps non-question context lines with the preceding question', () => { + it('preserves non-question context lines for classifier context', () => { const content = [ 'What is your encryption standard?', 'Please provide details about key management.', @@ -243,10 +239,10 @@ describe('buildQuestionAwareChunks', () => { ].join('\n'); const chunks = buildQuestionAwareChunks(content, CHUNK_OPTS); - expect(chunks.length).toBe(2); + expect(chunks.length).toBe(1); expect(chunks[0].content).toContain('key management'); expect(chunks[0].content).toContain('rotation schedule'); - expect(chunks[1].content).toContain('key rotation'); + expect(chunks[0].content).toContain('key rotation'); }); it('falls back to single chunk when no patterns match', () => { @@ -263,11 +259,102 @@ describe('buildQuestionAwareChunks', () => { }); }); +describe('parseChunkQuestionsAndAnswers', () => { + const mockGenerateObject = generateObject as jest.Mock; + + beforeEach(() => { + mockGenerateObject.mockReset(); + }); + + it('returns only answerable items with null answers', async () => { + mockGenerateObject.mockResolvedValue({ + object: { + items: [ + { + text: 'Décrire le processus de gestion des incidents', + classification: 'answerable_item', + confidence: 0.96, + }, + { + text: 'Gestion des actifs', + classification: 'section_header', + confidence: 0.92, + }, + { + text: '(Oui : 0, Non : 3)', + classification: 'scoring', + confidence: 1, + }, + ], + }, + }); + + await expect(parseChunkQuestionsAndAnswers('chunk', 0, 1)).resolves.toEqual( + [ + { + question: 'Décrire le processus de gestion des incidents', + answer: null, + }, + ], + ); + }); +}); + +describe('parseQuestionsAndAnswers', () => { + const mockGenerateObject = generateObject as jest.Mock; + + beforeEach(() => { + mockGenerateObject.mockReset(); + }); + + it('deduplicates answerable items across chunks', async () => { + mockGenerateObject.mockResolvedValue({ + object: { + items: [ + { + text: 'Do you encrypt data at rest?', + classification: 'answerable_item', + }, + ], + }, + }); + + const content = [ + 'Do you encrypt data at rest?', + 'Do you encrypt data at rest?', + ].join('\n'); + + await expect(parseQuestionsAndAnswers(content)).resolves.toEqual([ + { question: 'Do you encrypt data at rest?', answer: null }, + ]); + }); + + it('classifies large content in multiple bounded chunks', async () => { + mockGenerateObject.mockResolvedValue({ + object: { + items: [ + { + text: 'Provide your incident response process', + classification: 'answerable_item', + }, + ], + }, + }); + + const content = Array.from( + { length: 900 }, + (_, index) => `Row ${index}: Provide your incident response process`, + ).join('\n'); + + await parseQuestionsAndAnswers(content); + + expect(mockGenerateObject.mock.calls.length).toBeGreaterThan(1); + }); +}); + describe('estimateQuestionCount', () => { it('counts question marks when present', () => { - expect( - estimateQuestionCount('Q1? Q2? Q3?'), - ).toBe(3); + expect(estimateQuestionCount('Q1? Q2? Q3?')).toBe(3); }); it('counts lines matching looksLikeQuestionLine when no question marks', () => { @@ -284,3 +371,24 @@ describe('estimateQuestionCount', () => { expect(estimateQuestionCount(text)).toBe(3); }); }); + +describe('sanitizeParsedAnswer', () => { + it('converts scoring/options values to null', () => { + expect(sanitizeParsedAnswer('(Oui : 0, Non : 3)')).toBeNull(); + expect(sanitizeParsedAnswer('(Oui : 0, Non : 80, N/A : 0)')).toBeNull(); + expect(sanitizeParsedAnswer('(Yes : 0, No : 1)')).toBeNull(); + }); + + it('converts empty placeholders to null', () => { + expect(sanitizeParsedAnswer('A remplir')).toBeNull(); + expect(sanitizeParsedAnswer('A compléter')).toBeNull(); + expect(sanitizeParsedAnswer('1# - A remplir')).toBeNull(); + expect(sanitizeParsedAnswer('To be completed')).toBeNull(); + }); + + it('preserves real answers', () => { + expect(sanitizeParsedAnswer('Yes, reviewed quarterly')).toBe( + 'Yes, reviewed quarterly', + ); + }); +}); diff --git a/apps/api/src/questionnaire/utils/question-parser.ts b/apps/api/src/questionnaire/utils/question-parser.ts index 4b0ee35f8b..e725db3e6d 100644 --- a/apps/api/src/questionnaire/utils/question-parser.ts +++ b/apps/api/src/questionnaire/utils/question-parser.ts @@ -2,8 +2,7 @@ import { openai } from '@ai-sdk/openai'; import { generateObject, jsonSchema } from 'ai'; import { MAX_CHUNK_SIZE_CHARS, - MIN_CHUNK_SIZE_CHARS, - MAX_QUESTIONS_PER_CHUNK, + MAX_CLASSIFICATION_CONCURRENCY, PARSING_MODEL, QUESTION_PARSING_SYSTEM_PROMPT, } from './constants'; @@ -18,6 +17,21 @@ export interface ChunkInfo { questionCount: number; } +export type QuestionnaireItemClassification = + | 'answerable_item' + | 'metadata' + | 'section_header' + | 'instruction' + | 'guidance' + | 'example' + | 'scoring' + | 'noise'; + +interface ClassifiedQuestionnaireItem { + text: string; + classification: QuestionnaireItemClassification; +} + export interface QuestionParserLogger { info: (message: string, meta?: Record) => void; warn: (message: string, meta?: Record) => void; @@ -31,6 +45,38 @@ const defaultLogger: QuestionParserLogger = { error: () => {}, }; +export function sanitizeParsedAnswer( + answer: string | null | undefined, +): string | null { + const trimmed = answer?.trim(); + if (!trimmed) { + return null; + } + + const normalized = trimmed + .normalize('NFKD') + .replace(/[\u0300-\u036f]/g, '') + .toLowerCase() + .replace(/\s+/g, ' ') + .trim(); + + const isPlaceholder = + /^(?:\d+#\s*-\s*)?a\s+(?:remplir|completer)$/.test(normalized) || + normalized === 'to be completed' || + normalized === 'to fill'; + + const isScoringOptions = + /^\((?:oui|yes|non|no|n\/a|na)\s*:\s*-?\d+(?:\s*,\s*(?:oui|yes|non|no|n\/a|na)\s*:\s*-?\d+)*\)$/.test( + normalized, + ); + + if (isPlaceholder || isScoringOptions) { + return null; + } + + return trimmed; +} + /** * Parses questions and answers from extracted content using LLM * Handles large content by chunking and processing in parallel @@ -41,8 +87,6 @@ export async function parseQuestionsAndAnswers( ): Promise { const chunkInfos = buildQuestionAwareChunks(content, { maxChunkChars: MAX_CHUNK_SIZE_CHARS, - minChunkChars: MIN_CHUNK_SIZE_CHARS, - maxQuestionsPerChunk: MAX_QUESTIONS_PER_CHUNK, }); if (chunkInfos.length === 0) { @@ -50,30 +94,19 @@ export async function parseQuestionsAndAnswers( return []; } - if (chunkInfos.length === 1) { - logger.info('Processing content as a single chunk', { - contentLength: chunkInfos[0].content.length, - estimatedQuestions: chunkInfos[0].questionCount, - }); - return parseChunkQuestionsAndAnswers(chunkInfos[0].content, 0, 1); - } - - logger.info( - 'Chunking content by individual questions for parallel processing', - { - contentLength: content.length, - totalChunks: chunkInfos.length, - questionsPerChunk: 1, - }, - ); + logger.info('Classifying questionnaire content in chunks', { + contentLength: content.length, + totalChunks: chunkInfos.length, + concurrency: MAX_CLASSIFICATION_CONCURRENCY, + }); - // Process all chunks in parallel for maximum speed const parseStartTime = Date.now(); - const allPromises = chunkInfos.map((chunk, index) => - parseChunkQuestionsAndAnswers(chunk.content, index, chunkInfos.length), + const allResults = await mapWithConcurrency( + chunkInfos.map((chunk, index) => ({ chunk, index })), + MAX_CLASSIFICATION_CONCURRENCY, + ({ chunk, index }) => + parseChunkQuestionsAndAnswers(chunk.content, index, chunkInfos.length), ); - - const allResults = await Promise.all(allPromises); const parseTime = ((Date.now() - parseStartTime) / 1000).toFixed(2); const totalRawQuestions = allResults.reduce( @@ -81,7 +114,7 @@ export async function parseQuestionsAndAnswers( 0, ); - logger.info('All chunks processed in parallel', { + logger.info('All chunks classified', { totalChunks: chunkInfos.length, parseTimeSeconds: parseTime, totalQuestions: totalRawQuestions, @@ -92,7 +125,7 @@ export async function parseQuestionsAndAnswers( for (const qaArray of allResults) { for (const qa of qaArray) { - const normalizedQuestion = qa.question.toLowerCase().trim(); + const normalizedQuestion = normalizeQuestionForDedupe(qa.question); if (!seenQuestions.has(normalizedQuestion)) { seenQuestions.set(normalizedQuestion, qa); } @@ -123,57 +156,68 @@ export async function parseChunkQuestionsAndAnswers( schema: jsonSchema({ type: 'object', properties: { - questionsAndAnswers: { + items: { type: 'array', items: { type: 'object', properties: { - question: { type: 'string', description: 'The question text' }, - answer: { - anyOf: [{ type: 'string' }, { type: 'null' }], + text: { + type: 'string', description: - 'The answer to the question. Use null if no answer is provided.', + 'The exact text of the content block or row being classified.', + }, + classification: { + type: 'string', + enum: [ + 'answerable_item', + 'metadata', + 'section_header', + 'instruction', + 'guidance', + 'example', + 'scoring', + 'noise', + ], + description: + 'Whether this text is an answerable questionnaire item or non-answerable content.', }, }, - required: ['question', 'answer'], + required: ['text', 'classification'], additionalProperties: false, }, }, }, - required: ['questionsAndAnswers'], + required: ['items'], additionalProperties: false, }), system: QUESTION_PARSING_SYSTEM_PROMPT, prompt: buildParsingPrompt(chunk, chunkIndex, totalChunks), }); - const parsed = (object as { questionsAndAnswers?: QuestionAnswer[] }) - ?.questionsAndAnswers; + const parsed = (object as { items?: ClassifiedQuestionnaireItem[] })?.items; - // Handle case where LLM returns unexpected response if (!parsed || !Array.isArray(parsed)) { return []; } - // Post-process to ensure empty strings are converted to null return parsed .filter( - (qa) => qa && typeof qa.question === 'string' && qa.question.trim(), + (item) => + item && + item.classification === 'answerable_item' && + typeof item.text === 'string' && + item.text.trim(), ) - .map((qa) => ({ - question: qa.question.trim(), - answer: - qa.answer && typeof qa.answer === 'string' && qa.answer.trim() !== '' - ? qa.answer.trim() - : null, + .map((item) => ({ + question: item.text.trim(), + answer: null, })); } catch (error) { - // Log error but don't fail the entire parsing console.error( `Error parsing chunk ${chunkIndex + 1}/${totalChunks}:`, error, ); - return []; + throw error; } } @@ -183,13 +227,14 @@ function buildParsingPrompt( totalChunks: number, ): string { const instructions = `Instructions: -- Extract every item the respondent must address from this questionnaire data, paired with its answer. -- Items include: interrogative questions (ending in "?" or starting with What/How/Do/Is/Are/etc.), form fields ("1.1 Vendor Name"), AND compliance/requirement statements ("The organization must X", "The organization has X", "We have X"). All are valid items. -- IMPORTANT: Extract the FULL item text, not just IDs like "SQ14.3". Find the column with the actual sentences. -- Match each item to its corresponding Response/Answer/Comment from the same row. -- If the Response/Answer is empty or missing, set answer to null. -- Skip pure section headers (e.g., "Information Security Program") and metadata (Company Name, Date). -- If the document is a single-column checklist of statements, treat each row as one item with answer = null.`; +- Classify the provided questionnaire rows/blocks. +- Return only blocks that the respondent is expected to answer as classification = "answerable_item". +- An answerable item can be a question, request, field label, or compliance statement. It does not need a question mark and can be in any language. +- Extract the FULL item text, not just IDs like "SQ14.3". +- Classify pure section headers, metadata, instructions, scoring/options, examples, guidance, remediation plans, and placeholders as non-answerable. +- Never classify values like "(Oui : 0, Non : 3)", "A remplir", "A compléter", or "To be completed" as answerable. +- Do not return existing answers from the source file. This upload flow generates answers later, so persisted answers must be null. +- Prefer high recall for real answerable items, but do not include obvious metadata or instructions just to avoid returning zero items.`; if (totalChunks > 1) { return `${instructions} @@ -210,11 +255,7 @@ ${chunk}`; */ export function buildQuestionAwareChunks( content: string, - options: { - maxChunkChars: number; - minChunkChars: number; - maxQuestionsPerChunk: number; - }, + options: { maxChunkChars: number }, ): ChunkInfo[] { const trimmedContent = content.trim(); if (!trimmedContent) { @@ -224,7 +265,7 @@ export function buildQuestionAwareChunks( const chunks: ChunkInfo[] = []; const lines = trimmedContent.split(/\r?\n/); let currentChunk: string[] = []; - let currentQuestionFound = false; + let currentSize = 0; const pushChunk = () => { const chunkText = currentChunk.join('\n').trim(); @@ -233,30 +274,24 @@ export function buildQuestionAwareChunks( } chunks.push({ content: chunkText, - questionCount: 1, + questionCount: estimateQuestionCount(chunkText), }); currentChunk = []; - currentQuestionFound = false; + currentSize = 0; }; for (const line of lines) { - const trimmedLine = line.trim(); - const isEmpty = trimmedLine.length === 0; - const looksLikeQuestion = !isEmpty && looksLikeQuestionLine(trimmedLine); - - // If we find a new question and we already have a question in the current chunk, start a new chunk - if (looksLikeQuestion && currentQuestionFound && currentChunk.length > 0) { + const lineSize = line.length + 1; + if ( + currentChunk.length > 0 && + currentSize + lineSize > options.maxChunkChars + ) { pushChunk(); } - // Add line to current chunk (including empty lines for context) - if (!isEmpty || currentChunk.length > 0) { + if (line.trim() || currentChunk.length > 0) { currentChunk.push(line); - } - - // Mark that we've found a question in this chunk - if (looksLikeQuestion) { - currentQuestionFound = true; + currentSize += lineSize; } } @@ -265,15 +300,41 @@ export function buildQuestionAwareChunks( pushChunk(); } - // If no questions were detected, return the entire content as a single chunk - return chunks.length > 0 - ? chunks - : [ - { - content: trimmedContent, - questionCount: estimateQuestionCount(trimmedContent), - }, - ]; + return chunks; +} + +async function mapWithConcurrency( + items: T[], + concurrency: number, + worker: (item: T) => Promise, +): Promise { + const results: R[] = []; + let nextIndex = 0; + + const runWorker = async (): Promise => { + while (nextIndex < items.length) { + const currentIndex = nextIndex; + nextIndex += 1; + results[currentIndex] = await worker(items[currentIndex]); + } + }; + + await Promise.all( + Array.from({ length: Math.min(concurrency, items.length) }, () => + runWorker(), + ), + ); + + return results; +} + +function normalizeQuestionForDedupe(question: string): string { + return question + .toLowerCase() + .replace(/^\[[^\]]+\]\s*/, '') + .replace(/^\d+\.\d+\s*/, '') + .replace(/\s+/g, ' ') + .trim(); } /** @@ -287,7 +348,7 @@ export function looksLikeQuestionLine(line: string): boolean { const questionLabel = /question\s*:/i; // Line starts with optional number prefix, then explicit question/q label - const explicitQuestionPrefix = /^(?:\d+\s*[\).\]]\s*)?(?:question|q)\b/i; + const explicitQuestionPrefix = /^(?:\d+\s*[.)\]]\s*)?(?:question|q)\b/i; // Interrogative words at the START of line const interrogativePrefix = @@ -296,7 +357,7 @@ export function looksLikeQuestionLine(line: string): boolean { // Numbered questions: "06. Do you have...", "1) What is...", "Q1: How do..." // This handles questions where a number/prefix comes before the interrogative const numberedQuestionWithInterrogative = - /^(?:\d+\s*[\).\]:]\s*|[qQ]\d*\s*[\).\]:]\s*)(?:what|why|how|when|where|is|are|does|do|can|will|should|have|list|describe|explain|if)\b/i; + /^(?:\d+\s*[.):\]]\s*|[qQ]\d*\s*[.):\]]\s*)(?:what|why|how|when|where|is|are|does|do|can|will|should|have|list|describe|explain|if)\b/i; // Form-style numbered fields: "1.1 Vendor Name", "2.3 Contact Email", "1.4 Company Address" // Pattern: number.number followed by a word (the field label) diff --git a/apps/api/src/trigger/questionnaire/parse-questionnaire.ts b/apps/api/src/trigger/questionnaire/parse-questionnaire.ts index 399730255a..3c251cc8f4 100644 --- a/apps/api/src/trigger/questionnaire/parse-questionnaire.ts +++ b/apps/api/src/trigger/questionnaire/parse-questionnaire.ts @@ -1,7 +1,7 @@ import { extractS3KeyFromUrl } from '@/app/s3'; import { GetObjectCommand, S3Client } from '@aws-sdk/client-s3'; import { db } from '@db'; -import { logger, tags, task } from '@trigger.dev/sdk'; +import { logger, metadata, tags, task } from '@trigger.dev/sdk'; // Import shared utilities import { @@ -264,6 +264,7 @@ export const parseQuestionnaireTask = task({ let extractedContent: string; // Extract content based on input type + metadata.set('status', 'extracting').set('progress', 10); switch (payload.inputType) { case 'file': { if (!payload.fileData || !payload.fileType) { @@ -319,6 +320,10 @@ export const parseQuestionnaireTask = task({ inputType: payload.inputType, contentLength: extractedContent.length, }); + metadata + .set('status', 'classifying_answerable_items') + .set('progress', 45) + .set('extractedContentLength', extractedContent.length); // Parse questions and answers from extracted content const parseStartTime = Date.now(); @@ -335,6 +340,10 @@ export const parseQuestionnaireTask = task({ parseTimeSeconds: parseTime, totalTimeSeconds: totalTime, }); + metadata + .set('status', 'saving_questionnaire') + .set('progress', 80) + .set('questionCount', questionsAndAnswers.length); // Create questionnaire record in database let questionnaireId: string; @@ -367,9 +376,9 @@ export const parseQuestionnaireTask = task({ create: questionsAndAnswers.map( (qa: QuestionAnswer, index: number) => ({ question: qa.question, - answer: qa.answer || null, + answer: null, questionIndex: index, - status: qa.answer ? 'generated' : 'untouched', + status: 'untouched', }), ), }, @@ -382,6 +391,10 @@ export const parseQuestionnaireTask = task({ questionnaireId, questionCount: questionsAndAnswers.length, }); + metadata + .set('status', 'completed') + .set('progress', 100) + .set('questionnaireId', questionnaireId); } catch (error) { logger.error('Failed to create questionnaire record', { error: error instanceof Error ? error.message : 'Unknown error', diff --git a/apps/app/src/app/(app)/[orgId]/questionnaire/components/QuestionnaireUpload.tsx b/apps/app/src/app/(app)/[orgId]/questionnaire/components/QuestionnaireUpload.tsx index 6b9d3115bb..fceff1cd0d 100644 --- a/apps/app/src/app/(app)/[orgId]/questionnaire/components/QuestionnaireUpload.tsx +++ b/apps/app/src/app/(app)/[orgId]/questionnaire/components/QuestionnaireUpload.tsx @@ -1,13 +1,7 @@ 'use client'; -import { Button } from '@trycompai/ui/button'; -import { cn } from '@trycompai/ui/cn'; -import { - FileText, - Loader2, - Upload, - X, -} from 'lucide-react'; +import { Button, cn } from '@trycompai/design-system'; +import { Close, Document, Upload } from '@trycompai/design-system/icons'; import type { FileRejection } from 'react-dropzone'; import Dropzone from 'react-dropzone'; @@ -36,7 +30,7 @@ export function QuestionnaireUpload({ {selectedFile ? (
- +

{selectedFile.name}

@@ -45,15 +39,15 @@ export function QuestionnaireUpload({

{!isLoading && ( - +
+
)}
) : ( @@ -65,7 +59,6 @@ export function QuestionnaireUpload({ disabled={isLoading} accept={{ 'application/pdf': ['.pdf'], - 'application/vnd.ms-excel': ['.xls'], 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], 'text/csv': ['.csv'], }} @@ -83,13 +76,13 @@ export function QuestionnaireUpload({ >
- +

{isDragActive ? 'Drop file here' : 'Drag & drop or click to select'}

- PDF, Excel, CSV (max 100MB) + PDF, XLSX, CSV (max 100MB)

@@ -99,36 +92,29 @@ export function QuestionnaireUpload({ )}
- +
+ +
);