Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions apps/api/src/questionnaire/questionnaire.controller.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -321,13 +321,16 @@ describe('QuestionnaireController', () => {
source: 'internal',
};
mockService.uploadAndParse.mockResolvedValue({
questionnaireId: 'q1',
totalQuestions: 10,
runId: 'run_123',
publicAccessToken: 'token_123',
});

const result = await controller.uploadAndParse(dto as any, 'org_1');

expect(result).toEqual({ questionnaireId: 'q1', totalQuestions: 10 });
expect(result).toEqual({
runId: 'run_123',
publicAccessToken: 'token_123',
});
});

it('should override body organizationId with auth-derived org', async () => {
Expand All @@ -339,8 +342,8 @@ describe('QuestionnaireController', () => {
source: 'internal',
};
mockService.uploadAndParse.mockResolvedValue({
questionnaireId: 'q1',
totalQuestions: 10,
runId: 'run_123',
publicAccessToken: 'token_123',
});

await controller.uploadAndParse(dto as any, 'org_1');
Expand Down
50 changes: 27 additions & 23 deletions apps/api/src/questionnaire/questionnaire.service.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,14 @@ jest.mock('@/trigger/questionnaire/answer-question-helpers', () => ({
generateAnswerWithRAGBatch: jest.fn(),
}));

// Stub the Trigger.dev SDK for this spec: the module factory exposes only
// `tasks.trigger`, as a bare jest mock function that individual tests can
// configure and assert on.
jest.mock('@trigger.dev/sdk', () => {
  const trigger = jest.fn();
  return { tasks: { trigger } };
});

jest.mock('./utils/content-extractor', () => ({
extractContentFromFile: jest.fn(),
extractQuestionsWithAI: jest.fn(),
}));

jest.mock('./utils/question-parser', () => ({
Expand All @@ -59,13 +64,13 @@ jest.mock('./utils/questionnaire-storage', () => ({
import { db } from '@db';
import { syncManualAnswerToVector } from '@/vector-store/lib';
import { answerQuestion } from '@/trigger/questionnaire/answer-question';
import { tasks } from '@trigger.dev/sdk';
import {
updateAnsweredCount,
persistQuestionnaireResult,
uploadQuestionnaireFile,
saveGeneratedAnswer,
} from './utils/questionnaire-storage';
import { extractQuestionsWithAI } from './utils/content-extractor';
import { generateExportFile } from './utils/export-generator';

const mockDb = db as jest.Mocked<typeof db>;
Expand Down Expand Up @@ -218,7 +223,7 @@ describe('QuestionnaireService', () => {
(mockDb.questionnaire.findUnique as jest.Mock).mockResolvedValue(null);

await expect(service.deleteById('missing', 'org_1')).rejects.toThrow(
'Questionnaire not found',
'Questionnaire with ID missing not found',
);
expect(mockDb.questionnaire.delete).not.toHaveBeenCalled();
});
Expand Down Expand Up @@ -454,16 +459,15 @@ describe('QuestionnaireService', () => {
});

describe('uploadAndParse', () => {
it('should upload file, parse questions, and persist', async () => {
it('should upload file and trigger async parsing', async () => {
(uploadQuestionnaireFile as jest.Mock).mockResolvedValue({
s3Key: 'key',
fileSize: 1024,
});
(extractQuestionsWithAI as jest.Mock).mockResolvedValue([
{ question: 'Q1?', answer: null },
{ question: 'Q2?', answer: null },
]);
(persistQuestionnaireResult as jest.Mock).mockResolvedValue('q1');
(tasks.trigger as jest.Mock).mockResolvedValue({
id: 'run_123',
publicAccessToken: 'token_123',
});

const result = await service.uploadAndParse({
organizationId: 'org_1',
Expand All @@ -473,23 +477,23 @@ describe('QuestionnaireService', () => {
source: 'internal',
} as any);

expect(result).toEqual({ questionnaireId: 'q1', totalQuestions: 2 });
expect(result).toEqual({
runId: 'run_123',
publicAccessToken: 'token_123',
});
expect(uploadQuestionnaireFile).toHaveBeenCalled();
expect(extractQuestionsWithAI).toHaveBeenCalledWith(
'base64data',
'application/pdf',
expect.any(Object),
);
expect(persistQuestionnaireResult).toHaveBeenCalled();
});

it('should throw when persist returns null', async () => {
(uploadQuestionnaireFile as jest.Mock).mockResolvedValue({
expect(tasks.trigger).toHaveBeenCalledWith('parse-questionnaire', {
inputType: 's3',
organizationId: 'org_1',
s3Key: 'key',
fileName: 'test.pdf',
fileType: 'application/pdf',
fileSize: 1024,
});
(extractQuestionsWithAI as jest.Mock).mockResolvedValue([]);
(persistQuestionnaireResult as jest.Mock).mockResolvedValue(null);
});

it('should throw when upload fails', async () => {
(uploadQuestionnaireFile as jest.Mock).mockResolvedValue(null);

await expect(
service.uploadAndParse({
Expand All @@ -498,7 +502,7 @@ describe('QuestionnaireService', () => {
fileType: 'application/pdf',
fileData: 'base64data',
} as any),
).rejects.toThrow('Failed to save questionnaire');
).rejects.toThrow('Failed to upload questionnaire file to S3');
});
});

Expand Down
45 changes: 25 additions & 20 deletions apps/api/src/questionnaire/questionnaire.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@ import { generateAnswerWithRAGBatch } from '@/trigger/questionnaire/answer-quest
import { tasks } from '@trigger.dev/sdk';
import type { parseQuestionnaireTask } from '@/trigger/questionnaire/parse-questionnaire';
import { ParseQuestionnaireDto } from './dto/parse-questionnaire.dto';
import {
ExportQuestionnaireDto,
type QuestionnaireExportFormat,
} from './dto/export-questionnaire.dto';
import { ExportQuestionnaireDto } from './dto/export-questionnaire.dto';
import { AnswerSingleQuestionDto } from './dto/answer-single-question.dto';
import { SaveAnswerDto } from './dto/save-answer.dto';
import { DeleteAnswerDto } from './dto/delete-answer.dto';
Expand All @@ -24,13 +21,9 @@ import AdmZip from 'adm-zip';
// Import shared utilities
import {
extractContentFromFile,
extractQuestionsWithAI,
type ContentExtractionLogger,
} from './utils/content-extractor';
import {
parseQuestionsAndAnswers,
type QuestionAnswer as ParsedQA,
} from './utils/question-parser';
import { parseQuestionsAndAnswers } from './utils/question-parser';
import {
generateExportFile,
type ExportFormat,
Expand Down Expand Up @@ -85,12 +78,15 @@ export class QuestionnaireService {
async parseQuestionnaire(
dto: ParseQuestionnaireDto,
): Promise<ParsedQuestionnaireResult> {
// Use faster AI-powered extraction (combines extraction + parsing in one step)
const questionsAndAnswers = await extractQuestionsWithAI(
const extractedContent = await extractContentFromFile(
dto.fileData,
dto.fileType,
this.contentLogger,
);
const questionsAndAnswers = await parseQuestionsAndAnswers(
extractedContent,
this.contentLogger,
);

return {
vendorName: dto.vendorName,
Expand Down Expand Up @@ -122,24 +118,33 @@ export class QuestionnaireService {
);
}

console.log(Date.now(), 'Parsing questionnaire');
// Use faster AI-powered extraction (combines extraction + parsing in one step)
const questionsAndAnswers = await extractQuestionsWithAI(
this.logger.log('Parsing questionnaire for auto-answer export');
const extractedContent = await extractContentFromFile(
dto.fileData,
dto.fileType,
this.contentLogger,
);
console.log(Date.now(), 'Parsed questionnaire');
const questionsAndAnswers = await parseQuestionsAndAnswers(
extractedContent,
this.contentLogger,
);
this.logger.log('Parsed questionnaire for auto-answer export', {
questionCount: questionsAndAnswers.length,
});

console.log(Date.now(), 'Generating answers for questions');
this.logger.log('Generating answers for parsed questionnaire', {
questionCount: questionsAndAnswers.length,
});
const answered = await this.generateAnswersForQuestions(
questionsAndAnswers.map((qa) => ({
question: qa.question,
answer: qa.answer,
answer: null,
})),
dto.organizationId,
);
console.log(Date.now(), 'Generated answers for questions');
this.logger.log('Generated questionnaire answers', {
questionCount: answered.length,
});

const vendorName = dto.vendorName || dto.fileName || 'questionnaire';

Expand Down Expand Up @@ -186,7 +191,7 @@ export class QuestionnaireService {
// Single format export (default behavior)
const exportFile = await generateExportFile(
answered.map((a) => ({ question: a.question, answer: a.answer })),
dto.format as ExportFormat,
dto.format,
vendorName,
);

Expand Down Expand Up @@ -432,7 +437,7 @@ export class QuestionnaireService {

return await generateExportFile(
questionsAndAnswers,
dto.format as ExportFormat,
dto.format,
questionnaire.filename,
);
}
Expand Down
20 changes: 12 additions & 8 deletions apps/api/src/questionnaire/utils/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
* Shared constants for questionnaire module
*/

// Chunk sizes for question-aware parsing
export const MAX_CHUNK_SIZE_CHARS = 80_000;
// Chunk sizes for questionnaire item classification
export const MAX_CHUNK_SIZE_CHARS = 25_000;
export const MIN_CHUNK_SIZE_CHARS = 5_000;
export const MAX_QUESTIONS_PER_CHUNK = 1;
export const MAX_CLASSIFICATION_CONCURRENCY = 4;

// File size limits
export const MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024; // 100MB
Expand All @@ -30,10 +31,10 @@ CRITICAL RULES:
8. Always write in first person plural (we, our, us) as if speaking on behalf of the organization.
9. Keep answers to 1-3 sentences maximum unless the question explicitly requires more detail.`;

export const QUESTION_PARSING_SYSTEM_PROMPT = `You parse vendor questionnaires from Excel, PDF, or document text. Extract all items the respondent is expected to answer (questions, prompts, or compliance statements) and pair each with its answer if one exists.
export const QUESTION_PARSING_SYSTEM_PROMPT = `You parse vendor questionnaires from Excel, PDF, images, CSV, or document text. Your job is to classify content and return only answerable questionnaire items.

What counts as an item to extract:
1. Interrogative questions ending with "?" or starting with What/How/Why/When/Where/Is/Are/Do/Does/Can/Will/Should.
1. Interrogative questions in any language. A question mark is helpful but not required.
2. Form fields like "1.1 Vendor Name", "Contact Email", "Company Address" (numbered or labeled fields requesting information).
3. Compliance/requirement statements that the respondent must confirm or describe their compliance with — vendor questionnaires often consist entirely of these. Examples:
- "The organization must X"
Expand All @@ -53,10 +54,13 @@ Input format hints:
Rules:
1. Find the column containing the actual item text, not just IDs/numbers (e.g., skip "SQ14.3", keep the full sentence).
2. Extract the FULL text of each item.
3. Match each item to its Response/Answer text from the same row. If empty or missing, set answer = null.
4. Skip pure section headers ("Information Security Program", "General Information") UNLESS they are also items the respondent must answer.
5. Skip metadata rows (Company Name, Date, file headers).
6. NEVER return zero items if the document has any rows of substantive content — extract every row that looks like an item the respondent must address.`;
3. For upload-to-autofill parsing, always set saved answers to null. The user expects us to generate answers later.
4. Never use scoring/options values as answers, e.g. "(Oui : 0, Non : 3)" or "(Yes : 0, No : 1)".
5. Never use placeholders as answers, e.g. "A remplir", "A compléter", "To be completed".
6. Do not treat guidance, instructions, examples, mode opératoire, remediation plans, or calculated score/formula cells as answerable items.
7. Skip pure section headers ("Information Security Program", "General Information") UNLESS the text itself asks the respondent to provide information.
8. Skip metadata rows (Company Name, Date, file headers) unless the field is clearly part of the vendor questionnaire response surface.
9. Be high recall for answerable items, but do not include instructions, examples, scoring, or metadata just to avoid returning zero items.`;

// Vision extraction prompt for PDFs and images
export const VISION_EXTRACTION_PROMPT = `Transcribe this document into plain text. Output ONLY the document's text content — no summaries, no analysis, no commentary about what the document is or whether it contains questions.
Expand Down
53 changes: 53 additions & 0 deletions apps/api/src/questionnaire/utils/content-extractor.spec.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import { extractContentFromFile } from './content-extractor';
import ExcelJS from 'exceljs';
import { PDFDocument } from 'pdf-lib';
import { generateText } from 'ai';

// Mock AI dependencies
jest.mock('@ai-sdk/openai', () => {
  // The module factory exposes `openai` as a bare jest mock function.
  return { openai: jest.fn() };
});
Expand Down Expand Up @@ -50,6 +52,31 @@ describe('content-extractor: extractContentFromFile', () => {
expect(result).toContain('Rating?');
});

/**
 * Builds a one-row worksheet in which a single question (B10) sits next to a
 * formula cell (F10), a scoring legend (K10), a fill-in placeholder (M10) and
 * a long explanatory text (O10), then checks what the extractor emits:
 * the question must appear tagged as `[B10 Question]`, while the scoring
 * legend, the placeholder and any `[F10` entry must be absent.
 * The O10 text is present in the fixture but is not asserted on.
 */
it('should ignore scoring columns and placeholders in BPCE-style sheets', async () => {
  const book = new ExcelJS.Workbook();
  const sheet = book.addWorksheet('Risk assessment SSI');

  // The only cell on the row that holds an actual question.
  sheet.getCell('B10').value =
    'Le prestataire effectue-t-il des revues régulières des comptes à privilèges ?';
  // Formula cell with a cached result.
  sheet.getCell('F10').value = { formula: 'IF(E10="NON",3,0)', result: 0 };
  // Scoring legend and fill-in placeholder.
  sheet.getCell('K10').value = '(Oui : 0, Non : 3)';
  sheet.getCell('M10').value = 'A remplir';
  // Free-text explanation on the same row.
  sheet.getCell('O10').value =
    "La gestion des comptes à privilèges consiste à contrôler l'accès aux comptes.";

  // Serialize the workbook and hand it over as base64.
  const xlsxBytes = await book.xlsx.writeBuffer();
  const encoded = Buffer.from(xlsxBytes).toString('base64');
  const extracted = await extractContentFromFile(encoded, XLSX_MIME);

  expect(extracted).toContain('[B10 Question]');
  expect(extracted).toContain('revues régulières des comptes à privilèges');
  expect(extracted).not.toContain('(Oui : 0, Non : 3)');
  expect(extracted).not.toContain('A remplir');
  expect(extracted).not.toContain('[F10');
});

it('should extract content from multiple sheets', async () => {
const buffer = await createTestExcelBuffer([
{
Expand Down Expand Up @@ -85,6 +112,32 @@ describe('content-extractor: extractContentFromFile', () => {
expect(result).toContain('What is 2+2?,4');
});

/**
 * Feeds a one-page blank PDF to the extractor while `generateText` is primed
 * to reject once with "Overloaded" and then resolve with text. The extractor
 * is expected to return the text from the second call, after exactly two
 * `generateText` invocations.
 *
 * Per the test title the first call is the Claude attempt and the second the
 * OpenAI fallback; the mock itself does not distinguish providers.
 */
it('should fall back to OpenAI when Claude PDF extraction is overloaded', async () => {
  // Minimal valid PDF: a single empty page.
  const doc = await PDFDocument.create();
  doc.addPage();
  const pdfBase64 = Buffer.from(await doc.save()).toString('base64');

  // First call fails, second call succeeds.
  const generateTextMock = generateText as jest.Mock;
  generateTextMock.mockRejectedValueOnce(new Error('Overloaded'));
  generateTextMock.mockResolvedValueOnce({ text: 'Extracted PDF text' });

  const extracted = await extractContentFromFile(pdfBase64, 'application/pdf');

  expect(extracted).toBe('Extracted PDF text');
  // NOTE(review): this counts every call recorded on the mock, so it assumes
  // call history is cleared between tests — confirm against the suite setup.
  expect(generateTextMock).toHaveBeenCalledTimes(2);
});

/**
 * Passes arbitrary bytes with the `application/vnd.ms-excel` MIME type and
 * expects the extractor to reject with an error whose message contains
 * "Legacy Excel files".
 */
it('should reject legacy XLS files with a clear message', async () => {
  const legacyXls = Buffer.from('legacy-binary-xls').toString('base64');
  const extraction = extractContentFromFile(legacyXls, 'application/vnd.ms-excel');

  await expect(extraction).rejects.toThrow('Legacy Excel files');
});

it('should handle plain text files', async () => {
const text = 'Some compliance document content';
const base64 = Buffer.from(text).toString('base64');
Expand Down
Loading
Loading