trycompai · tofikwest · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/apps/api/src/questionnaire/questionnaire.controller.spec.ts b/apps/api/src/questionnaire/questionnaire.controller.spec.ts
@@ -321,13 +321,16 @@ describe('QuestionnaireController', () => {
         source: 'internal',
       };
       mockService.uploadAndParse.mockResolvedValue({
-        questionnaireId: 'q1',
-        totalQuestions: 10,
+        runId: 'run_123',
+        publicAccessToken: 'token_123',
       });
 
       const result = await controller.uploadAndParse(dto as any, 'org_1');
 
-      expect(result).toEqual({ questionnaireId: 'q1', totalQuestions: 10 });
+      expect(result).toEqual({
+        runId: 'run_123',
+        publicAccessToken: 'token_123',
+      });
     });
 
     it('should override body organizationId with auth-derived org', async () => {
@@ -339,8 +342,8 @@ describe('QuestionnaireController', () => {
         source: 'internal',
       };
       mockService.uploadAndParse.mockResolvedValue({
-        questionnaireId: 'q1',
-        totalQuestions: 10,
+        runId: 'run_123',
+        publicAccessToken: 'token_123',
       });
 
       await controller.uploadAndParse(dto as any, 'org_1');

diff --git a/apps/api/src/questionnaire/questionnaire.service.spec.ts b/apps/api/src/questionnaire/questionnaire.service.spec.ts
@@ -36,9 +36,14 @@ jest.mock('@/trigger/questionnaire/answer-question-helpers', () => ({
   generateAnswerWithRAGBatch: jest.fn(),
 }));
 
+jest.mock('@trigger.dev/sdk', () => ({
+  tasks: {
+    trigger: jest.fn(), // resolved value is configured inside the individual tests
+  },
+})); // module replacement: this factory exposes tasks.trigger only
+
 jest.mock('./utils/content-extractor', () => ({
   extractContentFromFile: jest.fn(),
-  extractQuestionsWithAI: jest.fn(),
 }));
 
 jest.mock('./utils/question-parser', () => ({
@@ -59,13 +64,13 @@ jest.mock('./utils/questionnaire-storage', () => ({
 import { db } from '@db';
 import { syncManualAnswerToVector } from '@/vector-store/lib';
 import { answerQuestion } from '@/trigger/questionnaire/answer-question';
+import { tasks } from '@trigger.dev/sdk';
 import {
   updateAnsweredCount,
   persistQuestionnaireResult,
   uploadQuestionnaireFile,
   saveGeneratedAnswer,
 } from './utils/questionnaire-storage';
-import { extractQuestionsWithAI } from './utils/content-extractor';
 import { generateExportFile } from './utils/export-generator';
 
 const mockDb = db as jest.Mocked<typeof db>;
@@ -218,7 +223,7 @@ describe('QuestionnaireService', () => {
       (mockDb.questionnaire.findUnique as jest.Mock).mockResolvedValue(null);
 
       await expect(service.deleteById('missing', 'org_1')).rejects.toThrow(
-        'Questionnaire not found',
+        'Questionnaire with ID missing not found',
       );
       expect(mockDb.questionnaire.delete).not.toHaveBeenCalled();
     });
@@ -454,16 +459,15 @@ describe('QuestionnaireService', () => {
   });
 
   describe('uploadAndParse', () => {
-    it('should upload file, parse questions, and persist', async () => {
+    it('should upload file and trigger async parsing', async () => {
       (uploadQuestionnaireFile as jest.Mock).mockResolvedValue({
         s3Key: 'key',
         fileSize: 1024,
       });
-      (extractQuestionsWithAI as jest.Mock).mockResolvedValue([
-        { question: 'Q1?', answer: null },
-        { question: 'Q2?', answer: null },
-      ]);
-      (persistQuestionnaireResult as jest.Mock).mockResolvedValue('q1');
+      (tasks.trigger as jest.Mock).mockResolvedValue({
+        id: 'run_123',
+        publicAccessToken: 'token_123',
+      });
 
       const result = await service.uploadAndParse({
         organizationId: 'org_1',
@@ -473,23 +477,23 @@ describe('QuestionnaireService', () => {
         source: 'internal',
       } as any);
 
-      expect(result).toEqual({ questionnaireId: 'q1', totalQuestions: 2 });
+      expect(result).toEqual({
+        runId: 'run_123',
+        publicAccessToken: 'token_123',
+      });
       expect(uploadQuestionnaireFile).toHaveBeenCalled();
-      expect(extractQuestionsWithAI).toHaveBeenCalledWith(
-        'base64data',
-        'application/pdf',
-        expect.any(Object),
-      );
-      expect(persistQuestionnaireResult).toHaveBeenCalled();
-    });
-
-    it('should throw when persist returns null', async () => {
-      (uploadQuestionnaireFile as jest.Mock).mockResolvedValue({
+      expect(tasks.trigger).toHaveBeenCalledWith('parse-questionnaire', {
+        inputType: 's3',
+        organizationId: 'org_1',
         s3Key: 'key',
+        fileName: 'test.pdf',
+        fileType: 'application/pdf',
         fileSize: 1024,
       });
-      (extractQuestionsWithAI as jest.Mock).mockResolvedValue([]);
-      (persistQuestionnaireResult as jest.Mock).mockResolvedValue(null);
+    });
+
+    it('should throw when upload fails', async () => {
+      (uploadQuestionnaireFile as jest.Mock).mockResolvedValue(null);
 
       await expect(
         service.uploadAndParse({
@@ -498,7 +502,7 @@ describe('QuestionnaireService', () => {
           fileType: 'application/pdf',
           fileData: 'base64data',
         } as any),
-      ).rejects.toThrow('Failed to save questionnaire');
+      ).rejects.toThrow('Failed to upload questionnaire file to S3');
     });
   });
 

diff --git a/apps/api/src/questionnaire/questionnaire.service.ts b/apps/api/src/questionnaire/questionnaire.service.ts
@@ -5,10 +5,7 @@ import { generateAnswerWithRAGBatch } from '@/trigger/questionnaire/answer-quest
 import { tasks } from '@trigger.dev/sdk';
 import type { parseQuestionnaireTask } from '@/trigger/questionnaire/parse-questionnaire';
 import { ParseQuestionnaireDto } from './dto/parse-questionnaire.dto';
-import {
-  ExportQuestionnaireDto,
-  type QuestionnaireExportFormat,
-} from './dto/export-questionnaire.dto';
+import { ExportQuestionnaireDto } from './dto/export-questionnaire.dto';
 import { AnswerSingleQuestionDto } from './dto/answer-single-question.dto';
 import { SaveAnswerDto } from './dto/save-answer.dto';
 import { DeleteAnswerDto } from './dto/delete-answer.dto';
@@ -24,13 +21,9 @@ import AdmZip from 'adm-zip';
 // Import shared utilities
 import {
   extractContentFromFile,
-  extractQuestionsWithAI,
   type ContentExtractionLogger,
 } from './utils/content-extractor';
-import {
-  parseQuestionsAndAnswers,
-  type QuestionAnswer as ParsedQA,
-} from './utils/question-parser';
+import { parseQuestionsAndAnswers } from './utils/question-parser';
 import {
   generateExportFile,
   type ExportFormat,
@@ -85,12 +78,15 @@ export class QuestionnaireService {
   async parseQuestionnaire(
     dto: ParseQuestionnaireDto,
   ): Promise<ParsedQuestionnaireResult> {
-    // Use faster AI-powered extraction (combines extraction + parsing in one step)
-    const questionsAndAnswers = await extractQuestionsWithAI(
+    const extractedContent = await extractContentFromFile(
       dto.fileData,
       dto.fileType,
       this.contentLogger,
     );
+    const questionsAndAnswers = await parseQuestionsAndAnswers(
+      extractedContent,
+      this.contentLogger,
+    );
 
     return {
       vendorName: dto.vendorName,
@@ -122,24 +118,33 @@ export class QuestionnaireService {
       );
     }
 
-    console.log(Date.now(), 'Parsing questionnaire');
-    // Use faster AI-powered extraction (combines extraction + parsing in one step)
-    const questionsAndAnswers = await extractQuestionsWithAI(
+    this.logger.log('Parsing questionnaire for auto-answer export');
+    const extractedContent = await extractContentFromFile(
       dto.fileData,
       dto.fileType,
       this.contentLogger,
     );
-    console.log(Date.now(), 'Parsed questionnaire');
+    const questionsAndAnswers = await parseQuestionsAndAnswers(
+      extractedContent,
+      this.contentLogger,
+    );
+    this.logger.log('Parsed questionnaire for auto-answer export', {
+      questionCount: questionsAndAnswers.length,
+    });
 
-    console.log(Date.now(), 'Generating answers for questions');
+    this.logger.log('Generating answers for parsed questionnaire', {
+      questionCount: questionsAndAnswers.length,
+    });
     const answered = await this.generateAnswersForQuestions(
       questionsAndAnswers.map((qa) => ({
         question: qa.question,
-        answer: qa.answer,
+        answer: null,
       })),
       dto.organizationId,
     );
-    console.log(Date.now(), 'Generated answers for questions');
+    this.logger.log('Generated questionnaire answers', {
+      questionCount: answered.length,
+    });
 
     const vendorName = dto.vendorName || dto.fileName || 'questionnaire';
 
@@ -186,7 +191,7 @@ export class QuestionnaireService {
     // Single format export (default behavior)
     const exportFile = await generateExportFile(
       answered.map((a) => ({ question: a.question, answer: a.answer })),
-      dto.format as ExportFormat,
+      dto.format,
       vendorName,
     );
 
@@ -432,7 +437,7 @@ export class QuestionnaireService {
 
     return await generateExportFile(
       questionsAndAnswers,
-      dto.format as ExportFormat,
+      dto.format,
       questionnaire.filename,
     );
   }

diff --git a/apps/api/src/questionnaire/utils/constants.ts b/apps/api/src/questionnaire/utils/constants.ts
@@ -2,10 +2,11 @@
  * Shared constants for questionnaire module
  */
 
-// Chunk sizes for question-aware parsing
-export const MAX_CHUNK_SIZE_CHARS = 80_000;
+// Chunk sizes for questionnaire item classification
+export const MAX_CHUNK_SIZE_CHARS = 25_000;
 export const MIN_CHUNK_SIZE_CHARS = 5_000;
 export const MAX_QUESTIONS_PER_CHUNK = 1;
+export const MAX_CLASSIFICATION_CONCURRENCY = 4;
 
 // File size limits
 export const MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024; // 100MB
@@ -30,10 +31,10 @@ CRITICAL RULES:
 8. Always write in first person plural (we, our, us) as if speaking on behalf of the organization.
 9. Keep answers to 1-3 sentences maximum unless the question explicitly requires more detail.`;
 
-export const QUESTION_PARSING_SYSTEM_PROMPT = `You parse vendor questionnaires from Excel, PDF, or document text. Extract all items the respondent is expected to answer (questions, prompts, or compliance statements) and pair each with its answer if one exists.
+export const QUESTION_PARSING_SYSTEM_PROMPT = `You parse vendor questionnaires from Excel, PDF, images, CSV, or document text. Your job is to classify content and return only answerable questionnaire items.
 
 What counts as an item to extract:
-1. Interrogative questions ending with "?" or starting with What/How/Why/When/Where/Is/Are/Do/Does/Can/Will/Should.
+1. Interrogative questions in any language. A question mark is helpful but not required.
 2. Form fields like "1.1 Vendor Name", "Contact Email", "Company Address" (numbered or labeled fields requesting information).
 3. Compliance/requirement statements that the respondent must confirm or describe their compliance with — vendor questionnaires often consist entirely of these. Examples:
    - "The organization must X"
@@ -53,10 +54,13 @@ Input format hints:
 Rules:
 1. Find the column containing the actual item text, not just IDs/numbers (e.g., skip "SQ14.3", keep the full sentence).
 2. Extract the FULL text of each item.
-3. Match each item to its Response/Answer text from the same row. If empty or missing, set answer = null.
-4. Skip pure section headers ("Information Security Program", "General Information") UNLESS they are also items the respondent must answer.
-5. Skip metadata rows (Company Name, Date, file headers).
-6. NEVER return zero items if the document has any rows of substantive content — extract every row that looks like an item the respondent must address.`;
+3. For upload-to-autofill parsing, always set saved answers to null. The user expects us to generate answers later.
+4. Never use scoring/options values as answers, e.g. "(Oui : 0, Non : 3)" or "(Yes : 0, No : 1)".
+5. Never use placeholders as answers, e.g. "A remplir", "A compléter", "To be completed".
+6. Do not treat guidance, instructions, examples, mode opératoire, remediation plans, or calculated score/formula cells as answerable items.
+7. Skip pure section headers ("Information Security Program", "General Information") UNLESS the text itself asks the respondent to provide information.
+8. Skip metadata rows (Company Name, Date, file headers) unless the field is clearly part of the vendor questionnaire response surface.
+9. Be high recall for answerable items, but do not include instructions, examples, scoring, or metadata just to avoid returning zero items.`;
 
 // Vision extraction prompt for PDFs and images
 export const VISION_EXTRACTION_PROMPT = `Transcribe this document into plain text. Output ONLY the document's text content — no summaries, no analysis, no commentary about what the document is or whether it contains questions.

diff --git a/apps/api/src/questionnaire/utils/content-extractor.spec.ts b/apps/api/src/questionnaire/utils/content-extractor.spec.ts
@@ -1,5 +1,7 @@
 import { extractContentFromFile } from './content-extractor';
 import ExcelJS from 'exceljs';
+import { PDFDocument } from 'pdf-lib';
+import { generateText } from 'ai';
 
 // Mock AI dependencies
 jest.mock('@ai-sdk/openai', () => ({ openai: jest.fn() }));
@@ -50,6 +52,31 @@ describe('content-extractor: extractContentFromFile', () => {
     expect(result).toContain('Rating?');
   });
 
+  it('should ignore scoring columns and placeholders in BPCE-style sheets', async () => {
+    const workbook = new ExcelJS.Workbook();
+    const worksheet = workbook.addWorksheet('Risk assessment SSI');
+    worksheet.getCell('B10').value =
+      'Le prestataire effectue-t-il des revues régulières des comptes à privilèges ?'; // B10: question text asserted present below
+    worksheet.getCell('F10').value = {
+      formula: 'IF(E10="NON",3,0)',
+      result: 0,
+    }; // F10: formula cell; asserted absent below
+    worksheet.getCell('K10').value = '(Oui : 0, Non : 3)'; // K10: scoring options text; asserted absent below
+    worksheet.getCell('M10').value = 'A remplir'; // M10: placeholder text; asserted absent below
+    worksheet.getCell('O10').value =
+      "La gestion des comptes à privilèges consiste à contrôler l'accès aux comptes."; // O10: explanatory text; no assertion below covers it
+
+    const arrayBuffer = await workbook.xlsx.writeBuffer();
+    const base64 = Buffer.from(arrayBuffer).toString('base64'); // extractContentFromFile is given the workbook as base64
+    const result = await extractContentFromFile(base64, XLSX_MIME);
+
+    expect(result).toContain('[B10 Question]'); // output is expected to tag the cell with its address and "Question"
+    expect(result).toContain('revues régulières des comptes à privilèges');
+    expect(result).not.toContain('(Oui : 0, Non : 3)');
+    expect(result).not.toContain('A remplir');
+    expect(result).not.toContain('[F10'); // no tag of any kind may be emitted for the formula cell
+  });
+
   it('should extract content from multiple sheets', async () => {
     const buffer = await createTestExcelBuffer([
       {
@@ -85,6 +112,32 @@ describe('content-extractor: extractContentFromFile', () => {
     expect(result).toContain('What is 2+2?,4');
   });
 
+  it('should fall back to OpenAI when Claude PDF extraction is overloaded', async () => {
+    const pdf = await PDFDocument.create();
+    pdf.addPage(); // single blank page; extracted text comes from the mocked generateText, not from the PDF
+    const bytes = await pdf.save();
+    const mockGenerateText = generateText as jest.Mock;
+    mockGenerateText
+      .mockRejectedValueOnce(new Error('Overloaded')) // first generateText call rejects
+      .mockResolvedValueOnce({ text: 'Extracted PDF text' }); // second generateText call resolves
+
+    const result = await extractContentFromFile(
+      Buffer.from(bytes).toString('base64'),
+      'application/pdf',
+    );
+
+    expect(result).toBe('Extracted PDF text'); // NOTE(review): which provider served each call is not asserted — confirm if needed
+    expect(mockGenerateText).toHaveBeenCalledTimes(2); // NOTE(review): assumes call history is cleared between tests — confirm
+  });
+
+  it('should reject legacy XLS files with a clear message', async () => {
+    const base64 = Buffer.from('legacy-binary-xls').toString('base64'); // arbitrary bytes, not a real workbook
+
+    await expect(
+      extractContentFromFile(base64, 'application/vnd.ms-excel'),
+    ).rejects.toThrow('Legacy Excel files'); // a string argument matches as a substring of the error message
+  });
+
   it('should handle plain text files', async () => {
     const text = 'Some compliance document content';
     const base64 = Buffer.from(text).toString('base64');