From c74e2a15ca3039710762e82effd471188b556623 Mon Sep 17 00:00:00 2001
From: Mariano Fuentes <marfuen98@gmail.com>
Date: Thu, 7 May 2026 17:22:49 +0100
Subject: [PATCH] fix(questionnaire): extract compliance statements as
 questions

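Treat compliance statements such as "The organization must X" as
questionnaire items in the parsing prompts and in looksLikeQuestionLine,
and add unit tests for the question-parser helpers.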
---
 apps/api/src/questionnaire/utils/constants.ts |  58 ++--
 .../utils/question-parser.spec.ts             | 286 ++++++++++++++++++
 .../questionnaire/utils/question-parser.ts    |  22 +-
 3 files changed, 333 insertions(+), 33 deletions(-)
 create mode 100644 apps/api/src/questionnaire/utils/question-parser.spec.ts

diff --git a/apps/api/src/questionnaire/utils/constants.ts b/apps/api/src/questionnaire/utils/constants.ts
index 54c648b31b..972d96ea76 100644
--- a/apps/api/src/questionnaire/utils/constants.ts
+++ b/apps/api/src/questionnaire/utils/constants.ts
@@ -30,37 +30,43 @@ CRITICAL RULES:
 8. Always write in first person plural (we, our, us) as if speaking on behalf of the organization.
 9. Keep answers to 1-3 sentences maximum unless the question explicitly requires more detail.`;
 
-export const QUESTION_PARSING_SYSTEM_PROMPT = `You parse vendor questionnaires from Excel spreadsheets. Extract all question-answer pairs.
+export const QUESTION_PARSING_SYSTEM_PROMPT = `You parse vendor questionnaires from Excel, PDF, or document text. Extract all items the respondent is expected to answer (questions, prompts, or compliance statements) and pair each with its answer if one exists.
 
-Input format:
-- Each row has columns like: [Question] ID | [Question Text] actual question | [Response] answer | [Comment] notes
-- Or: [Question] actual question text | [Response] answer
-- Lines starting with [COLUMNS:] show the column headers - use these to understand the structure
-- The actual question TEXT is usually the longest cell, contains "?" or starts with What/How/Do/Is/Are/etc.
+What counts as an item to extract:
+1. Interrogative questions ending with "?" or starting with What/How/Why/When/Where/Is/Are/Do/Does/Can/Will/Should.
+2. Form fields like "1.1 Vendor Name", "Contact Email", "Company Address" (numbered or labeled fields requesting information).
+3. Compliance/requirement statements that the respondent must confirm or describe their compliance with — vendor questionnaires often consist entirely of these. Examples:
+   - "The organization must X"
+   - "The organization has X"
+   - "The organization ensures Y"
+   - "The organization implements Z"
+   - "We have a documented procedure for X"
+   Each such statement is one item, even with no question mark.
+4. Items marked with "*", "(required)", "(Single selection allowed)", "(Multiple selections allowed)".
 
-CRITICAL: The "Question" column might contain just an ID (like "SQ14.3") - look for the column with the ACTUAL question text!
+Input format hints:
+- Tables have rows like: [Question] ID | [Question Text] actual text | [Response] answer | [Comment] notes
+- Or simpler: [Question] text | [Response] answer
+- Lines starting with [COLUMNS:] show column headers — use them to find the right column.
+- Single-column checklists: each row IS the item. The answer column may be empty (set answer = null).
 
 Rules:
-1. Find the column containing actual question sentences (not just IDs/numbers)
-2. The question text is usually a full sentence ending with "?" or starting with interrogative words
-3. Extract the FULL question text, not the question ID
-4. Match each question to its Response/Answer from the same row
-5. If Response is empty, set answer to null
-6. Skip section headers (e.g., "Information Security Program", "General Information")
-7. Skip metadata rows (Company Name, Date, etc.)`;
+1. Find the column containing the actual item text, not just IDs/numbers (e.g., skip "SQ14.3", keep the full sentence).
+2. Extract the FULL text of each item.
+3. Match each item to its Response/Answer text from the same row. If empty or missing, set answer = null.
+4. Skip pure section headers ("Information Security Program", "General Information") UNLESS they are also items the respondent must answer.
+5. Skip metadata rows (Company Name, Date, file headers).
+6. NEVER return zero items if the document has any rows of substantive content — extract every row that looks like an item the respondent must address.`;
 
 // Vision extraction prompt for PDFs and images
-export const VISION_EXTRACTION_PROMPT = `Extract all text and identify question-answer pairs from this document.
+export const VISION_EXTRACTION_PROMPT = `Transcribe this document into plain text. Output ONLY the document's text content — no summaries, no analysis, no commentary about what the document is or whether it contains questions.
 
-Look for:
-- Tables with columns labeled "Question", "Q", "Response", "Answer", "A", "Comment"
-- Questions ending with "?" or starting with What/How/Why/When/Where/Is/Are/Do/Does/Can/Will/Should
-- Numbered questions like "06. Do you have...", "1) What is...", "Q1: How do..."
-- Section headers (e.g., "Information Security Program", "General Information") that group questions
-
-For each question found:
-- Extract the full question text (may omit number prefix)
-- Match it to any nearby response/answer in the same row or adjacent cell
-- If no answer is provided, note it as empty
+Rules:
+- Output every visible row, cell, paragraph, list item, and heading. Do not skip rows that "look like statements" — vendor questionnaires often consist entirely of compliance/requirement statements ("The organization must X", "The organization has X") that the respondent fills in, with no question marks or interrogatives.
+- For tables: preserve row order and use " | " to separate cells in the same row. If columns have headers (e.g., Question, Response, Answer, Comment), keep them and prefix each cell with [Header].
+- For single-column checklists (one statement per row): output one statement per line, in document order.
+- For each row, include any answer/response/comment text from adjacent columns or rows, even if the answer cell is blank.
+- Do NOT add bullet points, numbering, or formatting that wasn't in the source.
+- Do NOT write things like "no questions found", "this is a compliance document", or any meta-analysis. Just transcribe the content.
 
-Preserve the order of questions as they appear. Return Question → Answer pairs in a structured format.`;
+The downstream parser will identify which rows are questions/items and which are answers — your only job is to faithfully extract the text.`;
diff --git a/apps/api/src/questionnaire/utils/question-parser.spec.ts b/apps/api/src/questionnaire/utils/question-parser.spec.ts
new file mode 100644
index 0000000000..c930f4b46d
--- /dev/null
+++ b/apps/api/src/questionnaire/utils/question-parser.spec.ts
@@ -0,0 +1,286 @@
+jest.mock('@ai-sdk/openai', () => ({ openai: jest.fn() }));
+jest.mock('ai', () => ({
+  generateObject: jest.fn(),
+  jsonSchema: jest.fn((s) => s),
+}));
+
+import {
+  looksLikeQuestionLine,
+  buildQuestionAwareChunks,
+  estimateQuestionCount,
+} from './question-parser';
+
+const CHUNK_OPTS = {
+  maxChunkChars: 80_000,
+  minChunkChars: 5_000,
+  maxQuestionsPerChunk: 1,
+};
+
+describe('looksLikeQuestionLine', () => {
+  describe('existing patterns (regression guard)', () => {
+    it('detects question marks', () => {
+      expect(looksLikeQuestionLine('Do you have a security policy?')).toBe(
+        true,
+      );
+      expect(looksLikeQuestionLine('Company Name：何ですか？')).toBe(true);
+    });
+
+    it('detects "Question:" labels', () => {
+      expect(looksLikeQuestionLine('[Question] SQ14.3')).toBe(false);
+      expect(
+        looksLikeQuestionLine('Question: Do you encrypt data at rest'),
+      ).toBe(true);
+      expect(looksLikeQuestionLine('question : Describe your BCP')).toBe(true);
+    });
+
+    it('detects explicit question/Q prefix', () => {
+      expect(looksLikeQuestionLine('Q1. What is your data retention?')).toBe(
+        true,
+      );
+      expect(looksLikeQuestionLine('Question 5: Describe controls')).toBe(true);
+    });
+
+    it('detects interrogative-starting lines', () => {
+      expect(
+        looksLikeQuestionLine('What security certifications do you hold'),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine('How do you handle data breaches'),
+      ).toBe(true);
+      expect(looksLikeQuestionLine('Is data encrypted in transit')).toBe(true);
+      expect(looksLikeQuestionLine('Are backups tested regularly')).toBe(true);
+      expect(looksLikeQuestionLine('Does the company have SOC 2')).toBe(true);
+      expect(looksLikeQuestionLine('Can users export their data')).toBe(true);
+      expect(
+        looksLikeQuestionLine('Describe your incident response plan'),
+      ).toBe(true);
+      expect(looksLikeQuestionLine('List all subprocessors')).toBe(true);
+    });
+
+    it('detects numbered questions with interrogatives', () => {
+      expect(looksLikeQuestionLine('06. Do you have a BCP?')).toBe(true);
+      expect(looksLikeQuestionLine('1) What is your uptime SLA')).toBe(true);
+      expect(looksLikeQuestionLine('Q1: How do you handle PII')).toBe(true);
+    });
+
+    it('detects form-style numbered fields', () => {
+      expect(looksLikeQuestionLine('1.1 Vendor Name')).toBe(true);
+      expect(looksLikeQuestionLine('2.3 Contact Email')).toBe(true);
+      expect(looksLikeQuestionLine('1.4 Company Address')).toBe(true);
+    });
+
+    it('detects required markers', () => {
+      expect(looksLikeQuestionLine('Company legal name *')).toBe(true);
+    });
+
+    it('detects selection notes', () => {
+      expect(
+        looksLikeQuestionLine(
+          'Primary data center location (Single selection allowed)',
+        ),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine(
+          'Which certifications (Multiple selections allowed)',
+        ),
+      ).toBe(true);
+    });
+
+    it('rejects empty lines and pure metadata', () => {
+      expect(looksLikeQuestionLine('')).toBe(false);
+      expect(looksLikeQuestionLine('   ')).toBe(false);
+    });
+  });
+
+  describe('compliance-statement patterns (new)', () => {
+    it('detects "The organization" statements', () => {
+      expect(
+        looksLikeQuestionLine(
+          'The organization must determine the respective roles and responsibilities',
+        ),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine(
+          'The organization has entered into a contract with the PII data controller',
+        ),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine(
+          'The organization ensures that temporary files are deleted',
+        ),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine(
+          'The organization conducts a risk analysis regarding PII processing',
+        ),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine(
+          'The organization implements procedures and technical solutions',
+        ),
+      ).toBe(true);
+    });
+
+    it('detects "The company/vendor/supplier" statements', () => {
+      expect(
+        looksLikeQuestionLine(
+          'The company maintains a processing register for compliance',
+        ),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine(
+          'The vendor provides the data controller with appropriate information',
+        ),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine(
+          'The supplier has defined policies for managing data subject requests',
+        ),
+      ).toBe(true);
+    });
+
+    it('detects "Our organization/company/team" statements', () => {
+      expect(
+        looksLikeQuestionLine(
+          'Our organization maintains SOC 2 Type II certification',
+        ),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine('Our company encrypts all data at rest'),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine('Our team conducts quarterly security reviews'),
+      ).toBe(true);
+    });
+
+    it('does NOT match ambiguous "We X" lines (could be answers)', () => {
+      expect(
+        looksLikeQuestionLine('We retain data for 90 days.'),
+      ).toBe(false);
+      expect(
+        looksLikeQuestionLine('We follow our IRP documented in SOC 2.'),
+      ).toBe(false);
+    });
+
+    it('does NOT false-positive on section headers and metadata', () => {
+      expect(
+        looksLikeQuestionLine('Information Security Program'),
+      ).toBe(false);
+      expect(looksLikeQuestionLine('General Information')).toBe(false);
+      expect(looksLikeQuestionLine('Section 2: Data Protection')).toBe(false);
+      expect(looksLikeQuestionLine('Acme Corp')).toBe(false);
+      expect(looksLikeQuestionLine('2026-01-15')).toBe(false);
+      expect(looksLikeQuestionLine('Version 3.0')).toBe(false);
+      expect(looksLikeQuestionLine('Confidential')).toBe(false);
+    });
+
+    it('handles case-insensitive matching', () => {
+      expect(
+        looksLikeQuestionLine('THE ORGANIZATION MUST PROVIDE EVIDENCE'),
+      ).toBe(true);
+      expect(
+        looksLikeQuestionLine('the organization supports the data controller'),
+      ).toBe(true);
+    });
+  });
+});
+
+describe('buildQuestionAwareChunks', () => {
+  it('returns empty for empty input', () => {
+    expect(buildQuestionAwareChunks('', CHUNK_OPTS)).toEqual([]);
+    expect(buildQuestionAwareChunks('   ', CHUNK_OPTS)).toEqual([]);
+  });
+
+  it('chunks interrogative questions one per chunk', () => {
+    const content = [
+      'What is your data retention policy?',
+      'We retain data for 90 days.',
+      'How do you handle security incidents?',
+      'We follow our IRP documented in SOC 2.',
+      'Do you encrypt data at rest?',
+      'Yes, AES-256.',
+    ].join('\n');
+
+    const chunks = buildQuestionAwareChunks(content, CHUNK_OPTS);
+    expect(chunks.length).toBe(3);
+    expect(chunks[0].content).toContain('data retention policy');
+    expect(chunks[1].content).toContain('security incidents');
+    expect(chunks[2].content).toContain('encrypt data at rest');
+  });
+
+  it('chunks compliance statements one per chunk', () => {
+    const content = [
+      'The organization must determine roles and responsibilities for PII processing.',
+      'The organization has entered into a contract with the PII data controller.',
+      'The organization conducts a risk analysis regarding PII processing.',
+    ].join('\n');
+
+    const chunks = buildQuestionAwareChunks(content, CHUNK_OPTS);
+    expect(chunks.length).toBe(3);
+    expect(chunks[0].content).toContain('roles and responsibilities');
+    expect(chunks[1].content).toContain('entered into a contract');
+    expect(chunks[2].content).toContain('risk analysis');
+  });
+
+  it('handles mixed interrogative + compliance content', () => {
+    const content = [
+      'The organization must have documented procedures for PII deletion.',
+      'How often do you review your data retention policies?',
+      'The organization ensures temporary files are deleted.',
+      'Do you have a DPIA process?',
+    ].join('\n');
+
+    const chunks = buildQuestionAwareChunks(content, CHUNK_OPTS);
+    expect(chunks.length).toBe(4);
+  });
+
+  it('keeps non-question context lines with the preceding question', () => {
+    const content = [
+      'What is your encryption standard?',
+      'Please provide details about key management.',
+      'Additional notes on rotation schedule.',
+      'How do you handle key rotation?',
+    ].join('\n');
+
+    const chunks = buildQuestionAwareChunks(content, CHUNK_OPTS);
+    expect(chunks.length).toBe(2);
+    expect(chunks[0].content).toContain('key management');
+    expect(chunks[0].content).toContain('rotation schedule');
+    expect(chunks[1].content).toContain('key rotation');
+  });
+
+  it('falls back to single chunk when no patterns match', () => {
+    const content = [
+      'Acme Corp Security Assessment',
+      'Prepared by: John Smith',
+      'Date: 2026-01-15',
+      'Version 3.0',
+    ].join('\n');
+
+    const chunks = buildQuestionAwareChunks(content, CHUNK_OPTS);
+    expect(chunks.length).toBe(1);
+    expect(chunks[0].content).toContain('Acme Corp');
+  });
+});
+
+describe('estimateQuestionCount', () => {
+  it('counts question marks when present', () => {
+    expect(
+      estimateQuestionCount('Q1? Q2? Q3?'),
+    ).toBe(3);
+  });
+
+  it('counts lines matching looksLikeQuestionLine when no question marks', () => {
+    const text = [
+      'The organization must have a BCP.',
+      'The organization ensures data is encrypted.',
+      'Section header',
+    ].join('\n');
+    expect(estimateQuestionCount(text)).toBe(2);
+  });
+
+  it('uses fallback heuristic for unrecognized content', () => {
+    const text = 'a'.repeat(3600);
+    expect(estimateQuestionCount(text)).toBe(3);
+  });
+});
diff --git a/apps/api/src/questionnaire/utils/question-parser.ts b/apps/api/src/questionnaire/utils/question-parser.ts
index 4108aec160..4b0ee35f8b 100644
--- a/apps/api/src/questionnaire/utils/question-parser.ts
+++ b/apps/api/src/questionnaire/utils/question-parser.ts
@@ -183,12 +183,13 @@ function buildParsingPrompt(
   totalChunks: number,
 ): string {
   const instructions = `Instructions:
-- Extract all question → answer pairs from this questionnaire data
-- IMPORTANT: Look for the actual question TEXT (full sentences), NOT just question IDs like "SQ14.3"
-- The question text is the cell containing a full sentence (often ending with "?" or starting with What/How/Do/Is/Are/Does/Can/Will/Should)
-- Match each question to its corresponding Response/Answer value from the same row
-- If the Response/Answer is empty, set answer to null
-- Skip section headers and metadata rows`;
+- Extract every item the respondent must address from this questionnaire data, paired with its answer.
+- Items include: interrogative questions (ending in "?" or starting with What/How/Do/Is/Are/etc.), form fields ("1.1 Vendor Name"), AND compliance/requirement statements ("The organization must X", "The organization has X", "We have X"). All are valid items.
+- IMPORTANT: Extract the FULL item text, not just IDs like "SQ14.3". Find the column with the actual sentences.
+- Match each item to its corresponding Response/Answer/Comment from the same row.
+- If the Response/Answer is empty or missing, set answer to null.
+- Skip pure section headers (e.g., "Information Security Program") and metadata (Company Name, Date).
+- If the document is a single-column checklist of statements, treat each row as one item with answer = null.`;
 
   if (totalChunks > 1) {
     return `${instructions}
@@ -306,6 +307,12 @@ export function looksLikeQuestionLine(line: string): boolean {
   const hasSelectionNote =
     /\((?:single|multiple)\s+selection|allows?\s+other|required\)/i.test(line);
 
+  // Compliance-statement style: "The organization X", "Our company X".
+  // Vendor questionnaires often consist entirely of these rows. Bare "We X"
+  // lines are deliberately not matched because they are often answers.
+  const compliancePrefix =
+    /^(?:the\s+organization|the\s+company|the\s+vendor|the\s+supplier|the\s+respondent|our\s+(?:organization|company|team))\b/i;
+
   return (
     hasQuestionMark ||
     questionLabel.test(line) ||
@@ -314,7 +321,8 @@ export function looksLikeQuestionLine(line: string): boolean {
     numberedQuestionWithInterrogative.test(line) ||
     formStyleNumberedField.test(line) ||
     hasRequiredMarker ||
-    hasSelectionNote
+    hasSelectionNote ||
+    compliancePrefix.test(line)
   );
 }