In [12]:
import json
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from bs4 import BeautifulSoup

import re


In [6]:
# 1. Setup Ollama with  existing model
llm = Ollama(model="mistral:latest")

# 2. Load Data
with open('../2000002.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [7]:
# Helper to clean HTML
def clean_text(html):
    return BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True)

In [8]:
# 3. Create the Extraction Chain
template = """
You are an expert SAP Technical Writer.
I will give you a section of an SAP Knowledge Base Article (KBA).
Your task is to create a training example for a Chatbot.

Format your response exactly as a JSON object with these keys:
- "instruction": A question a user might ask about this problem.
- "input": The context (Symptom or Cause).
- "output": The solution (Resolution).

KBA Title: {title}
Content: {content}

JSON Output:
"""

In [9]:
prompt = PromptTemplate(template=template, input_variables=["title", "content"])

In [10]:
# 4. Run Extraction
dataset = []
title = data.get('shortText')

# We combine relevant sections for the model to analyze
full_content = ""
for section in data.get('texts', []):
    if section['title'] in ['Symptom', 'Cause', 'Resolution']:
        full_content += f"\n[{section['title']}]\n{clean_text(section['text'])}"

print("Now  Mistral to extract Q&A pairs... (this may take a minute)")
try:
    # Invoke Ollama
    formatted_prompt = prompt.format(title=title, content=full_content[:3000]) # Trim to fit context
    response = llm.invoke(formatted_prompt)
    
    # Attempt to parse the JSON output from Mistral
    # (LLMs sometimes add text before/after JSON, so we find the braces)
    json_start = response.find('{')
    json_end = response.rfind('}') + 1
    clean_json = response[json_start:json_end]
    
    entry = json.loads(clean_json)
    dataset.append(entry)
    print("Success! Extracted:", entry['instruction'])

except Exception as e:
    print(f"Extraction failed: {e}")
    # Fallback to manual if LLM fails
    dataset.append({
        "instruction": f"How do I fix {title}?",
        "input": "",
        "output": "See attached KBA."
    })

Now  Mistral to extract Q&A pairs... (this may take a minute)
Extraction failed: Ollama call failed with status code 500. Details: {"error":"llama runner process has terminated: error loading model: unable to allocate CUDA0 buffer\nllama_model_load_from_file_impl: failed to load model"}


In [13]:


class SAPKBAExtractor:
    def __init__(self, file_path):
        self.file_path = file_path
        self.data = self._load_data()
        
    def _load_data(self):
        with open(self.file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _clean_html(self, html_content):
        """Removes HTML tags and normalizes whitespace."""
        if not html_content:
            return ""
        soup = BeautifulSoup(html_content, "html.parser")
        text = soup.get_text(separator=" ", strip=True)
        # Remove multiple spaces/newlines
        return re.sub(r'\s+', ' ', text).strip()

    def extract_pairs(self):
        """
        Generates Instruction/Input/Output pairs.
        Returns a list of dictionaries.
        """
        pairs = []
        
        # 1. Extract Core Metadata
        kba_id = str(self.data.get('_id', 'Unknown ID'))
        title = self.data.get('shortText', 'Unknown Issue')
        
        # 2. Extract Sections (Symptom, Cause, Resolution)
        texts = {t['title']: self._clean_html(t['text']) for t in self.data.get('texts', [])}
        
        symptom = texts.get('Symptom', '')
        cause = texts.get('Cause', '')
        resolution = texts.get('Resolution', '')
        
        # --- STRATEGY 1: Contextual Troubleshooting (The "Hard" Question) ---
        # Instruction: User describes the problem.
        # Output: Model provides the solution.
        if symptom and resolution:
            pairs.append({
                "instruction": f"I am facing an issue in SAP HANA. The symptoms are: {symptom[:300]}... How do I resolve this?",
                "input": f"Context: {title}", 
                "output": resolution
            })

        # --- STRATEGY 2: Direct ID Lookup (The "Expert" Question) ---
        # Instruction: User asks about a specific Note ID.
        # Output: Model summarizes the note.
        if resolution:
            pairs.append({
                "instruction": f"What is the resolution provided in SAP Note {kba_id}?",
                "input": "",
                "output": f"Title: {title}\n\nResolution: {resolution}"
            })

        # --- STRATEGY 3: Title-Based Query (The "Search" Question) ---
        # Instruction: User asks using the exact error message/title.
        # Output: Model provides Cause + Resolution.
        if title and resolution:
            combined_answer = f"Cause: {cause}\n\nFix: {resolution}" if cause else resolution
            pairs.append({
                "instruction": f"How do I fix the SAP error: '{title}'?",
                "input": "",
                "output": combined_answer
            })
            
        return pairs

    def save_to_jsonl(self, pairs, output_file):
        with open(output_file, 'w', encoding='utf-8') as f:
            for pair in pairs:
                json.dump(pair, f)
                f.write('\n')
        print(f" Successfully saved {len(pairs)} training pairs to {output_file}")



In [14]:
# --- EXECUTION ---

    # Initialize with your uploaded file
extractor = SAPKBAExtractor('../2000002.json')

# Run Extraction
dataset = extractor.extract_pairs()

# Save Result
extractor.save_to_jsonl(dataset, 'sap_instruction_dataset.jsonl')

# Preview
print("\n--- Sample Pair ---")
print(json.dumps(dataset[0], indent=2))

 Successfully saved 3 training pairs to sap_instruction_dataset.jsonl

--- Sample Pair ---
{
  "instruction": "I am facing an issue in SAP HANA. The symptoms are: SQL statements run for a long time or consume a high amount of resources in terms of memory and CPU.... How do I resolve this?",
  "input": "Context: FAQ: SAP HANA SQL Optimization",
}
