# Semgrep AI

In [26]:
import os, sys, json, re
from dotenv import load_dotenv
import google.generativeai as genai
from IPython.display import Markdown, display, update_display

In [11]:
# load_dotenv() (python-dotenv) runs first so that a key kept in a local .env
# file is visible through the process environment below.
load_dotenv()
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

# Abort early when the key is missing or empty; nothing below works without it.
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    print("No GEMINI_API_KEY in environment")
    sys.exit(1)

In [8]:
# genai.configure() sets up the library-wide default credentials and returns
# None, so its result is intentionally not bound to a name. The model object
# is created further down with genai.GenerativeModel(...).
genai.configure(
    api_key = GEMINI_API_KEY
)

In [96]:
def extract_code_snippet(filename: str, start_line: int, end_line: int, context_lines: int = 50) -> str:
    """
    Extracts a code snippet from a file with surrounding context lines.

    Every returned line is prefixed with its 1-indexed line number; lines that
    belong to the finding itself are additionally marked with ">> ".

    Args:
        filename (str): The full path to the file.
        start_line (int): The starting line number of the finding (1-indexed).
        end_line (int): The ending line number of the finding (1-indexed).
            Values past the end of the file are clamped to the last line.
        context_lines (int): The number of lines to include before and after
            the finding. Negative values are treated as 0.

    Returns:
        A string containing the numbered code snippet, or a message starting
        with "Error:" if the file cannot be read or the requested line range
        does not exist in the file.
    """

    try:
        with open(filename, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        return f"Error: File not found at '{filename}'."
    except Exception as e:
        return f"Error: Could not read file '{filename}'. Reason: {e}"

    # Reject ranges that cannot point at real lines instead of silently
    # returning an empty (or unmarked) snippet.
    if start_line < 1 or end_line < start_line or start_line > len(lines):
        return (
            f"Error: Invalid line range {start_line}-{end_line} for "
            f"'{filename}' ({len(lines)} lines)."
        )

    context_lines = max(0, context_lines)

    # Convert the 1-indexed, inclusive finding range into 0-indexed slice
    # bounds widened by the context and clamped to the file.
    first_index = max(0, start_line - 1 - context_lines)
    last_index = min(len(lines), end_line + context_lines)

    numbered_lines = []
    for line_number, text in enumerate(lines[first_index:last_index], start=first_index + 1):
        prefix = ">> " if start_line <= line_number <= end_line else "   "
        numbered_lines.append(f"{prefix}{line_number:4d} | {text}")

    return "".join(numbered_lines)

In [117]:
# Load the Semgrep report; the raw text is kept because it is later embedded
# verbatim in the prompt, and the parsed form drives snippet extraction.
try:
    with open("test/report.semgrep.formatted.json", "r", encoding="utf-8") as f:
        report_content = f.read()
except FileNotFoundError:
    print("Error: Report file not found.")
    sys.exit(1)

report = json.loads(report_content)

# Collect one annotated snippet per finding. Previously each iteration
# overwrote `file_snippet`, so only the last finding's code was kept.
file_snippets = []
for finding in report['fingerprints']:
    finding_path = finding['file']
    finding_start_line = finding['start_line']
    finding_end_line = finding['end_line']
    snippet = extract_code_snippet(
        finding_path,
        start_line = finding_start_line,
        end_line = finding_end_line,
        context_lines = 100
    )
    # Label each snippet with its path so it can be matched to its finding.
    file_snippets.append(f"File: {finding_path}\n{snippet}")

# Single string consumed when the prompt is built; empty if there are no findings.
file_snippet = "\n\n".join(file_snippets)

In [None]:
# System instruction for the model: describes the fields of the report's
# `fingerprints` entries, the exact JSON schema the reply must follow, and
# the step-by-step review rules. The text is sent to the model as-is.
system_prompt = """
You are a Security Analyst AI.
Your task is to review static code analysis findings and their corresponding code.
You will be provided with a JSON object containing the fingerprints from a static code analyzer report and the code snippet with its surrounding lines of each file.
Your sole output must be a single JSON object with the following structure and no other text or explanation.
First analyze the report. Your findings nest under the `fingerprints` array and include the following fields:
- fingerprint: the fingerprint of the finding
- warning_type: the warning category of the finding
- severity: includes WARNING, ERROR or INFO category values of the finding
- confidence: the confidence level of the finding
- impact: the actual severity of the finding
- check_name: the rule file that the finding was checked against
- message: a short message about the finding
- file: the file that the finding was found
- start_line: the starting line of the finding
- end_line: the ending line of the finding
- code: the small code part of the finding
- references: a reference url for the finding

You are tasked with reviewing these files with their respective findings and you have to output a single response with a JSON object with the following format:
{
  "classification": "True Positive" | "False Positive",
  "confidence": "LOW" | "MEDIUM" | "HIGH",
  "remediation": [
    "<string>"
  ],
  "references": [
    "<string>"
  ],
  "comment": "<string>",
  "code": [
      "<string>"
  ]
}

You have to do the following:
1. Analyze the Input: You will receive a JSON object containing the fingerprints array. Each object in the array represents a single security finding.
2. Determine Classification: Based on the provided code and message, classify the finding as either a "True Positive" or a "False Positive".
3. Set Confidence Level: Indicate your confidence in this classification as "LOW", "MEDIUM", or "HIGH".
4. Provide Remediation: If you classify the finding as a "True Positive", provide a remediation array containing one or more clear, actionable steps to fix the vulnerability. If it's a "False Positive", this array should contain a brief explanation array item. Keep in mind that we have implemented `purifyHTML` and `purifyURL` functions that can be used to sanitize HTML and URLs respectively. Recommend them if applicable. An example of purifyHTML: `import { purifyHTML } from 'app/helpers/purify'; ... dangerouslySetInnerHTML={{ __html: purifyHTML(formattedBody) }} ...`. You can enhance their code in the remediation actions for more eye-pleasing output.
5. Provide Code examples: Do not output whole code blocks in the remediation array. Use the code section for such purposes with one or more examples. Feel free to add example code resolving the mitigation or alternatives.
6. Manage References: Populate the references array. Always include the original reference URL from the input. You may add one to two additional, relevant, and active URLs.
7. Write a Comment: In the comment field, provide a concise plaintext summary of your analysis. For instance: "This looks like a True Positive with high confidence because the user-controlled input is passed directly to a SQL query, leading to a potential SQL injection." or "This appears to be a False Positive with high confidence as the variable is hardcoded and not influenced by external input."
8. Final Output: Ensure your final output is only the JSON object described above. Do NOT include any other text, markdown formatting, or explanations before or after the JSON object. Ignore any empty or irrelevant fields from the initial report.
9. You will be provided with a single Static code analysis report and more than one files containing the code relevant to the finding. Use them accordingly.
10. Your output has to be compatible with python method json.loads(response.text).
"""
# The user message carries the raw report text followed by the extracted code context.
user_prompt = f"Please review. Static code analysis report:\n\n{report_content}\n\nFile contents:\n\n{file_snippet}\n\n"

# Low temperature plus a JSON response MIME type, so the reply is parseable.
generation_config = {
    "temperature": 0.2,
    "max_output_tokens": 8192,
    "response_mime_type": "application/json"
}

try:
    gemini = genai.GenerativeModel(
        model_name = "gemini-2.5-pro",
        system_instruction = system_prompt
    )
    response = gemini.generate_content(
        user_prompt,
        generation_config = generation_config
    )
    print(response.text)
except Exception as e:
    print(f"Something went wrong: {e}")
    # Re-raise so execution stops at this cell. Otherwise later cells would
    # hit an unbound `response`, or silently reuse one from an earlier run.
    raise

In [None]:
def parse_llm_json_output(response_text: str) -> "dict | None":
    """
    Parses a JSON object from a model reply that might be wrapped in a
    Markdown code block or surrounded by extra text.

    The following candidates are tried in order until one decodes:
      1. the content of a fenced block (with or without a `json` language
         tag, matched case-insensitively),
      2. the whole (stripped) text,
      3. the span from the first "{" to the last "}".

    Args:
        response_text (str): Raw text returned by the model.

    Returns:
        The decoded JSON value, or None if the text is empty or none of the
        candidates is valid JSON (the last decoding error is printed).
    """

    if not response_text:
        print("Error decoding JSON: empty response")
        return None

    text = response_text.strip()
    candidates = []

    # Prefer the content of a fenced code block; the language tag is optional.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1))

    # Otherwise, assume the whole string is the JSON
    candidates.append(text)

    # Last resort: cut away any text before the first "{" and after the last "}".
    first_brace, last_brace = text.find("{"), text.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(text[first_brace:last_brace + 1])

    last_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            last_error = e

    print(f"Error decoding JSON: {last_error}")
    return None

# Decode the text of the model reply held in `response`.
response_json = parse_llm_json_output(response.text)
# Bare expression so the notebook displays the parsed value.
response_json

In [120]:
def markdown_json_output(response: dict, file_type: str) -> str:
    """
    Renders the final analysis JSON object as GitHub compatible Markdown text.

    Args:
        response (dict): Parsed model reply. The keys `classification`,
            `confidence`, `comment`, `remediation`, `references` and `code`
            are read; missing or empty keys are skipped instead of raising.
        file_type (str): Language tag placed on the fenced code example.

    Returns:
        The Markdown text. If `response` is None or empty, a short notice is
        returned in place of the analysis.
    """

    def as_list(value):
        # Accept a bare string where a list is expected, so that join() does
        # not split it into single characters.
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    heading = "## ✨ Gemini thoughs"

    if not response:
        return "\n".join([heading, "No analysis available."])

    classification = response.get('classification') or "Unknown"
    confidence = str(response.get('confidence') or "unknown").lower()
    comment = str(response.get('comment') or "")

    # Use emojis for quick visual identification
    classification_emoji = {
        "True Positive": "❌",
        "False Positive": "✅",
    }.get(classification, "❓")

    markdown_parts = [heading]

    # Add the summary line only when the comment does not already state the verdict.
    if not ("True Positive" in comment or "False Positive" in comment):
        markdown_parts.append(f"{classification_emoji} This looks like a **{classification}** with a **{confidence}** confidence.")

    if comment:
        markdown_parts.append(comment)

    remediation = as_list(response.get('remediation'))
    if remediation:
        markdown_parts.append("### 🛠️ Remediation")
        remediation_steps = [f"{i+1}. {step}" for i, step in enumerate(remediation)]
        markdown_parts.append("\n".join(remediation_steps))
        markdown_parts.append("\n")

    references = as_list(response.get('references'))
    if references:
        markdown_parts.append("### 🔗 References")
        # Link text is the URL without its scheme.
        reference_links = [f"- [{str(url).split('//')[-1]}]({url})" for url in references]
        markdown_parts.append("\n".join(reference_links))

    code = as_list(response.get('code'))
    if code:
        markdown_parts.append("### 🖥️ Code Example")
        code_block = "\n".join(str(part) for part in code)
        markdown_parts.append(f"```{file_type or ''}\n{code_block}\n```")
        markdown_parts.append("\n")

    return "\n".join(markdown_parts)

# `file_type` sets the language tag of the rendered code example. Derive it
# from the extension of the analysed file (e.g. "jsx") rather than relying on
# a value left over in the kernel from an earlier session.
file_type = os.path.splitext(finding_path)[1].lstrip(".")
display(Markdown(markdown_json_output(response_json, file_type)))

## ✨ Gemini thoughs
This is a True Positive finding. The `formattedBody` prop, which contains content that can be influenced by a user, is passed directly to `dangerouslySetInnerHTML`. This is a classic Cross-Site Scripting (XSS) vulnerability. The content must be sanitized on the client-side before being rendered to prevent malicious script execution.
### 🛠️ Remediation
1. The `formattedBody` prop, which likely contains user-generated content, is used with `dangerouslySetInnerHTML` without sanitization. This can lead to Cross-Site Scripting (XSS) attacks. Sanitize the HTML content using the `purifyHTML` helper function before rendering it.


### 🔗 References
- [react.dev/reference/react-dom/components/common#dangerously-setting-the-inner-html](https://react.dev/reference/react-dom/components/common#dangerously-setting-the-inner-html)
- [owasp.org/www-community/attacks/xss/](https://owasp.org/www-community/attacks/xss/)
- [github.com/cure53/DOMPurify](https://github.com/cure53/DOMPurify)
### 🖥️ Code Example
```jsx
import { purifyHTML } from 'app/helpers/purify';

// ... inside the Comment component

          <div
            className="qna-comment-content"
            dangerouslySetInnerHTML={{ __html: purifyHTML(formattedBody) }}
          />
```

