In [None]:
!pip install -U langchain-ollama

## Create a list of unique scenarios

In [None]:
import pandas as pd

# Semicolon-delimited CSV that select_unique_scenarios reads as its input
scenarios_file = "scenarios.csv"
# CSV that select_unique_scenarios writes: one row per distinct scenario ID
unique_scenarios_file = "scenarios_unique.csv"

def select_unique_scenarios(input, output):
    """Write the first row of every distinct scenario ID to a new CSV file.

    The semicolon-delimited file ``input`` is loaded, surrounding whitespace is
    trimmed from the 'Scenario ID' and 'User' columns, and only the first row
    seen for each 'Scenario ID' is kept. The result, restricted to those two
    columns, is stored in ``output`` (semicolon-separated, no index column).

    Args:
        input: Path of the source CSV file.
        output: Path of the CSV file to create.
    """
    wanted_columns = ['Scenario ID', 'User']
    frame = pd.read_csv(input, delimiter=";")

    # Trim before de-duplicating so IDs that differ only by padding collapse
    for column in wanted_columns:
        frame[column] = frame[column].str.strip()

    deduplicated = frame.drop_duplicates(subset=['Scenario ID'], keep='first')
    deduplicated[wanted_columns].to_csv(output, index=False, sep=";")

    print(f"Unique scenarios saved to {output}")


# Build the de-duplicated scenario file from the raw scenario file
select_unique_scenarios(scenarios_file, unique_scenarios_file)

## Generate prompts

In [None]:
import pandas as pd

def generate_llm_prompt(scenario_id, scenarios_df, threats_df, vulnerabilities_df):
    """
    Generates a formatted LLM prompt for one scenario and the list of known threats.

    Args:
        scenario_id (str): The scenario ID to pull the correct description.
        scenarios_df (pd.DataFrame): DataFrame containing scenarios data; the
            'Scenario ID' and 'User' columns are read.
        threats_df (pd.DataFrame): DataFrame containing threat data; the
            'THREAT ID', 'THREAT' and 'DESCRIPTION' columns are read.
        vulnerabilities_df (pd.DataFrame): Accepted so existing callers keep
            working. The prompt only covers threats, so it is not read.

    Returns:
        str: The LLM prompt formatted as a string.

    Raises:
        ValueError: If no row of ``scenarios_df`` has the given scenario ID.
    """
    # Fail with a clear error instead of an UnboundLocalError further down
    scenario_row = scenarios_df[scenarios_df['Scenario ID'] == scenario_id]
    if scenario_row.empty:
        raise ValueError(f"No scenario found with ID: {scenario_id}")

    # When several rows share the ID, the first one wins
    scenario_description = scenario_row['User'].iloc[0]

    # Format the Threats section: one entry per threat, each closed by a separator line
    threats_text = "\n\n".join(
        f"ThreatID: {row['THREAT ID']}\n"
        f"ThreatName: {row['THREAT']}\n"
        f"ThreatDescription: {row['DESCRIPTION']}\n"
        "---------------------------------"
        for _, row in threats_df.iterrows()
    )

    # Format the complete LLM prompt. Doubled braces are literal braces in an
    # f-string; the example object is kept valid JSON because the prompt asks
    # the model for valid JSON output.
    prompt = f"""
    
    ROLE: You are an assistant specializing in security risk analysis. Your task is to assess the provided scenario and identify any potential threats from the given list.
    Do not generate new threats. Use values that I provide

### **Instructions:**

1. **Read the scenario carefully.** This is the only scenario you need to evaluate. Do not invent any details or make assumptions. Always respond in English.

    - ScenarioID: {scenario_id}
    - Scenario description: "{scenario_description}"

2. **Identify all the applicable threats.** Review the scenario and compare it against the "Threats List" below. If a security threat is present, please explain what the security threat is. Only choose threats that are exact matches from the list. Do not select any threats that are not in the provided list.
    - **Threats List:**
    {threats_text}

    **DO NOT create new threats or mention anything outside of the provided Threats List.** If no threats apply, leave the "Threats" array empty.

3. **Create a new JSON object for every threat you find.** For each identified threat, create a JSON object with the following format:

    {{
        "ScenarioID": "{scenario_id}",
        "Scenario": "{scenario_description}",
        "Threats": [
            {{
                "ThreatID": "<THREAT_ID>",
                "ThreatName": "<THREAT_NAME>",
                "ThreatDescription": "<THREAT_DESCRIPTION>"
            }}
        ]
    }}

    **For Example**
    {{
        "ScenarioID": "{scenario_id}",
        "Scenario": "{scenario_description}",
        "Threats": [
            {{
                "ThreatID": "T2",
                "ThreatName": "Power supply",
                "ThreatDescription": "Power failure to devices which may cause loss or corruption of processed data"
            }}
        ]
    }}


    - Ensure that Threats matches list i provided.

    ### **Important Notes:**
    - **Do not** generate new threats. Use only the values provided in the "Threats List".
    - Ensure that the JSON output is valid and correctly formatted.
    """

    return prompt


def generate_prompts_for_all_scenarios(scenarios_df, threats_df, vulnerabilities_df):
    """
    Build one LLM prompt per entry of the 'Scenario ID' column.

    Args:
        scenarios_df (pd.DataFrame): DataFrame containing scenarios.
        threats_df (pd.DataFrame): DataFrame containing threats.
        vulnerabilities_df (pd.DataFrame): DataFrame containing vulnerabilities.

    Returns:
        list: The prompts, in the same order as the scenario IDs appear.
    """
    return [
        generate_llm_prompt(current_id, scenarios_df, threats_df, vulnerabilities_df)
        for current_id in scenarios_df['Scenario ID']
    ]


# Example usage:

# Load the CSV files into DataFrames with the correct delimiter (semicolon)
scenarios_df = pd.read_csv(unique_scenarios_file, delimiter=';')
threats_df = pd.read_csv('threat.csv', delimiter=';')
vulnerabilities_df = pd.read_csv('vulnerability.csv', delimiter=';')

# Clean the column names (strip spaces) so lookups by header name succeed
scenarios_df.columns = scenarios_df.columns.str.strip()
threats_df.columns = threats_df.columns.str.strip()
vulnerabilities_df.columns = vulnerabilities_df.columns.str.strip()

# Generate prompts for all scenarios
prompts = generate_prompts_for_all_scenarios(scenarios_df, threats_df, vulnerabilities_df)

# Print the first generated prompt as an example. Index 0 is the first
# element; the guard avoids an IndexError when no scenarios were loaded.
if prompts:
    print(prompts[0])


## Generate answers

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import csv

# Upper bound on how many prompts are sent to the model (passed as `limit` below)
answer_limit = 193
# CSV file the generated answers are appended to, one answer per row
csv_filename = "answers.csv"

# Wraps each question and asks the model to reply with JSON only
template = """Question: {question}
Answer: Return only json"""
# Prompt template built from the text above; `question` is filled in at invoke time
prompt = ChatPromptTemplate.from_template(template)

def generate_answer_for_prompt(question):
    """Send one question through the prompt template to the model and return the reply.

    A new ``OllamaLLM`` client for the "qwen2.5" model is created on every
    call and chained after the module-level ``prompt`` template.

    Args:
        question: Text substituted for the template's ``question`` variable.

    Returns:
        Whatever the chain's ``invoke`` call returns for this question.
    """
    llm = OllamaLLM(model="qwen2.5")
    return (prompt | llm).invoke({"question": question})

def generate_answers_for_all_prompts(prompts, limit, answer_file):
    """Answer the leading prompts and append each reply as a row of a CSV file.

    Args:
        prompts: Sequence of prompt strings.
        limit: Maximum number of prompts to process, counted from the start;
            values below one process nothing.
        answer_file: Path of the CSV file to append to (created if missing).
    """
    # Clamp at zero so a negative limit selects nothing rather than slicing from the end
    count = max(0, min(limit, len(prompts)))

    for question in prompts[:count]:
        reply = generate_answer_for_prompt(question)

        # The file is opened and closed per answer, so rows already written
        # stay in the file if a later model call fails.
        with open(answer_file, mode="a", newline="", encoding="utf-8") as handle:
            csv.writer(handle).writerow([reply])

        print(f"Answer saved to {answer_file}")

# Example usage: answer up to `answer_limit` prompts and append the replies to `csv_filename`
generate_answers_for_all_prompts(prompts, answer_limit, csv_filename)

