In [None]:
!pip install -U langchain-ollama

## Create a list of unique scenarios

In [None]:
import pandas as pd

# Semicolon-delimited CSV holding the raw scenario rows (input of the de-duplication step).
scenarios_file = "scenarios.csv"
# Semicolon-delimited CSV that receives one row per unique scenario ID.
unique_scenarios_file = "scenarios_unique.csv"

def select_unique_scenarios(input, output):
    """
    Write one row per unique scenario ID from a semicolon-delimited CSV.

    Column headers and the values of the 'Scenario ID' and 'User' columns are
    stripped of surrounding whitespace before de-duplication. For every
    scenario ID only the first row encountered is kept.

    Args:
        input (str): Path of the semicolon-delimited CSV file to read. It must
            contain the columns 'Scenario ID' and 'User'.
        output (str): Path of the semicolon-delimited CSV file to write. It
            contains only the 'Scenario ID' and 'User' columns.

    Raises:
        KeyError: If a required column is missing from the input file.
    """
    # Read the CSV file
    df = pd.read_csv(input, delimiter=";")

    # Headers can carry padding around the delimiter (e.g. "Scenario ID ;User");
    # normalise them first so the column lookups below do not fail.
    df.columns = df.columns.str.strip()

    # Strip unnecessary leading and trailing spaces from the values
    for column in ('Scenario ID', 'User'):
        df[column] = df[column].str.strip()

    # Keep the first row of every scenario ID, restricted to the two columns
    unique = df.drop_duplicates(subset=['Scenario ID'], keep='first')[
        ['Scenario ID', 'User']]

    # Save the unique rows to a new CSV file
    unique.to_csv(output, index=False, sep=";")

    print(f"Unique scenarios saved to {output}")


# De-duplicate the raw scenarios and write the result to the unique-scenarios file.
select_unique_scenarios(scenarios_file, unique_scenarios_file)

## Generate prompts

In [None]:
import pandas as pd

def generate_llm_prompt(scenario_id, scenarios_df, threats_df, vulnerabilities_df):
    """
    Generates a formatted LLM prompt based on the given scenario, threats, and vulnerabilities.

    The prompt contains the scenario ID and description, every row of the
    threats table, every row of the vulnerabilities table, and an example of
    the JSON structure the model is asked to return.

    Args:
        scenario_id (str): The scenario ID to pull the correct description.
        scenarios_df (pd.DataFrame): DataFrame containing scenarios data
            (columns 'Scenario ID' and 'User' are read).
        threats_df (pd.DataFrame): DataFrame containing threat data
            (columns 'THREAT ID', 'THREAT' and 'DESCRIPTION' are read).
        vulnerabilities_df (pd.DataFrame): DataFrame containing vulnerability data
            (columns 'ID', 'VULNERABILITY' and 'DESCRIPTION' are read).

    Returns:
        str: The LLM prompt formatted as a string.

    Raises:
        ValueError: If no row of scenarios_df has the given scenario ID.
    """
    # Get the specific scenario description; when several rows share the ID
    # the first one is used.
    scenario_row = scenarios_df[scenarios_df['Scenario ID'] == scenario_id]
    if scenario_row.empty:
        # Without a description the prompt cannot be built, so fail with a
        # clear message instead of hitting an unbound local further down.
        raise ValueError(f"No scenario found with ID: {scenario_id}")
    scenario_description = scenario_row['User'].iloc[0]

    # Format the Threats section
    threats_text = "\n\n".join(
        f"THREAT ID: {row['THREAT ID']}\n"
        f"THREAT: {row['THREAT']}\n"
        f"DESCRIPTION: {row['DESCRIPTION']}\n"
        "---------------------------------"  # Separator between threats
        for _, row in threats_df.iterrows()
    )

    # Format the Vulnerabilities section
    vulnerabilities_text = "\n\n".join(
        f"VULNERABILITY ID: {row['ID']}\n"
        f"VULNERABILITY: {row['VULNERABILITY']}\n"
        f"DESCRIPTION: {row['DESCRIPTION']}\n"
        "---------------------------------"  # Separator between vulnerabilities
        for _, row in vulnerabilities_df.iterrows()
    )

    # Format the complete LLM prompt. The example response must itself be
    # valid JSON (note the comma after the "Scenario" entry), otherwise the
    # model is being shown a malformed structure to imitate.
    prompt = f"""
    
    ROLE: You are an assistant in security risk analysis. 

    ### **Instructions:**
    1.Read the given scenario carefully.
    2.Identify all potential threats from the Threats List that could apply to the scenario.
    3.For each identified threat, find the most relevant vulnerability from the Vulnerabilities List that matches the threat.
    4.Return the response in JSON format as shown below.
    
    ### **Additional Instructions:**
    - You must never hallucinate
    - You have to always answer in English

    Here is the scenario I want you to estimate. Use this values when creating json
    ScenarioID: {scenario_id}
    Scenario description: "{scenario_description}"

    
    Beginning of list of Threats
    Threats:
    {threats_text}
    End list of Threats

    Beginning of list of Vulnerabilities
    Vulnerabilities:
    {vulnerabilities_text}
    end list of vulnerabilities

    Each item in the list should contain:
    - **ThreatID** / **VulnID**: The identifier for the threat/vulnerability.
    - **Threat** / **Vulnerability**: The name of the threat/vulnerability.
    - **Description**: A detailed explanation of the threat/vulnerability.

    **If no threats or vulnerabilities apply**, respond with an empty array for that category.
    "Generate a JSON object with the following information. Example below"

    In json 
    - ScenarioID is same as given above example s1
    - Scenario is as given above as scenario description

    ### **EXAMPLE Format of the Response:**
        {{
            "ScenarioID": "{scenario_id}",
            "Scenario": "{scenario_description}",
            "Threats": [
                {{
            "ThreatID": "M3",
            "Threat": "Flooding",
            "Description": "Flooding of the rooms where the systems and/or storage media are located.",
            "VulnID": "V27",
            "Vulnerability": "Inadequate flood protection",
            "VulnDescription": "Lack of a specific anti-flooding system to safeguard the system and the data contained within it (e.g., watertight bulkheads)."
        }}
            ]
            
        }}
        """
    
    return prompt


def generate_prompts_for_all_scenarios(scenarios_df, threats_df, vulnerabilities_df):
    """
    Build one LLM prompt per entry of the 'Scenario ID' column.

    Args:
        scenarios_df (pd.DataFrame): Scenarios table; its 'Scenario ID' column
            drives the iteration.
        threats_df (pd.DataFrame): Threats table, passed through unchanged.
        vulnerabilities_df (pd.DataFrame): Vulnerabilities table, passed
            through unchanged.

    Returns:
        list: The prompts, in the same order as the scenario IDs.
    """
    return [
        generate_llm_prompt(current_id, scenarios_df, threats_df, vulnerabilities_df)
        for current_id in scenarios_df['Scenario ID']
    ]


# Example usage:

# Load the CSV files into DataFrames with the correct delimiter (semicolon)
scenarios_df = pd.read_csv(unique_scenarios_file, delimiter=';')
threats_df = pd.read_csv('threat.csv', delimiter=';')
vulnerabilities_df = pd.read_csv('vulnerability.csv', delimiter=';')

# Clean the column names (strip spaces)
scenarios_df.columns = scenarios_df.columns.str.strip()
threats_df.columns = threats_df.columns.str.strip()
vulnerabilities_df.columns = vulnerabilities_df.columns.str.strip()

# Generate prompts for all scenarios
prompts = generate_prompts_for_all_scenarios(scenarios_df, threats_df, vulnerabilities_df)

# Print the first generated prompt as an example. Index 0 is the first prompt;
# the guard avoids an IndexError when the scenarios file has no rows.
if prompts:
    print(prompts[0])


## Generate answers

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import csv

# Upper bound on the number of prompts sent to the model in one run.
answer_limit = 193
# CSV file the generated answers are appended to.
csv_filename = "answers.csv"

# Chat template: the generated scenario prompt is substituted for {question}.
template = """Question: {question}
Answer: Lets think step by step"""
# Show how many prompts are available for answering.
print(len(prompts))
prompt = ChatPromptTemplate.from_template(template)
# Model served through Ollama; NOTE(review): presumably "marco-o1" must already be pulled locally — confirm.
model = OllamaLLM(model="marco-o1")
# Compose template and model so the rendered template is fed to the model.
chain = prompt | model

def generate_answer_for_prompt(prompt, chain):
    """
    Ask the chain a single question and return its reply.

    Args:
        prompt (str): Text passed to the chain as the "question" input.
        chain: Object exposing ``invoke``; called with ``{"question": prompt}``.

    Returns:
        Whatever ``chain.invoke`` returns; the value is also printed.
    """
    reply = chain.invoke({"question": prompt})
    print(reply)
    return reply

def generate_answers_for_all_prompts(prompts, chain, limit, answer_file):
    """
    Generate answers for the first prompts and append each one to a CSV file.

    Args:
        prompts (list): Prompts to answer, processed in order.
        chain: Chain handed to ``generate_answer_for_prompt`` for every prompt.
        limit (int): Maximum number of prompts to process; capped at
            ``len(prompts)``.
        answer_file (str): Path of the CSV file answers are appended to. Each
            answer becomes one single-column row.

    Returns:
        None
    """
    total = min(limit, len(prompts))

    # zip stops at the shorter input, so exactly the first `total` prompts are
    # visited together with their 1-based position.
    for position, question in zip(range(1, total + 1), prompts):
        print(f"Generating {position}/{total}")
        answer = generate_answer_for_prompt(question, chain)

        # The file is opened in append mode and closed again for every answer,
        # so each row is written out before the next generation starts.
        with open(answer_file, mode="a", newline="", encoding="utf-8") as out:
            csv.writer(out).writerow([answer])  # Only the answer is stored

        print(f"Answer saved to {answer_file}")

# Answer up to answer_limit prompts and append the answers to csv_filename.
generate_answers_for_all_prompts(prompts, chain, answer_limit, csv_filename)
