# Testing DeepSeek-R1 on UK/CA/AU Statements with context

In [None]:
import re
import string
import time

import pandas as pd
import requests
import torch
import torchmetrics
from tqdm import tqdm

In [None]:
# Load the annotated sentences that will be given surrounding context below.
csv_path = "data/contextualized_data/AU_context0.csv"
df = pd.read_csv(csv_path)

# Harmonise the column names with the AU naming scheme. The pairs are applied
# in order; a rename is skipped when the source column is absent or when the
# destination column already exists, so an earlier pair wins over a later one.
COLUMN_ALIASES = (
    ("sentence_statement_id", "statement_id"),
    ("sentence_orig_idxs", "sentence_id"),
    ("sentence_orig_text", "sentence_text"),
    ("sentence", "sentence_text"),
)
for legacy_name, canonical_name in COLUMN_ALIASES:
    if legacy_name in df.columns and canonical_name not in df.columns:
        df.rename(columns={legacy_name: canonical_name}, inplace=True)

In [None]:
# Remove the 'SID_' prefix from the 'statement_id' column.
# The column is a pandas Series, so testing it with isinstance(..., str) is
# always False and would silently leave the prefix in place. Instead, strip
# the prefix element-wise and leave every non-string value (ints, NaN) as is.
def _strip_sid_prefix(value):
    """Return `value` without any 'SID_' substring; non-strings pass through."""
    return value.replace("SID_", "") if isinstance(value, str) else value


df["statement_id"] = df["statement_id"].map(_strip_sid_prefix)

# Display the first few rows to confirm the change
print(df.head())

# Number of words of context to take on each side of the target sentence.
CONTEXT_SIZE = 50

# Group by statement so that context never crosses a statement boundary.
grouped = df.groupby("statement_id", group_keys=False)

all_outputs = []

for sentence_statement_id, subdf in grouped:
    # Put the sentences of this statement in reading order.
    subdf_sorted = subdf.sort_values("sentence_id").reset_index(drop=True)

    # 1. Flatten the statement into one word list and remember, for each
    #    sentence, the half-open [start, end) word span it occupies.
    all_words = []
    offsets = []
    for text in subdf_sorted["sentence_text"]:
        words = text.split()
        start_offset = len(all_words)
        offsets.append((start_offset, start_offset + len(words)))
        all_words.extend(words)

    # 2. For each sentence take its own words plus up to CONTEXT_SIZE words
    #    on each side, clipped to the statement boundaries.
    contexts = []
    for start_off, end_off in offsets:
        context_start = max(0, start_off - CONTEXT_SIZE)
        context_end = min(len(all_words), end_off + CONTEXT_SIZE)
        contexts.append(" ".join(all_words[context_start:context_end]))

    # One column assignment instead of a row-by-row .loc write.
    subdf_sorted["text_with_context"] = contexts

    all_outputs.append(subdf_sorted)

# 3. Concatenate all statement sub-dataframes into one final DF
df_with_context = pd.concat(all_outputs, ignore_index=True)

# Persist the result; a later cell reloads this file as its input.
df_with_context.to_csv("output_with_context.csv", index=False)

In [None]:
"""Need to launch the deepseek server before running this script
Follow the instructions on unsloth deepseek documentation for setup and download of the models
https://huggingface.co/unsloth/DeepSeek-R1-GGUF
Then, run the following command on the same machine as you are running this script

"podman run -v /network/projects/amlrt/qut01-aims/HF_HOME/DeepSeek-R1-GGUF:/models \
--rm --device nvidia.com/gpu=all \
-p 8080:8080 \
ghcr.io/ggerganov/llama.cpp:full-cuda \
--server -m /models/DeepSeek-R1-UD-Q2_K_XL/DeepSeek-R1-UD-Q2_K_XL-00001-of-00005.gguf \
--ctx-size 8192 --temp 0.6 --threads 16 --n-gpu-layers 62 --cache-type-k q4_0 \
--prio 2 --host 0.0.0.0 --seed 3407"
"""

In [None]:
# Chat-completions endpoint that run_deepseek posts its requests to.
LLAMA_SERVER_URL = "http://localhost:8080/v1/chat/completions"

# Translation table that deletes every ASCII punctuation character.
punct_remover = str.maketrans("", "", string.punctuation)

# "no" / "yes" followed by exactly one non-alphanumeric character.
no_regex = re.compile("no[^0-9a-zA-Z]")
yes_regex = re.compile("yes[^0-9a-zA-Z]")

# NOTE(review): model_path is not referenced anywhere else in the visible code;
# presumably kept as a record of which model file the server was started with.
model_path = "/models/DeepSeek-R1-UD-Q2_K_XL/DeepSeek-R1-UD-Q2_K_XL-00001-of-00005.gguf"
# Prompt template with two positional `{}` placeholders: first the target
# sentence alone, then the block of text that contains it.
# NOTE(review): the literal opens with FOUR double quotes, so the prompt text
# itself starts with a stray `"` character — confirm whether that is intended.
# NOTE(review): the template embeds <｜User｜>/<｜Assistant｜> markers, yet it is
# sent as the content of a role="user" chat message — confirm the server does
# not wrap it in its own chat template a second time.
prompt = """"<｜User｜>You are an analyst that inspects modern slavery declarations made by Australian reporting
entities. You are specialized in the analysis of statements made with respect to the Australian Modern Slavery Act of 2018, and not of any other legislation. You are currently looking for sentences in statements that describe whether the statement is approved by the principal governing body of one or more reporting entities. This governing body holds the primary responsibility for governance of the reporting entity. The Act explicitly prohibits the delegation of this approval authority to individuals, executive committees, sub-committees, or workgroups. It is crucial for entities to clearly identify that the approval has come directly from this sole governing body without ambiguity. Terms like “Executive Leadership Committee,” “Members of the Board,” or “Senior Executive on behalf of the Board” do not sufficiently demonstrate that the principal governing body itself has approved the modern slavery statement. Additionally, descriptions such as "considered by the board" or "put forward to the board" are also inadequate to fulfill the requirement of direct approval by the governing body. If there is only a single reporting entity, the approval must come from its sole principal governing body.
Otherwise, for joint statements made by multiple reporting entities, there are three options for the approval of a statements:
The principal governing body of each reporting entity covered by the statement approves the statement. The principal governing body of a higher entity (such as a global parent), which is in a position to influence or control each entity covered by the statement, approves the statement. The higher entity does not have to be a reporting entity itself. If it is not practicable to comply with other options, the principal governing body of at least one reporting entity covered by the statement may approve the statement. In this case, the statement must also explain why this option was taken. We therefore consider any sentence containing language that provides such approval as relevant. Given the above definitions of what constitutes a relevant sentence, you will need to determine if a target sentence is relevant or not inside a larger block of text. The target sentence will first be provided by itself so you can know which sentence we want to classify. It will then be provided again as part of the larger block of text it originally came from (extracted from a PDF file) so you can analyze it with more context. While some of the surrounding sentences may be relevant according to the earlier definitions, we are only interested in classifying the target sentence according to the relevance of its own content. You must avoid labeling sentences with only vague descriptions or corporate talk (and no actual information) as relevant.
The answer you provide regarding whether the sentence is relevant or not can only be 'YES' or 'NO', and nothing else. The target sentence to classify is the following:
------------
{}
------------
The same target sentence inside its original block of text:
------------
{}
------------
Question: Is the target sentence relevant? (YES/NO)<｜Assistant｜>"""


# How many trailing characters of the model output are searched for a
# stand-alone YES/NO when the output does not end exactly on one.
_ANSWER_TAIL_CHARS = 20
_ANSWER_NOT_FOUND = "Final Answer not found"


def run_deepseek(target_sentence, text, debug=False):
    """Ask the DeepSeek server to classify one sentence and parse its verdict.

    The module-level `prompt` template is filled with the target sentence and
    its surrounding text, posted to `LLAMA_SERVER_URL`, and the model output
    is scanned for a final YES/NO.

    Args:
        target_sentence: The sentence to classify.
        text: The block of text that contains the target sentence.
        debug: When true, print the prompt, the raw output and the result.

    Returns:
        A tuple `(final_result, result, normalized_answer)` where
        `final_result` is 1 for YES, 0 for NO and -1 when no verdict could be
        extracted (including when the request failed); `result` is the raw
        model output, or None when the request failed or the response was
        malformed; `normalized_answer` is the extracted answer lower-cased
        with punctuation removed.
    """
    current_prompt = prompt.format(target_sentence, text)
    max_tokens = 1000
    temperature = 0.6
    payload = {
        "messages": [{"role": "user", "content": current_prompt}],
        "n_predict": max_tokens,
        "temperature": temperature,
    }

    result = None
    try:
        response = requests.post(LLAMA_SERVER_URL, json=payload)
        response.raise_for_status()
        result = response.json()["choices"][0]["message"]["content"]
    except requests.exceptions.RequestException as e:
        print(f"Error querying Llama: {e}")
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # The server answered, but not with the expected JSON layout.
        print(f"Unexpected response format from Llama: {e!r}")

    if not isinstance(result, str):
        # A failed request or missing content must not reach the regex
        # searches below, which would raise TypeError on a non-string.
        result = None

    final_answer = _ANSWER_NOT_FOUND
    if result is not None:
        # Preferred case: the output ends on a stand-alone YES/NO.
        answer_match = re.search(r"\b(YES|NO)\s*$", result, re.IGNORECASE)

        if answer_match is None:
            # Fallback: take the last stand-alone yes/no that starts within
            # the final characters of the output (e.g. "**NO**." or "NO.").
            # The whole output is scanned so that a word cut in half by a
            # slice can never be mistaken for a verdict.
            tail_start = max(0, len(result) - _ANSWER_TAIL_CHARS)
            for candidate in re.finditer(r"\b(yes|no)\b", result, re.IGNORECASE):
                if candidate.start() >= tail_start:
                    answer_match = candidate

        if answer_match is not None:
            final_answer = answer_match.group(1).strip()

    # Normalize the answer
    normalized_answer = final_answer.lower().translate(punct_remover).strip()
    if normalized_answer == "no" or no_regex.match(normalized_answer):
        final_result = 0
    elif normalized_answer == "yes" or yes_regex.match(normalized_answer):
        final_result = 1
    else:
        final_result = -1
        if debug:
            print(f"GPT result '{result}' could not be processed (normalized: '{normalized_answer}')")

    if debug:
        print("Prompt: ", current_prompt)
        print("XXXXXXXXXXXXXXXXXXXXXXXXX")
        print("This is the model output:\n", result)
        print("XXXXXXXXXXXXXXXXXXXXXXXXX")
        print("Final result: ", final_result)
        print("\n\n")

    return final_result, result, normalized_answer


# Show the raw prompt template (placeholders still unfilled) for a visual check.
print(prompt)

In [None]:
# Smoke test: send one hand-picked sentence and its surrounding text to the
# server with debug output enabled, then print the returned
# (final_result, raw model output, normalized answer) tuple.
# Requires the server from the launch instructions to be running.
target_sentence = "Baby Bunting Modern Slavery Statement 2020 This statement, pursuant to the Modern Slavery Act 2018 (Cth), describes the risks of modern slavery in the operations and supply chains of Baby Bunting1 and includes information about actions taken to address those risks for the financial year ended 28 June 2020"
text = "Baby Bunting Modern Slavery Statement 2020 This statement, pursuant to the Modern Slavery Act 2018 (Cth), describes the risks of modern slavery in the operations and supply chains of Baby Bunting1 and includes information about actions taken to address those risks for the financial year ended 28 June 2020 . Modern slavery includes trafficking in persons, slavery, servitude, forced marriage, forced labour, debt bondage, deceptive recruitment for labour or services; and the worst forms of child labour. Modern slavery has severe consequences for its victims and often disproportionately impacts women and girls. Minimising the risk of modern slavery in its supply chains and, in particular, ensuring that women and girls can exercise their own choices free from the undue influence that arises in modern slavery is a critical focus for Baby Bunting. This is Baby Buntings first modern slavery statement. During the financial year, Baby Bunting has introduced a"

cp = run_deepseek(target_sentence, text, debug=True)
print(cp)

In [None]:
# Load the dataset written by the context-building cell. Note that this
# rebinds `df` (and `csv_path`) to the contextualized data.
csv_path = "output_with_context.csv"
df = pd.read_csv(csv_path)
print(df.head())

In [None]:
# Label names, in the order used by the per-sentence target lists.
class_names = [
    "approval",
    "signature",
    "criterion1",
    "criterion2_structure",
    "criterion2_operations",
    "criterion2_supplychains",
    "criterion3_risks",
    "criterion4_mitigation",
    "criterion4_remediation",
    "criterion5_assessment",
    "criterion6_consultation",
]

# Pick the class to evaluate by its position in class_names (edit by hand).
class_of_interest_index = 0
class_of_interest = class_names[class_of_interest_index]

# Raw 'targets' column as a plain Python list.
targets = df["targets"].to_list()

# Report the selection.
print(
    f"Class names: {class_names}",
    f"Class index: {class_of_interest_index}",
    f"Class name for index {class_of_interest_index} is {class_of_interest}",
    sep="\n",
)

In [None]:
# Run the classifier over every sentence, reusing cached results when present,
# checkpointing periodically, and always saving whatever was collected.
import ast
import os

# Counters and storage
count_sentence_was_annotated_with_class_of_interest = 0
count_sentence_was_not_annotated_with_class_of_interest = 0
data = []
already_there = 0
annotated = 0
not_ok_results = 0
not_ok_details = []
tot_count = 0

# Parameters
max_amount = 9999  # Limit the computation (-1 disables the limit)
print_count = True
inter_query_wait_time_in_sec = 2  # Time to wait between queries
debug = False
checkpoint_every = 25  # Write the results file after this many new annotations

# The results file doubles as the cache that is read back on the next run.
result_path = f"result_for_class_index_test{class_of_interest_index}.csv"
result_columns = [
    "sentence_statement_id",
    "sentence_orig_idxs",
    "target_classes",
    "predicted_classes",
    "reasoning",
    "answer",
]

# Load cached results if available
prev_df = pd.read_csv(result_path) if os.path.isfile(result_path) else None
predict_fun = run_deepseek  # Replace with the appropriate prediction function

try:
    for i, row in tqdm(df.iterrows(), total=len(df)):
        tot_count += 1
        sentence_text = row["sentence_text"]
        sentence_statement_id = int(row["statement_id"])
        sentence_orig_idxs = int(row["sentence_id"])
        text_with_context = row["text_with_context"]
        newly_annotated = False

        # Look the sentence up in the cache (the mask is computed only once).
        cached_rows = None
        if prev_df is not None:
            cache_mask = (prev_df["sentence_statement_id"] == sentence_statement_id) & (
                prev_df["sentence_orig_idxs"] == sentence_orig_idxs
            )
            cached_rows = prev_df[cache_mask]

        if cached_rows is not None and len(cached_rows) > 0:
            if len(cached_rows) != 1:
                raise ValueError(f"Multiple cache rows found for: {sentence_statement_id}:{sentence_orig_idxs}")

            # Reuse the cached result as stored (list columns stay as strings).
            cached_row = cached_rows.iloc[0]
            data.append(
                [
                    sentence_statement_id,
                    sentence_orig_idxs,
                    cached_row.get("target_classes", []),
                    cached_row.get("predicted_classes", []),
                    cached_row.get("reasoning", ""),
                    cached_row.get("answer", ""),
                ]
            )
            already_there += 1

        else:
            # Parse the stored list literal. literal_eval only accepts Python
            # literals, so file content can never be executed as code.
            target_classes = [int(x) for x in ast.literal_eval(row["targets"])]
            predicted_classes = [-1] * len(target_classes)  # -1 = no prediction
            reasoning = ""
            answer = ""

            # Only query the model when the target for this class is > -1.
            if target_classes[class_of_interest_index] > -1:
                predicted_class, reasoning, answer = predict_fun(sentence_text, text_with_context, debug=debug)
                time.sleep(inter_query_wait_time_in_sec)

                predicted_classes[class_of_interest_index] = predicted_class
                count_sentence_was_annotated_with_class_of_interest += 1

                # Keep track of outputs that could not be parsed into YES/NO.
                if predicted_class == -1:
                    not_ok_results += 1
                    not_ok_details.append([sentence_statement_id, sentence_orig_idxs, reasoning, answer])

            else:
                count_sentence_was_not_annotated_with_class_of_interest += 1

            data.append(
                [sentence_statement_id, sentence_orig_idxs, target_classes, predicted_classes, reasoning, answer]
            )
            annotated += 1
            newly_annotated = True

        # Stop processing if the max limit is reached
        if max_amount > -1 and tot_count >= max_amount:
            print(f"Reached the max amount of {max_amount}")
            break

        # Print progress
        if print_count:
            print(f"Processed {tot_count} / {max_amount}")

        # Checkpoint only right after a new annotation pushed the counter to a
        # multiple of checkpoint_every. Without the newly_annotated guard the
        # file would be rewritten on every cached row, because the counter
        # stays at 0 (or at its last multiple) while the cache is replayed.
        if newly_annotated and annotated % checkpoint_every == 0:
            print(f"Annotated {annotated} sentences")
            result_df = pd.DataFrame(data, columns=result_columns)
            result_df.to_csv(result_path, index=False)

finally:
    # Save results to CSV even when the loop was interrupted or raised.
    result_df = pd.DataFrame(data, columns=result_columns)
    result_df.to_csv(result_path, index=False)

    # Print summary
    print(
        f"{already_there} sentences retrieved from cache. {annotated} sentences were newly annotated.\n"
        f"Of these, {count_sentence_was_annotated_with_class_of_interest} were annotated with the class of interest, "
        f"and {count_sentence_was_not_annotated_with_class_of_interest} were not.\n"
        f"Total processed: {count_sentence_was_annotated_with_class_of_interest + count_sentence_was_not_annotated_with_class_of_interest}\n"
        f"Problematic results: {not_ok_results}"
    )

    # Save problematic results
    if not_ok_details:
        not_ok_df = pd.DataFrame(not_ok_details, columns=["statement_id", "sentence_orig_idxs", "reasoning", "answer"])
        not_ok_df.to_csv(f"not_ok_for_class_index_{class_of_interest_index}.csv", index=False)

In [None]:
# Sanity check: how many rows were collected and how many fields the first
# five of them carry.
print(f"Data Length: {len(data)}")
print(f"Example Row Lengths: {list(map(len, data[:5]))}")

In [None]:
# Reload the saved results for the selected class from disk. This rebinds `df`,
# and list-valued columns come back as their string representation.
df = pd.read_csv(f"result_for_class_index_test{class_of_interest_index}.csv")
# Number of entries every per-sentence class list is expected to have.
amount_of_classes = 11


def parse_list(list_as_str):
    """Parse a stringified list of ints such as "[0, 1, -1]" into a list.

    Args:
        list_as_str: Text of a flat, comma-separated, bracketed list of
            integers; surrounding whitespace is tolerated.

    Returns:
        The parsed integers, in order.

    Raises:
        ValueError: If an entry is not an integer, or if the number of
            entries differs from `amount_of_classes`. An explicit exception
            is used because an `assert` is stripped when Python runs with -O.
    """
    # Strip whitespace first so that the brackets really are the outermost
    # characters when they get removed.
    stripped = list_as_str.strip().strip("[]")
    lst = [int(x) for x in stripped.split(",")]
    if len(lst) != amount_of_classes:
        raise ValueError(f"expected {amount_of_classes} classes, got {len(lst)}: {list_as_str!r}")
    return lst


# Pull the entry for the class of interest out of every stored class list.
predicted_lists = (parse_list(cell) for cell in df["predicted_classes"])
preds = [labels[class_of_interest_index] for labels in predicted_lists]

target_lists = (parse_list(cell) for cell in df["target_classes"])
targets = [labels[class_of_interest_index] for labels in target_lists]

stat_ids = df["sentence_statement_id"]
sent_ids = df["sentence_orig_idxs"]
assert len(preds) == len(targets) == len(stat_ids) == len(sent_ids)

# torchmetrics rejects a prediction of -1, so negative predictions are
# clamped to 0. The targets keep their -1 entries unchanged.
fixed_preds = [max(x, 0) for x in preds]

In [None]:
# Build the prediction/target tensors once and share them across all metrics.
pred_tensor = torch.tensor(fixed_preds)
target_tensor = torch.tensor(targets)

# Every metric is binary and is configured with ignore_index=-1.
f1_fun = torchmetrics.classification.F1Score(task="binary", ignore_index=-1)
p_fun = torchmetrics.classification.Precision(task="binary", ignore_index=-1)
r_fun = torchmetrics.classification.Recall(task="binary", ignore_index=-1)
acc_fun = torchmetrics.classification.Accuracy(task="binary", ignore_index=-1)

f1 = f1_fun(pred_tensor, target_tensor)
p = p_fun(pred_tensor, target_tensor)
r = r_fun(pred_tensor, target_tensor)
acc = acc_fun(pred_tensor, target_tensor)

In [None]:
# Report the number of examples and the metrics, one line each.
print(
    f"example amount is {len(preds)}",
    f"precision is {p:.3f}, recall is {r:.3f}, f1 is {f1:.3f}",
    f"accuracy is {acc:.3f}",
    sep="\n",
)