<a href="https://colab.research.google.com/github/tubagokhan/ADGM/blob/main/ObligationSentenceProcessor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import random
from openai import AzureOpenAI

# Publish the Azure-specific API settings through the process environment.
# Both keys are overwritten unconditionally, so any value already present in
# the environment for them is replaced.
os.environ.update(
    {
        "OPENAI_API_TYPE": "azure",
        "OPENAI_API_VERSION": "2024-02-15-preview",
    }
)

# Shared Azure OpenAI client used by every request in this notebook.
# The endpoint and key are read from AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY;
# a variable that is not set is passed through as None.
client = AzureOpenAI(
    api_version=os.environ.get("OPENAI_API_VERSION"),
    api_key=os.environ.get("AZURE_OPENAI_KEY"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
)

def verify_label(sentence, label):
    """Ask the model whether ``sentence`` is an instance of ``label``.

    A single yes/no question is sent through the module-level ``client`` and
    the reply is interpreted leniently, so that answers such as ``"Yes."`` or
    ``"yes, it does"`` count as affirmative.

    Args:
        sentence: The sentence text to check.
        label: Category name inserted into the question (the rest of this
            file uses ``"obligation"`` and ``"non-obligation"``).

    Returns:
        True when the first word of the reply is "yes" (case-insensitive,
        surrounding punctuation ignored). False for any other reply, for an
        empty reply, and when the request raises.
    """
    prompt = f"Does the following sentence represent an {label}? '{sentence}' Provide a simple 'yes' or 'no'."
    try:
        # The question travels as the system message; the user turn is empty.
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": ""}
            ],
            model="gpt-4-turbo-1106",
            temperature=0.0,
            max_tokens=8,
            top_p=1.0,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n"]
        )
        # The message content can be missing; treat that as an empty reply.
        reply = response.choices[0].message.content or ""
    except Exception as e:
        # Best-effort boundary around the remote call: report and answer "no".
        print(f"Error verifying sentence label: {e}")
        return False

    words = reply.strip().lower().split()
    if not words:
        return False
    # Compare only the first word, without trailing/leading punctuation or
    # quotes, instead of requiring the whole reply to be exactly "yes".
    return words[0].strip(".,;:!?\"'") == "yes"

def generate_sentences(text, label, iterations):
    """Generate labelled sentences from a document by prompting the model.

    The same prompt is sent ``iterations`` times through the module-level
    ``client``; every non-blank line of each reply becomes one record.

    Args:
        text: Document content embedded in the prompt.
        label: Kind of sentence to request. ``"obligation"`` asks the model
            to focus on sections that impose obligations; any other value
            (this file uses ``"non-obligation"``) keeps the guideline-only
            focus.
        iterations: Number of independent requests to make.

    Returns:
        A list of ``{"Text": <line>, "Obligation": <bool>}`` dicts, where
        ``Obligation`` is True exactly when ``label == 'obligation'``.
        Requests that raise are reported and contribute no records.
    """
    is_obligation = label == 'obligation'

    # The context must agree with the requested label; asking for obligation
    # sentences while pointing at non-obligation sections is contradictory.
    if is_obligation:
        focus = "Identify sections that impose obligations."
    else:
        focus = "Identify sections that specify guidelines but do not impose obligations."

    # The prompt does not depend on the iteration, so it is built once.
    prompt = f"""
        Context: Review the document thoroughly. {focus}

        Task: Generate {label} sentences based on the document content.

        Content: {text}
        """

    all_sentences = []
    for i in range(iterations):
        print(f"Generating {label} sentences, iteration {i + 1}...")
        try:
            response = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": ""}
                ],
                model="gpt-4-turbo-1106",
                temperature=0.7,
                max_tokens=1024,
                top_p=0.95,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None
            )
            # The message content can be missing; treat that as no output.
            raw = response.choices[0].message.content or ""
        except Exception as e:
            # Best-effort: one failed request must not abort the other rounds.
            print(f"Error generating sentences: {e}")
            continue

        # NOTE: the label comes from the request itself; no per-sentence
        # verification call is made here, despite the wording of the message
        # printed below.
        batch = [
            {"Text": line.strip(), "Obligation": is_obligation}
            for line in raw.strip().split("\n")
            if line.strip()
        ]
        all_sentences.extend(batch)
        print(f"Added {len(batch)} verified {label} sentences.")

    return all_sentences

def process_file(file_path):
    """Build the per-document sentence file for one text document.

    The file at ``file_path`` is read as UTF-8, five rounds of "obligation"
    sentences and then five rounds of "non-obligation" sentences are
    requested from ``generate_sentences``, and the combined records are
    written as indented JSON to ``<file_path without extension>_sentences.json``.

    Args:
        file_path: Path of the text document to process.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        document = source.read()

    # Obligations first, then non-obligations, so the output keeps that order.
    records = []
    for label in ("obligation", "non-obligation"):
        records.extend(generate_sentences(document, label, 5))

    stem, _extension = os.path.splitext(file_path)
    json_file_path = stem + "_sentences.json"

    with open(json_file_path, 'w', encoding='utf-8') as sink:
        json.dump(records, sink, indent=4)
    print(f"Sentences saved successfully to {json_file_path}")

def combine_and_shuffle_json_files(directory_path, output_file):
    """Merge every ``*_sentences.json`` file in a directory into one shuffled file.

    Each matching file is expected to hold a JSON list of records with an
    ``"Obligation"`` key. The records are concatenated, shuffled in place and
    written as indented JSON to ``output_file`` inside ``directory_path``.
    A summary (total, obligation and non-obligation counts) is printed.

    Args:
        directory_path: Directory that is scanned (non-recursively) and that
            receives the combined file.
        output_file: File name of the combined dataset, relative to
            ``directory_path``.

    Returns:
        None. The result is written to disk.
    """
    output_path = os.path.join(directory_path, output_file)
    # Normalised once so the self-check below works for relative paths too.
    resolved_output = os.path.normcase(os.path.abspath(output_path))

    all_data = []
    for filename in os.listdir(directory_path):
        if not filename.endswith("_sentences.json"):
            continue
        file_path = os.path.join(directory_path, filename)
        if os.path.normcase(os.path.abspath(file_path)) == resolved_output:
            # An output name that itself ends in "_sentences.json" would be
            # read back on a re-run and duplicate the dataset; skip it.
            continue
        print(f"Reading data from {file_path}...")
        with open(file_path, 'r', encoding='utf-8') as file:
            all_data.extend(json.load(file))

    # Every record is either an obligation or not, so one pass is enough.
    obligation_count = sum(1 for item in all_data if item["Obligation"])
    non_obligation_count = len(all_data) - obligation_count

    print("Shuffling data...")
    random.shuffle(all_data)

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(all_data, file, indent=4)

    print(f"Total items: {len(all_data)}")
    print(f"Obligation items: {obligation_count}")
    print(f"Non-obligation items: {non_obligation_count}")
    print(f"Shuffled data saved to {output_path}")

# Batch driver: create a sentence file for every .txt document found directly
# inside the target directory. The path below is a placeholder and has to be
# replaced with a real directory before running.
directory_path = "your_directory_here"
for filename in os.listdir(directory_path):
    if not filename.endswith(".txt"):
        continue
    process_file(os.path.join(directory_path, filename))

# Merge the per-document JSON files into a single shuffled dataset that is
# written into the same directory.
output_file = "ObligationNonObligationSentenceDataset.json"
combine_and_shuffle_json_files(directory_path, output_file)
