In [1]:
import os
import openai
import json

In [2]:
# Read the OpenAI API key from the environment so it is never hardcoded in the notebook.
# NOTE: os.getenv returns None when OPENAI_API_KEY is unset, so a missing key is not
# reported by this cell.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
# Input directory: passed to create_ground_truth(), which loads every ".txt" file found here.
# Both paths are relative, so they resolve against the notebook's working directory.
module_dir = '../data/FCRA Course/modules'
# Output directory: the final cell writes ground_truth.json into this folder.
ground_truth_dir = '../data/Ground Truth'

In [4]:
# Function to read the content of all modules
def read_modules(module_dir):
    """Load every ``.txt`` file in ``module_dir`` into a dictionary.

    Args:
        module_dir: Path of the directory to scan (not recursive).

    Returns:
        dict mapping each file name (not the full path) to the file's full
        text, decoded as UTF-8. Entries are inserted in sorted file-name order;
        names that do not end in ``.txt`` are ignored.
    """
    # Decide which files to load first, then read them one by one.
    txt_names = [name for name in sorted(os.listdir(module_dir)) if name.endswith(".txt")]

    module_texts = {}
    for name in txt_names:
        full_path = os.path.join(module_dir, name)
        with open(full_path, 'r', encoding='utf-8') as handle:
            module_texts[name] = handle.read()
    return module_texts

In [5]:
# Function to split long text into chunks (within a safe token limit)
def split_text(text, max_tokens=3000):
    """Split ``text`` into chunks of whole paragraphs that fit a size budget.

    Paragraphs are the pieces between blank lines (``"\\n\\n"``). Size is
    approximated by the whitespace-separated word count, which is only a rough
    stand-in for model tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum approximate size (in words) of one chunk.

    Returns:
        list of str: the chunks, in order. Paragraphs inside a chunk are joined
        with ``"\\n\\n"``. Blank paragraphs are dropped, no chunk is ever empty,
        and empty or whitespace-only input returns ``[]``. A single paragraph
        that is longer than ``max_tokens`` is not split further; it becomes a
        chunk of its own.
    """
    chunks = []
    current_parts = []   # paragraphs collected for the chunk being built
    current_words = 0    # running word count of current_parts (avoids re-splitting)

    for paragraph in text.split("\n\n"):
        # Approximate token count using the number of words
        words_in_paragraph = len(paragraph.split())
        if words_in_paragraph == 0:
            # Blank paragraph: contributes nothing and must not create a chunk.
            continue

        # Only close the current chunk if it actually holds something; this
        # prevents an empty chunk when the very first paragraph is oversized.
        if current_parts and current_words + words_in_paragraph > max_tokens:
            chunks.append("\n\n".join(current_parts))
            current_parts = []
            current_words = 0

        current_parts.append(paragraph)
        current_words += words_in_paragraph

    # Add the last chunk if any content is left
    if current_parts:
        chunks.append("\n\n".join(current_parts))

    return chunks

In [6]:
# Function to summarize a module (handling long text with chunking)
def summarize_module(module_text):
    """Summarize one module's text, chunk by chunk.

    The text is first cut into chunks by ``split_text`` (default limit); each
    chunk is sent in its own chat-completion request to ``gpt-3.5-turbo``.

    Args:
        module_text: Full text of the module.

    Returns:
        str: the stripped per-chunk summaries joined with a blank line, in
        chunk order.

    NOTE(review): ``openai.ChatCompletion`` looks like the pre-1.0 SDK
    interface -- confirm the installed ``openai`` version still provides it.
    """

    def _summarize_chunk(chunk):
        # One request per chunk; only the first choice's text is used.
        prompt = f"Summarize the following module section:\n\n{chunk}\n\nProvide a concise summary."
        conversation = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=conversation,
        )
        first_choice = completion['choices'][0]
        return first_choice['message']['content'].strip()

    # Join all chunk summaries into one final summary
    return "\n\n".join(_summarize_chunk(chunk) for chunk in split_text(module_text))

In [7]:
# Function to generate questions and answers based on module summary
def generate_qa(module_summary):
    """Ask the chat model for three question/answer pairs about a summary.

    Args:
        module_summary: Summary text that the questions should be based on.

    Returns:
        str: the text of the first returned choice, with surrounding
        whitespace stripped. The text is returned as-is; it is not parsed into
        separate questions and answers.

    NOTE(review): ``openai.ChatCompletion`` looks like the pre-1.0 SDK
    interface -- confirm the installed ``openai`` version still provides it.
    """
    prompt = f"Based on the following summary:\n\n{module_summary}\n\nCreate 3 questions with their correct answers."
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

In [8]:
# Main function to create the Ground Truth
def create_ground_truth(module_dir):
    """Build the ground-truth dictionary for every module in ``module_dir``.

    For each module returned by ``read_modules`` the text is summarized with
    ``summarize_module`` and the summary is passed to ``generate_qa``. Progress,
    the summary and the Q&A text are printed as each module is handled.

    Args:
        module_dir: Directory handed to ``read_modules``.

    Returns:
        dict mapping module file name to
        ``{'summary': ..., 'questions_and_answers': ...}``.
    """

    def _process(module_name, module_text):
        # Handle one module: summarize, derive Q&A from the summary, report both.
        print(f"Processing {module_name}...")
        summary = summarize_module(module_text)
        print(f"Summary for {module_name}:\n{summary}\n")

        qa = generate_qa(summary)
        print(f"Questions and Answers for {module_name}:\n{qa}\n")

        return {
            'summary': summary,
            'questions_and_answers': qa
        }

    # Modules are processed in the order read_modules yields them.
    return {
        module_name: _process(module_name, module_text)
        for module_name, module_text in read_modules(module_dir).items()
    }

In [9]:
# Function to save the ground truth to a file
def save_ground_truth(ground_truth, output_file):
    """Write the ground-truth dictionary to ``output_file`` as JSON.

    The parent directory is created when it does not exist yet, so a missing
    output folder does not discard results that were expensive to generate.

    Args:
        ground_truth: JSON-serializable object to store.
        output_file: Destination file path; an existing file is overwritten.

    The file is UTF-8 encoded, indented by 4 spaces, and non-ASCII characters
    are written verbatim (``ensure_ascii=False``).
    """
    parent_dir = os.path.dirname(output_file)
    # dirname is '' for a bare file name; os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(ground_truth, f, ensure_ascii=False, indent=4)

In [10]:
# Run the full pipeline over every module in module_dir.
# This cell issues OpenAI requests (one per text chunk for the summaries, plus one per
# module for the Q&A) and prints each module's summary and Q&A as it goes.
ground_truth = create_ground_truth(module_dir)

Processing Module 01.txt...
Summary for Module 01.txt:
Module 1 of the course covers the history of the Fair Credit Reporting Act (FCRA), which was recognized by Congress in the late 1960s. The FCRA was implemented to address concerns about the growing credit reporting industry while maintaining consumer rights. It establishes permissible purposes for accessing consumer reports and outlines the rights and responsibilities of consumers and consumer reporting agencies. The FCRA provides consumers with rights to access and correct their data held by consumer reporting agencies, imposes obligations on companies taking adverse actions, and requires data furnishers to investigate disputed information.

Questions and Answers for Module 01.txt:
1. Question: What was the main purpose of implementing the Fair Credit Reporting Act (FCRA)?
Answer: The FCRA was implemented to address concerns about the growing credit reporting industry and to uphold consumer rights.

2. Question: What rights does t

In [13]:
# Display the generated dictionary (module name -> summary and Q&A) for a visual check.
ground_truth

{'Module 01.txt': {'summary': 'Module 1 of the course covers the history of the Fair Credit Reporting Act (FCRA), which was recognized by Congress in the late 1960s. The FCRA was implemented to address concerns about the growing credit reporting industry while maintaining consumer rights. It establishes permissible purposes for accessing consumer reports and outlines the rights and responsibilities of consumers and consumer reporting agencies. The FCRA provides consumers with rights to access and correct their data held by consumer reporting agencies, imposes obligations on companies taking adverse actions, and requires data furnishers to investigate disputed information.',
  'questions_and_answers': '1. Question: What was the main purpose of implementing the Fair Credit Reporting Act (FCRA)?\nAnswer: The FCRA was implemented to address concerns about the growing credit reporting industry and to uphold consumer rights.\n\n2. Question: What rights does the FCRA provide to consumers rega

In [15]:
# Build the output path: <ground_truth_dir>/ground_truth.json
output_file_path = os.path.join(ground_truth_dir, 'ground_truth.json')

# Persist the in-memory ground_truth dictionary as JSON via save_ground_truth
# (an existing file at this path is overwritten).
save_ground_truth(ground_truth, output_file_path)