# 数据读取

In [2]:
import os

# Directory containing the .txt files
# d = 'cs_professor'
d = 'ece_courses'
directory = f'../data/rag_data/{d}'

# Initialize an empty list to store the content of the files
text_list = []

# Iterate through all files in the specified directory
for filename in os.listdir(directory):
    # Check if the file has a .txt extension
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            # Read the content of the file and append it to the list
            text_list.append(file.read())

# Display the result
print(f"Read {len(text_list)} files.")
print(text_list[:3])  # Display the content of the first 3 files for verification

Read 118 files.
['Course Code: ECE 585\nCourse Name: Signal Detection and Extraction Theory\n\nDescription:\nIntroduction to signal detection and information extraction theory from a statistical decision theory viewpoint. Subject areas covered within the context of a digital environment are decision theory, detection and estimation of known and random signals in noise, estimation of parameters and adaptive recursive digital filtering, and decision processes with finite memory. Applications to problems in communication theory. Prerequisite: Electrical and Computer Engineering 581 or consent of instructor.', 'Course Code: ECE 459\nCourse Name: Introduction to Embedded Systems\n\nDescription:\nAn introduction to hardware/software codesign of embedded computer systems. Structured programming techniques for high and low level programs. Hardware interfacing strategies for sensors, actuators, and displays. Detailed study of Motorola 68HC11 and 68HC12 microcomputers as applied to embedded syst

# ChatGPT 问题生成

In [3]:
# create batch API required upload file

import json

model_name = "gpt-4o-mini"
max_tokens = 2000
url = "/v1/chat/completions"
method = "POST"
system_message = '''
Given the provided text, generate 3-5 question-and-answer pairs. Each question must explicitly mention the course number or professor's name and should resemble questions typically asked by college students. Ensure the questions can be accurately answered using the information in the text. Format the output as follows:
Q: [Question]
A: [Answer]
'''

with open(f"batch_{d}.jsonl", "w") as f:
    i = 0
    for source_text in text_list:
            i+=1
            jsonl_entry = {
                "custom_id": f"request-{i}",
                "method": method,
                "url": url,
                "body": {
                    "model": model_name,
                    "messages": [
                        {"role": "system", "content": system_message},
                        {"role": "user", "content": source_text}
                    ],
                    "max_tokens": max_tokens
                }
            }
            print(len(source_text))
            f.write(json.dumps(jsonl_entry) + "\n")

589
804
884
868
721
499
361
694
74
585
543
767
417
442
382
580
97
592
346
612
606
740
510
344
721
626
733
774
499
533
320
536
755
688
610
946
466
562
529
456
536
167
75
788
665
785
657
425
717
673
705
679
800
862
804
567
587
811
450
691
571
546
702
545
689
692
675
277
463
527
860
778
802
379
780
543
694
700
754
71
82
869
806
576
713
661
225
779
657
785
273
750
754
877
791
807
732
503
652
745
765
551
695
837
823
436
436
429
532
913
793
437
753
696
778
661
459
328


# after getting batch api output, reform the data

In [3]:
import re
import pandas as pd
import json

# Load JSONL files into DataFrames
def load_jsonl(filename):
    with open(filename, 'r') as file:
        data = [json.loads(line) for line in file]
    return pd.DataFrame(data)

def parse_qa(text):
    # Split the input text into lines
    lines = text.split("\n")
    
    # Initialize variables
    qa_pairs = []
    current_question = None
    
    # Iterate through lines
    for line in lines:
        line = line.strip()  # Remove leading and trailing whitespace
        if line.startswith("Q:"):  # Identify questions
            current_question = line[2:].strip()
        elif line.startswith("A:") and current_question:  # Identify answers corresponding to questions
            answer = line[2:].strip()
            qa_pairs.append((current_question, answer))
            current_question = None
    
    return qa_pairs

# Load batch_response and batch_request files
response_df = load_jsonl(f'batch_{d}_output.jsonl')
request_df = load_jsonl(f'batch_{d}.jsonl')

# Perform a merge on the custom_id column
merged_df = pd.merge(request_df, response_df, on='custom_id', how='inner')

# Save the merged DataFrame back to JSONL
merged_df.to_json('temp.jsonl', orient='records', lines=True)

# Open the merged JSONL file and a new output file for writing
with open('temp.jsonl', 'r') as infile, open(f'{d}.jsonl', 'w') as outfile:
    for line in infile:
        # Load each line as a JSON object
        record = json.loads(line)
        
        # Filter the record to keep only specified fields
        qa_string = record['response']['body']['choices'][0]['message']['content']
        qa_list = parse_qa(qa_string)
        for question, answer in qa_list:
            filtered_record = {}
            filtered_record['question'] = question
            filtered_record['answer'] = answer
            filtered_record['raw_info'] = record['body']['messages'][1]['content']

        
            # Write the filtered record to the new JSONL file
            json.dump(filtered_record, outfile)
            outfile.write('\n')

if os.path.exists("temp.jsonl"):
    os.remove("temp.jsonl")
