In [1]:
import json
import random
import os

In [4]:
# Sentence templates used to render one course record as prose.
# Placeholders filled via str.format: course_id, course_name, semester,
# teacher_name, course_type, required_prerequisites,
# required_prerequisites_description, learning_outcomes, content.
# {required_prerequisites_description} is always a complete, capitalised
# sentence without a trailing period, so every template must place it at the
# start of a sentence (or after "Note: ") and supply the period itself.
TEMPLATES = [
    'The course with ID {course_id} is titled {course_name}. It is offered in the {semester} semester and taught by {teacher_name}. This is a {course_type} course with {required_prerequisites} as its prerequisite. Students will gain the following outcomes: {learning_outcomes}. The course content covers: {content}.',
    '{course_name} (ID: {course_id}) is a {course_type} subject conducted during the {semester} semester. The course is instructed by {teacher_name}. {required_prerequisites_description}. Upon completion, students will achieve: {learning_outcomes}. The course includes: {content}.',
    'In the {semester} semester, students can take {course_name} (code: {course_id}). It is taught by {teacher_name}, categorized as a {course_type} course. {required_prerequisites_description}. Learning outcomes include: {learning_outcomes}. Topics covered in this course are: {content}.',
    '{course_name} is a {course_type} course (ID: {course_id}) available in the {semester} semester. It is led by {teacher_name}. Prerequisites: {required_prerequisites}. The course aims to deliver the following outcomes: {learning_outcomes}. Main content: {content}.',
    'Course {course_name} (ID: {course_id}) falls under the {course_type} category and is scheduled for the {semester} semester. The instructor is {teacher_name}. Note: {required_prerequisites_description}. Learning outcomes expected: {learning_outcomes}. The course discusses: {content}.'
]

def parse_course_info(info_list):
    """Convert a list of "key: value" strings into a dict of course fields.

    Each item is split on the first ": " only, so values may themselves
    contain ": ". Keys and values are stripped of surrounding whitespace.
    Items that are not strings or that lack the ": " separator are reported
    on stdout and skipped.

    Args:
        info_list: Iterable of strings such as "course name: Algorithms".

    Returns:
        dict mapping each parsed key to its value, plus the derived key
        "required_prerequisites_description": a full sentence (without a
        trailing period) describing the "required prerequisites" field.
    """
    info_dict = {}
    for item in info_list:
        # Non-string entries cannot be searched or split; route them through
        # the same warn-and-skip path as lines missing the separator.
        if isinstance(item, str) and ": " in item:
            key, value = item.split(": ", 1)
            info_dict[key.strip()] = value.strip()
        else:
            print(f"[!] Warning: Skipped malformed line: {item}")

    prereq = info_dict.get("required prerequisites", "none")
    # An empty value means "no prerequisites" too; otherwise the sentence
    # would end in a dangling "prior completion of ".
    if not prereq or prereq.lower() == "none":
        description = "There are no prerequisites for this course"
    else:
        description = f"It requires prior completion of {prereq}"

    info_dict["required_prerequisites_description"] = description
    return info_dict

def generate_course_descriptions(json_data, rng=None):
    """Render each course record as one prose description.

    Args:
        json_data: Mapping of course id -> list of "key: value" strings
            (the list is handed to parse_course_info).
        rng: Optional object exposing ``choice(seq)`` (for example a
            ``random.Random(seed)`` instance) used to pick the template.
            Pass a seeded instance for reproducible output. Defaults to the
            module-level ``random`` generator.

    Returns:
        list of dicts with keys "idx" (1-based position in json_data),
        "title" (the mapping key) and "text" (the rendered description).
    """
    chooser = random if rng is None else rng
    output_list = []
    for idx, (course_id, info_list) in enumerate(json_data.items(), start=1):
        info = parse_course_info(info_list)
        template = chooser.choice(TEMPLATES)
        description = template.format(
            course_id=info.get("course id", course_id),
            course_name=info.get("course name", ""),
            semester=info.get("semester", ""),
            teacher_name=info.get("teacher name", ""),
            course_type=info.get("course type", ""),
            # Fall back to "none" (the same default parse_course_info uses)
            # so a missing or empty field never renders as
            # "with  as its prerequisite" / "Prerequisites: .".
            required_prerequisites=info.get("required prerequisites") or "none",
            required_prerequisites_description=info.get("required_prerequisites_description", ""),
            learning_outcomes=info.get("learning outcomes", ""),
            content=info.get("content", "")
        )
        output_list.append({
            "idx": idx,
            "title": course_id,
            "text": description
        })
    return output_list


In [10]:
if __name__ == "__main__":
    # Convert every "<module>_output.json" in ./outputs into a
    # "<module>_corpus.json" written next to it in the same folder.
    directory = "./outputs"
    input_suffix = "_output.json"
    abs_path = os.path.abspath(directory)

    print(f"Current working directory: {os.getcwd()}")
    print(f"Absolute path being checked: {abs_path}")

    # isdir rather than exists: a plain file named "outputs" would pass an
    # exists() check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(directory):
        print("Folder does not exist!")
    else:
        print("Folder exists!")
        print("Files in folder:")
        # List once so the printed listing and the processed files agree.
        filenames = os.listdir(directory)
        print(filenames)

        for filename in filenames:
            if not filename.endswith(input_suffix):
                continue

            # Drop only the trailing suffix; str.replace would also remove
            # the marker if it appeared earlier in the file name.
            module_name = filename[:-len(input_suffix)]
            input_path = os.path.join(directory, filename)

            with open(input_path, "r", encoding="utf-8") as f:
                course_data = json.load(f)

            output = generate_course_descriptions(course_data)

            output_path = os.path.join(directory, f"{module_name}_corpus.json")
            with open(output_path, "w", encoding="utf-8") as f:
                # ensure_ascii=False keeps non-ASCII course text readable.
                json.dump(output, f, indent=4, ensure_ascii=False)

            print(f"Your {module_name}_corpus.json is done!")

Current working directory: d:\Minh Anh\app\data_processing_pipeline
Absolute path being checked: d:\Minh Anh\app\data_processing_pipeline\outputs
Folder exists!
Files in folder:
['DS_output.json']
Your DS_corpus.json is done!
