In [None]:
import json
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

In [40]:
with open('requirement.txt', 'r') as file:
    site = file.read()

In [41]:
soup = BeautifulSoup(site, 'html.parser')

In [42]:
tab_id = "block-views-block-entry-requirements-block-1"  # WASSCE applicants
    # "block-views-block-entry-requirements-block-2",  # Other applicants
    # "block-views-block-entry-requirements-block-7",


all_programs = []
no_req_programs = []

tabs = soup.find_all(id=tab_id)
for tab in tabs:
    print(f"Processing tab: {tab} \n\n\n\n")
    for i, item in enumerate(tab.find_all(class_=["accordion-item", "views-row"])):
        program = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

        # Extracting program name from content of button
        button = item.find(class_="accordion-button collapsed")
        if not button:
            print(f"Warning: Missing accordion-button for item #{i}")
            continue
        program_name = button.get_text(strip=True)
        print(f"\n>>> Processing Program: {program_name}")
        program["program"] = program_name
        program["filteringsubject"] = "Social Studies"
        program["alternatecore"] = "Integrated Science"
        
        program_reqs = item.find(class_="accordion-body")
        if not program_reqs:
            print(f"No requirements section found for {program_name}")
            no_req_programs.append(program_name)
            continue

        requirements = program_reqs.find_all("p")
        if not requirements:
            print(f"No <p> tags (requirements) found for {program_name}")
            no_req_programs.append(program_name)
            continue

        core = main = tracks = ""
        electives_found = False

        # Combine all <p> tags in the requirement section into a single string (HTML)
        full_html = "".join(str(p) for p in requirements)
        parsed_html = BeautifulSoup(full_html, "html.parser")

        # Find all strong tags and their associated text chunks
        strong_tags = parsed_html.find_all("strong")

        for j, tag in enumerate(strong_tags):
            label = tag.get_text(strip=True).lower()
            value = ""

            # Collect all text until the next <strong> or end
            current = tag.next_sibling
            while current and (not hasattr(current, "name") or current.name != "strong"):
                if hasattr(current, "get_text"):
                    value += current.get_text(" ", strip=True)
                else:
                    value += str(current)
                current = current.next_sibling

            value = BeautifulSoup(value, "html.parser").get_text(" ", strip=True)
            value = value.replace('\u00a0', ' ').strip(": ").strip()


            print(f"  [Label #{j}] {label} => {value}")

            if "core" in label:
                core = value.replace(" &", ",").replace("Credit passes in ", "")
            elif "elective" in label and not electives_found:
                electives_found = True
                electives = value.replace("Credit passes in ", "")
                if " and any " in electives:
                    parts = electives.split(" and any ", 1)
                    main = parts[0].strip()
                    tracks = parts[1].strip() if len(parts) > 1 else ""
                    print(f"    ↪ Main Elective: {main.strip()}")
                    print(f"    ↪ Track Options: {tracks.strip()}")
                else:
                    main = electives.strip()
                    tracks = ""
                    print(f"    ↪ All Electives: {main.strip()}")

                program["core subjects"] = core
                program["elective subjects"]["main"] = [main]
                track_list = []
                if tracks:
                    # crude way to guess it's a list: presence of ' or ', ' and ', or commas
                    if any(x in tracks for x in [",", " and ", " or "]):
                        track_list = [t.strip() for t in tracks.replace(" and ", ",").replace(" or ", ",").split(",") if t.strip()]
                    else:
                        # fallback — just keep it as-is (it's not a subject list)
                        print(f"⚠️  Track text not parsed as subject list: {tracks}")

                program["elective subjects"]["tracks"]["General"] = track_list

                program["special requirements / general information"] = None

                if requirements[1:]:
                    additional_reqs = []
                    for req in requirements[1:]:
                        req_text = req.get_text(strip=True)
                        if req_text:
                            additional_reqs.append(req_text)
                    if additional_reqs:
                        program["special requirements / general information"] = additional_reqs
                        print(f"  ➕ Additional Info: {additional_reqs}")

                # Find tables for concentrations
                tables = item.find_all("table")
                if tables:
                    print(f"  📋 Concentration table(s) found for {program_name}")
                    concentrations = []
                    for table in tables:
                        concentration = {}
                        rows = table.find_all("tr")
                        for row in rows:
                            cols = row.find_all("td")
                            if len(cols) == 2:
                                conc = cols[0].get_text(strip=True).lower()
                                required_subs = cols[1].get_text(strip=True)
                                concentration[conc] = required_subs
                                print(f"    - {conc}: {required_subs}")
                        program["concentrations"] = concentration
                print(f"  ✅ Update: {program}")
        all_programs.append(program)

Processing tab: <div aria-labelledby="tab-one" class="views-element-container tab-pane collapse active show mt-5 block block-views block-views-blockentry-requirements-block-1" id="block-views-block-entry-requirements-block-1" role="tabpanel">
<div><div class="view view-entry-requirements view-id-entry_requirements view-display-id-block_1 js-view-dom-id-c6ca8d10f0522fe2a072e51502b9cd0b8d6b067c3b28ce51d518f0ddc3f92043" data-once="ajax-pager">
<div class="view-header">
<h4>Ghanaian Applicants with WASSCE/SSSCE Qualification</h4>
<br/>
<h5>A. General Entry Requirements into the Full-Time Degree Programmes</h5>
<p>An applicant for admission to an undergraduate degree programme in the University of Ghana must have at least credits (A1 - C6 in WASSCE or A - D in SSSCE) in the following subjects: <br/>
•	The Four Core Subjects: English, Core Mathematics, Integrated Science/Core Science, and Social Studies. <br/>
•	Three elective subjects. <br/> <br/>
</p><h5>B. Calculation of Aggregates for Ad

In [43]:
all_programs

[defaultdict(<function __main__.<lambda>()>,
             {'program': 'BSc. Actuarial Science',
              'filteringsubject': 'Social Studies',
              'alternatecore': 'Integrated Science',
              'core subjects': 'English, Core Mathematics, Integrated Science, Social Studies',
              'elective subjects': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                          {'main': ['Elective Mathematics'],
                           'tracks': defaultdict(dict, {'General': []})}),
              'special requirements / general information': None}),
 defaultdict(<function __main__.<lambda>()>,
             {'program': 'BSc. Computer Science',
              'filteringsubject': 'Social Studies',
              'alternatecore': 'Integrated Science',
              'core subjects': 'English, Core Mathematics, Integrated Science, Social Studies',
              'elective subjects': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
         

In [44]:
with open('ug_programs.json', 'w') as f:
    json.dump(all_programs, f, indent=4)

In [2]:
with open('ug_programs copy 2.json', 'r') as f:
    data = json.load(f)
# load the JSON file and print its content
with open('ug_cutoffPoints.json', 'r') as f:
    cutoffpoints = json.load(f)

# update the programs with cutoff points, college
def format_names(full_program):
    program = full_program["program"].replace("BSc. ", "").replace("B.Sc. ", '').replace("BA ", "").replace("BFA ", "").replace("BSc ", "").replace("BA ", "")
    program = program.replace("Bachelor of ", "").replace("Bachelor ", "").strip()
    return program

for program in data:
    program_name = format_names(program["program"])
    program["program"] = program_name

KeyError: 'program'

In [None]:
# update the programs with cutoff points, college
def format_names(full_program):
    program = full_program["program"].replace("BSc. ", "").replace("B.Sc. ", '').replace("BA ", "").replace("BFA ", "").replace("BSc ", "").replace("BA ", "")
    program = program.replace("Bachelor of ", "").replace("Bachelor ", "").strip()
    return program

for program in data:
    program_name = format_names(program["program"])
    program["program"] = program_name

In [8]:
# update cutoff points from list of programs dict in ug_cutoffPoints.json
for program in data:
    for cutoff in cutoffpoints:
        if program["program"].lower() == cutoff["program"].lower():
            program["cutoff points"] = cutoff["cutoff points"]
            program["college"] = cutoff["college"]
            print(f"Updated {program['program']} with cutoff points: {cutoff['cutoff points']} and college: {cutoff['college']}")
            break
    else:
        print(f"No cutoff points found for {program['program']}")

NameError: name 'data' is not defined