In [143]:
import json
from bs4 import BeautifulSoup

In [144]:

# Read the HTML file
with open('html.html', 'r', encoding='utf-8') as f:
    html_content = f.read()

# Parse HTML
soup = BeautifulSoup(html_content, 'html.parser')

# Initialize the data structure
cutoff_data = []

# Find all college sections
college_sections = soup.find_all('div', class_="table-responsible-md")
college_names = [college.text.strip() for college in soup.find_all('h4')]

In [145]:

for i, section in enumerate(college_sections):
    # Get college name from h4 tag
    college_name = college_names[i]
    
    # Find the table
    table = section.find('table')
    if not table:
        continue
        
    # Get all rows except header
    rows = table.find_all('tr')[1:]  # Skip header row
    
    for row in rows:
        cells = row.find_all('td')
        if len(cells) >= 5:  # Ensure we have all required columns
            first_choiceNS, first_choiceSA = cells[1].text.strip().split('/') if '/' in cells[1].text.strip() else (None, cells[1].text.strip())
            program = {
                "program": cells[0].text.strip(),
                "college": college_name,
                "cutoff point": {
                    "firstChoiceSubjectArea": first_choiceSA,
                    "firstChoiceNonSubject": first_choiceNS,
                    "fullFeePaying": cells[2].text.strip() if cells[2].text.strip() != '-' else None,
                    "secondChoice": cells[3].text.strip() if cells[3].text.strip() != '-' else None,
                    "subjectRequirements": cells[4].text.strip() if cells[4].text.strip() != '-' else None
                    }
            }
            cutoff_data.append(program)

In [146]:
cutoff_data

[{'program': 'BSc. Agricultural Engineering',
  'college': 'College of Basic & Applied Sciences',
  'cutoff point': {'firstChoiceSubjectArea': '17',
   'firstChoiceNonSubject': None,
   'fullFeePaying': '19/20',
   'secondChoice': None,
   'subjectRequirements': 'B3 in Elective Maths'}},
 {'program': 'BSc. Biomedical Engineering',
  'college': 'College of Basic & Applied Sciences',
  'cutoff point': {'firstChoiceSubjectArea': '7',
   'firstChoiceNonSubject': None,
   'fullFeePaying': '9',
   'secondChoice': None,
   'subjectRequirements': 'B3 in Elective Maths'}},
 {'program': 'BSc. Computer Engineering',
  'college': 'College of Basic & Applied Sciences',
  'cutoff point': {'firstChoiceSubjectArea': '6',
   'firstChoiceNonSubject': None,
   'fullFeePaying': '7',
   'secondChoice': None,
   'subjectRequirements': 'B3 in Elective Maths'}},
 {'program': 'BSc. Food Process Engineering',
  'college': 'College of Basic & Applied Sciences',
  'cutoff point': {'firstChoiceSubjectArea': '12',


In [147]:
# Save to JSON file
with open("ug_cutoffPointsnew.json", 'w', encoding='utf-8') as f:
    json.dump(cutoff_data, f, indent=2)

In [148]:
with open('updated_ug.json', 'r') as f:
    existing_data = json.load(f)

existing_data

[{'schoolname': 'University of Ghana'},
 [{'program': 'BSc. Actuarial Science',
   'filteringsubject': ['Social Studies'],
   'alternatecore': ['Integrated Science'],
   'cutoff point': {'firstChoice': '10/11',
    'fullFeePaying': '14',
    'secondChoice': '-',
    'subjectRequirements': 'B3 in Elective Mathematics'},
   'campus': 'Main Campus (Legon)',
   'college': 'College of Basic & Applied Sciences',
   'core subjects': ['English Language',
    'Core Mathematics',
    'Integrated Science',
    'Social Studies'],
   'elective subjects': {'main': ['Elective Mathematics'],
    'tracks': {'Agric': ['General Agriculture',
      'Animal Husbandry',
      'Agricultural Economics',
      'Crop Science',
      'Chemistry',
      'Physics'],
     'GArts': ['Economics',
      'Geography',
      'Government',
      'History',
      'Christian Religious Studies',
      'Music',
      'Elective ICT',
      'Literature in English',
      'French',
      'Fante',
      'Akuapem Twi',
      'Asan

In [149]:
# update the programs with cutoff points, college
def format_names(full_program):
    program_name = full_program.replace("BSc. ", "").replace("B.Sc. ", '').replace("BA ", "").replace("BFA ", "").replace("BSc ", "").replace("BA ", "").replace("Bachelor of ", "").replace("Bachelor ", "").replace("Bachelor of Education in Science", "").strip()
    return program_name

for cutoff in cutoff_data:
    program_name = format_names(cutoff["program"])
    cutoff["program"] = program_name

In [150]:
# update cutoff points from list of programs dict in ug_cutoffPoints.json
matched_programs = []
for program in existing_data[1]:
    for cutoff in cutoff_data:
        if (cutoff["program"].lower() in program["program"].lower()) and cutoff['program'] not in matched_programs:
            program["cutoff point"] = cutoff["cutoff point"]
            matched_programs.append(cutoff['program'])
            print(f"Updated program: {program['program']} with cutoff: {cutoff['program']}")
            break
    else:
        print(f"No cutoff points found: PROGRAM: {program['program']} CUTOFF: {cutoff['program']}")

Updated program: BSc. Actuarial Science with cutoff: Actuarial Science
Updated program: BSc. Computer Science with cutoff: Computer Science
Updated program: Doctor of Veterinary Medicine with cutoff: Doctor of Veterinary Medicine
Updated program: BSc. Materials Science & Engineering with cutoff: Materials Science & Engineering
Updated program: BSc. Food Process Engineering with cutoff: Food Process Engineering
Updated program: BSc. Computer Engineering with cutoff: Computer Engineering
Updated program: BSc. Biomedical Engineering with cutoff: Biomedical Engineering
Updated program: BSc. Agricultural Engineering with cutoff: Agricultural Engineering
No cutoff points found: PROGRAM: Bachelor of Science in Family & Consumer Sciences (Family and Child Studies Option) CUTOFF: Administration
No cutoff points found: PROGRAM: Bachelor of Science in Family & Consumer Sciences (Food and Clothing Option) CUTOFF: Administration
Updated program: Bachelor of Science in Agriculture with cutoff: Agric

In [151]:
matched_programs

['Actuarial Science',
 'Computer Science',
 'Doctor of Veterinary Medicine',
 'Materials Science & Engineering',
 'Food Process Engineering',
 'Computer Engineering',
 'Biomedical Engineering',
 'Agricultural Engineering',
 'Agriculture',
 'Psychology',
 'Biological Sciences',
 'Information Technology',
 'Earth Sciences',
 'Mathematical Sciences',
 'Physical Sciences',
 'Public Health',
 'Midwifery',
 'Occupational Therapy',
 'Dietetics',
 'Physiotherapy',
 'Respiratory Therapy',
 'Nursing',
 'Doctor of Pharmacy',
 'Dental Surgery',
 'Arts',
 'Laws',
 'Administration']

In [152]:
existing_data

[{'schoolname': 'University of Ghana'},
 [{'program': 'BSc. Actuarial Science',
   'filteringsubject': ['Social Studies'],
   'alternatecore': ['Integrated Science'],
   'cutoff point': {'firstChoiceSubjectArea': '11',
    'firstChoiceNonSubject': '10',
    'fullFeePaying': '14',
    'secondChoice': None,
    'subjectRequirements': 'B3 in Elective Mathematics'},
   'campus': 'Main Campus (Legon)',
   'college': 'College of Basic & Applied Sciences',
   'core subjects': ['English Language',
    'Core Mathematics',
    'Integrated Science',
    'Social Studies'],
   'elective subjects': {'main': ['Elective Mathematics'],
    'tracks': {'Agric': ['General Agriculture',
      'Animal Husbandry',
      'Agricultural Economics',
      'Crop Science',
      'Chemistry',
      'Physics'],
     'GArts': ['Economics',
      'Geography',
      'Government',
      'History',
      'Christian Religious Studies',
      'Music',
      'Elective ICT',
      'Literature in English',
      'French',
   

In [None]:
# Save to JSON file
with open("updated_ug_new1.json", 'w', encoding='utf-8') as f:
    json.dump(existing_data, f, indent=2)