In [1]:
import sys
import os
import re

import bs4
import pymupdf
import matplotlib.pyplot as plt
import requests

from bs4 import Tag
from PIL import Image

sys.path.append("..")  # Add the parent directory to the Python path

In [2]:
# Page that the following cells download and scrape for PDF links and career metadata.
url = "http://www.fcyt.umss.edu.bo/horarios/"

In [3]:
# Download the page and parse it. The timeout keeps the cell from hanging forever on
# an unresponsive server, and raise_for_status() stops on an HTTP error instead of
# silently parsing an error page as if it were the expected document.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, "html.parser")

In [4]:
def get_pdf_link(link: Tag) -> "str | None":
    """Return the ``href`` of ``link`` when it ends with ``.pdf``.

    Args:
        link: An anchor-like object exposing ``get("href")``.

    Returns:
        The ``href`` string if it is present and ends with ``".pdf"``,
        otherwise ``None``.
    """
    href = link.get("href")
    # Return the URL itself (not the boolean from endswith) so callers that keep
    # the result end up with usable links.
    if href and href.endswith(".pdf"):
        return href
    return None


# Collect the href of every anchor that get_pdf_link accepts. The href is read here
# (instead of keeping get_pdf_link's return value) so the list always holds URL strings.
urls = [anchor.get("href") for anchor in soup.find_all("a") if get_pdf_link(anchor)]

In [5]:
# Pull the "<number>-<year>" token out of the "titulo2" paragraph and rewrite it
# with a slash separator (the captured output below shows '1/2025').
semester = (
    re.search(r"\d{1,2}-\d{4}", soup.find("p", {"class": "titulo2"}).get_text())
    .group()
    .replace("-", "/")
)
semester

'1/2025'

In [6]:
from datetime import datetime
from src.core.schemas import Career


def get_careers():
    """Build the list of careers from the parsed page.

    Reads the module-level ``soup`` and ``semester``. Every ``<font size="3">``
    tag with non-empty text is visited in document order:

    * text made only of digits starts a new ``Career`` entry;
    * "<6 digits> <rest>" sets ``code`` and ``name`` on the latest entry;
    * a tag containing an ``<a>`` sets ``url`` from that anchor's ``href``;
    * an "HH:MM DD-MM-YYYY" timestamp sets ``updatet_at`` (ISO format) and
      initialises ``levels`` to an empty list.

    Tags seen before the first entry exists are ignored.

    NOTE(review): the key is spelled ``updatet_at``; this looks like a typo for
    ``updated_at`` -- confirm against the Career schema before renaming.

    Returns:
        The list of ``Career`` entries, in page order.
    """
    found: list[Career] = []

    candidate_tags = (
        tag for tag in soup.find_all("font", {"size": "3"}) if tag.get_text() != ""
    )

    for tag in candidate_tags:
        label: str = tag.get_text(strip=True)

        if label.isdigit():
            # A purely numeric cell marks the start of the next career.
            found.append(
                Career(
                    madeIn="SCESI UMSS",
                    semester=semester,
                    support="@willypaz243, @ceci, @pablox",
                    path="-".join(semester.split("/")[::-1]),
                )
            )
        elif found:
            entry = found[-1]
            # The order of these checks matters: code/name, then link, then timestamp.
            if code_match := re.search(r"(\d{6})\s(.*)", label):
                entry["code"] = int(code_match.group(1))
                entry["name"] = code_match.group(2)
            elif anchor := tag.find("a"):
                entry["url"] = anchor.get("href")
            elif stamp := re.search(r"\d{2}:\d{2}\s\d{2}-\d{2}-\d{4}", label):
                entry["updatet_at"] = datetime.strptime(
                    stamp.group(), "%H:%M %d-%m-%Y"
                ).isoformat()
                entry["levels"] = []

    return found


In [7]:
careers = get_careers()
# Index the scraped careers by their numeric code for direct lookup.
careers_dict = dict((entry["code"], entry) for entry in careers)

In [8]:
import json

In [9]:
# Pretty-print the scraped careers as indented JSON to inspect the result.
print(json.dumps(careers, indent=4))

[
    {
        "madeIn": "SCESI UMSS",
        "semester": "1/2025",
        "support": "@willypaz243, @ceci, @pablox",
        "path": "2025-1",
        "code": 409701,
        "name": "ALIMENTOS",
        "url": "http://sagaa.fcyt.umss.edu.bo/pre_academico/horarios/409701.pdf",
        "updatet_at": "2025-03-04T21:50:00",
        "levels": []
    },
    {
        "madeIn": "SCESI UMSS",
        "semester": "1/2025",
        "support": "@willypaz243, @ceci, @pablox",
        "path": "2025-1",
        "code": 399501,
        "name": "BIOLOGIA",
        "url": "http://sagaa.fcyt.umss.edu.bo/pre_academico/horarios/399501.pdf",
        "updatet_at": "2025-03-04T21:50:00",
        "levels": []
    },
    {
        "madeIn": "SCESI UMSS",
        "semester": "1/2025",
        "support": "@willypaz243, @ceci, @pablox",
        "path": "2025-1",
        "code": 165221,
        "name": "BIOTECNOLOGIA",
        "url": "http://sagaa.fcyt.umss.edu.bo/pre_academico/horarios/165221.pdf",
        "

# text extraction

In [10]:
import io

In [11]:
# Career whose schedule PDF is processed below: code 339701 (Quimica / Chemistry).
career = careers_dict[339701]

In [12]:
# Download the selected career's PDF and open it from memory. The timeout avoids an
# indefinite hang, and raise_for_status() fails here on an HTTP error instead of
# handing an error-page body to pymupdf.
res = requests.get(career["url"], timeout=30)
res.raise_for_status()
filestream = io.BytesIO(res.content)
pdf_doc = pymupdf.open(stream=filestream, filetype="pdf")

In [13]:
def extract_all_text(doc):
    """Concatenate the text of every page in ``doc``.

    Args:
        doc: An iterable of page objects, each exposing ``get_text()``.

    Returns:
        One string holding each page's text followed by a newline, in
        iteration order; an empty string when ``doc`` has no pages.
    """
    return "".join(page.get_text() + "\n" for page in doc)

In [14]:
def extract_position(text):
    """Locate every "letter, whitespace, seven digits" marker in ``text``.

    A marker is one uppercase ASCII letter, one whitespace character (a
    newline counts) and exactly seven digits.

    Args:
        text: The string to scan.

    Returns:
        A ``(positions, occurrences)`` tuple: the start index of each match
        in order of appearance, and the number of matches.
    """
    starts = [found.start() for found in re.finditer(r"[A-Z]\s\d{7}", text)]
    return starts, len(starts)

In [16]:
def clear_text(text: str) -> str:
    """Remove two kinds of repeated spans from the extracted text.

    First, every shortest span that starts at the words "Procesado CPD" and
    ends at an "A<word> <digit>/<4 digits>" token is removed. Then every
    shortest span running from "NIVEL" to "AULA" is removed. Both spans may
    cross line breaks.

    Args:
        text: Raw text to clean.

    Returns:
        ``text`` with all such spans deleted.
    """
    without_stamp = re.sub(
        r"\bProcesado CPD.*?A\w*\s\d/\d{4}", "", text, flags=re.DOTALL
    )
    return re.sub(r"\bNIVEL.*?AULA", "", without_stamp, flags=re.DOTALL)

In [17]:
# Extract the text of every page of the opened PDF, strip the spans that
# clear_text removes, and print the cleaned result for inspection.
raw_text = extract_all_text(pdf_doc)
text = clear_text(raw_text)
print(text)


A
2008237 ALGEBRA LINEAL Y TEORIA MATRICIAL
1
OMONTE OJALVO JOSE ROBERTO
JU
815 -0945
623
1
OMONTE OJALVO JOSE ROBERTO
VI
815 -0945
622
1
[TP] SERRANO MENA DIEGO LEONARDO
SA
815 -0945
691A
2
SOTO MOREIRA JUAN CARLOS
LU
1415-1545
612
2
SOTO MOREIRA JUAN CARLOS
MA
945 -1115
661
2
SOTO MOREIRA JUAN CARLOS
JU
1415-1545
691F
3
SALINAS PERICON WALTER OSCAR 
GONZALO
MA
815 -0945
617B
3
SALINAS PERICON WALTER OSCAR 
GONZALO
JU
1715-1845
612
A
2008054 CALCULO I
14
FIORILO LOZADA AMERICO
LU
645 -0815
607
14
FIORILO LOZADA AMERICO
JU
645 -0815
607
14
FIORILO LOZADA AMERICO
SA
1115-1245
691F
18
GONZALES CASTELLON CARLOS 
ESTEBAN
LU
1415-1545
692D
18
GONZALES CASTELLON CARLOS 
ESTEBAN
MA
1415-1545
692A
18
[TP] HEREDIA CORRALES JOAQUIN MANUEL
VI
1545-1715
617B
22
ZEGARRA DORADO LUIS ROBERTO
LU
1415-1545
691D
22
ZEGARRA DORADO LUIS ROBERTO
MA
1415-1545
692B
A
2006018 FISICA BASICA I
A
VISCARRA VARGAS MARCO ANTONIO
LU
1115-1245
622
A
VISCARRA VARGAS MARCO ANTONIO
MA
945 -1115
691A
A
VISCARRA VARGAS M