From 2092afd522b31287f5e72bec8d4d693cb4cdb723 Mon Sep 17 00:00:00 2001
From: ykerus
Date: Fri, 19 Jan 2024 11:41:15 +0100
Subject: [PATCH] Fixes: language and status

---
Reviewer notes (ignored by patch tools, not part of the commit message):
* course.py: drops the `cursus_type` field from `Course`.
* scraper.py: a table cell that contains <img> tags is now represented by
  the comma-joined `title` attributes of those images instead of the cell
  text; `_rename_table_columns` and its call site are removed;
  `_get_course_status` now returns the register link's text unchanged
  instead of mapping it to "vol" / "gestart" / "open".
* NOTE(review): the line structure and intra-line whitespace of this patch
  were reconstructed from the hunk headers; confirm with
  `git apply --check` before applying.

 src/crea_scraper/course.py  |  1 -
 src/crea_scraper/scraper.py | 27 ++++++++-------------------
 2 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/src/crea_scraper/course.py b/src/crea_scraper/course.py
index d944ce9..d379c5a 100644
--- a/src/crea_scraper/course.py
+++ b/src/crea_scraper/course.py
@@ -37,5 +37,4 @@ class Course(CourseClass):
     cursusnummer: str  # 12345
     docent: str  # e.g. Yke Rusticus
     taal: str  # e.g. 🇳🇱
-    cursus_type: str  # e.g. Fysiek
     status: str  # e.g. open
diff --git a/src/crea_scraper/scraper.py b/src/crea_scraper/scraper.py
index e4322a0..9cf0e1c 100644
--- a/src/crea_scraper/scraper.py
+++ b/src/crea_scraper/scraper.py
@@ -139,8 +139,13 @@ def _get_raw_table_data(table_html, incl_empty_values=True):
     rows = table_html.find_all("tr")
     for row in rows:
         cols = row.find_all("td")
-        cols = [e.text.strip() for e in cols]
-        table_data.append([e for e in cols if e or incl_empty_values])
+        cols_extracted = []
+        for col in cols:
+            if col.find("img"):
+                cols_extracted.append(", ".join([img["title"] for img in col.find_all("img")]))
+            else:
+                cols_extracted.append(col.text.strip())
+        table_data.append([e for e in cols_extracted if e or incl_empty_values])
     return table_data
 
 
@@ -171,17 +176,6 @@ def _separate_time_from_day(table_dict):
     return table_dict
 
 
-def _rename_table_columns(table_dict):
-    try:
-        table_dict["cursus_type"] = table_dict["cursus type"]
-        del table_dict["cursus type"]
-    except KeyError:
-        print(f"Key 'cursus type' not found in {table_dict.keys()}")
-        table_dict["cursus_type"] = ""
-
-    return table_dict
-
-
 def _extract_data_from_table(table_html):
     table_data = _get_raw_table_data(table_html)
     table_dict = _get_dict_from_raw_table_data(table_data)
@@ -190,11 +184,7 @@ def _extract_data_from_table(table_html):
 
 
 def _get_course_status(register_link_html):
-    if "vol" in register_link_html.text:
-        return "vol"
-    if "gestart" in register_link_html.text:
-        return "gestart"
-    return "open"
+    return register_link_html.text
 
 
 def _get_course_table_data(course_html) -> List[Dict]:
@@ -206,7 +196,6 @@ def _get_course_table_data(course_html) -> List[Dict]:
     table_data = []
     for table_html, register_link_html in zip(tables_html, register_links_html):
         table_dict = _extract_data_from_table(table_html)
-        table_dict = _rename_table_columns(table_dict)
         course_status = _get_course_status(register_link_html)
         table_dict["status"] = course_status
         table_data.append(table_dict)