In [None]:
import os
import json
import hashlib
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import random
import re
import chardet
import requests

In [None]:
# Listing pages to crawl: for each category path, the category root followed
# by its "list_2.html" page.
meta_urls = [
    f"https://www.trjlseng.com/{section}/{page}"
    for section in ("cyst", "cest", "csst")
    for page in ("", "list_2.html")
]
# HTTP headers passed to `requests.get` by the page-scraping helpers.
HEADERS = {"User-Agent": "Mozilla/5.0"}

def get_page_links(base_url):
    """Collect the article URLs linked from one listing page.

    Fetches ``base_url`` and keeps every ``<a href>`` whose value starts with
    ``/cyst/``, ``/cest/`` or ``/csst/`` followed by ``<digits>.html``.

    Args:
        base_url: URL of the listing page to scan.

    Returns:
        list[str]: Absolute article URLs, de-duplicated, in the order in which
        they first appear on the page, so repeated runs yield the same list.

    Raises:
        Exception: If the page does not answer with HTTP 200.
    """
    response = requests.get(base_url, headers=HEADERS, timeout=20)
    if response.status_code != 200:
        raise Exception(f"Failed to access {base_url}")

    soup = BeautifulSoup(response.content, 'html.parser')
    article_pattern = re.compile(r"/(cyst|cest|csst)/\d+\.html")

    article_links = []
    for link in soup.select("a"):
        href = link.get("href")
        # `match` anchors at the start of the string, so only root-relative
        # hrefs such as "/cyst/123.html" qualify.
        if href and article_pattern.match(href):
            article_links.append("https://www.trjlseng.com" + href)

    # dict.fromkeys drops duplicates while keeping first-seen order; going
    # through set() would make the order differ from one kernel run to the next.
    return list(dict.fromkeys(article_links))

In [None]:
# Scrape each listing page: keys are the listing URLs, values are the article
# URLs found on that page.
page_links = dict()
for url in meta_urls:
    page_links.update({url: get_page_links(url)})

In [None]:
import pickle

# Snapshot the listing -> article-URL mapping to disk as a pickle.
with open("trjlseng_links.pkl", mode="wb") as pkl_file:
    pickle.dump(page_links, pkl_file)

In [None]:
# Root directory for the per-category link lists and downloads written by
# later cells; exist_ok=True keeps this cell safe to re-run.
os.makedirs("trjlseng", exist_ok=True)

In [None]:
def get_resource_link(base_url):
    """Find the download link on an article page.

    Fetches ``base_url`` and returns the first ``<a href>`` whose value
    contains ``/file/``, resolved to an absolute URL.

    Args:
        base_url: URL of the article page to scan.

    Returns:
        str | None: Absolute URL of the first matching link, or ``None`` when
        the page contains no such link.

    Raises:
        Exception: If the page does not answer with HTTP 200.
    """
    # Function-scope import: urljoin is not among the notebook's top imports.
    from urllib.parse import urljoin

    response = requests.get(base_url, headers=HEADERS, timeout=20)
    if response.status_code != 200:
        raise Exception(f"Failed to access {base_url}")

    soup = BeautifulSoup(response.content, 'html.parser')
    for link in soup.select("a"):
        href = link.get("href")
        if href and "/file/" in href:
            # The substring test also lets absolute hrefs through; blindly
            # prefixing the site root would turn those into
            # "https://www.trjlseng.comhttps://...". urljoin leaves absolute
            # URLs untouched and resolves root-relative ones against the page,
            # which for pages on www.trjlseng.com gives the same result as the
            # old string concatenation.
            return urljoin(base_url, href)

    return None

In [None]:
# Resolve every article page to its download link, grouped by category, then
# write one "trjlseng/<category>.txt" file per category (one link per line).
meta_category = ["cyst", "cest", "csst"]
file_links = {category: [] for category in meta_category}

for raw_category, links in page_links.items():
    # The listing URL carries the category as a path segment. Pick the first
    # category it contains; skip the listing if none matches rather than
    # silently filing its links under the last category.
    cat = next((c for c in meta_category if c in raw_category), None)
    if cat is None:
        print(f"Skipping {raw_category}: no known category in URL")
        continue
    for link in links:
        resource = get_resource_link(link)
        # get_resource_link returns None when a page has no "/file/" anchor;
        # storing that would crash the `link + "\n"` write below.
        if resource is None:
            print(f"No download link found on {link}")
            continue
        file_links[cat].append(resource)

for cat, found in file_links.items():
    # De-duplicate; sorting keeps the saved files identical between runs.
    file_links[cat] = sorted(set(found))
    with open(f"trjlseng/{cat}.txt", "w") as f:
        for link in file_links[cat]:
            f.write(link + "\n")

In [None]:
!pip install rarfile

In [None]:
import rarfile

def download_file(url, save_path):
    """Download a file from a given URL and save it locally.

    Args:
        url: Address of the file to fetch.
        save_path: Local path the response body is written to.

    Raises:
        Exception: If the server does not answer with HTTP 200. Nothing is
            written to ``save_path`` in that case.
    """
    # Same headers and timeout as the page-scraping requests, so an
    # unresponsive server cannot stall the notebook indefinitely. With
    # stream=True the connection stays checked out until the response is
    # closed; the context manager guarantees that, also on errors.
    with requests.get(url, headers=HEADERS, stream=True, timeout=20) as response:
        if response.status_code != 200:
            raise Exception(f"Failed to download {url}")
        with open(save_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

def extract_rar(file_path, extract_to):
    """Unpack the RAR archive at ``file_path`` into ``extract_to``.

    Prints a confirmation line once the archive has been unpacked.

    Args:
        file_path: Path of the ``.rar`` archive to open.
        extract_to: Directory handed to ``RarFile.extractall``.

    Raises:
        Exception: If rarfile raises ``RarCannotExec`` (presumably its
            external unpacking tool is unavailable; confirm against the
            rarfile docs). Any other error propagates unchanged.
    """
    try:
        with rarfile.RarFile(file_path) as archive:
            archive.extractall(extract_to)
    except rarfile.RarCannotExec as err:
        raise Exception(f"Extraction failed: {err}.")
    else:
        print(f"Extracted: {file_path} -> {extract_to}")

In [None]:
# For every category, download each saved link into
# trjlseng/<category>/raw_zips and unpack it into trjlseng/<category>/unzip.
# Links that fail at any step are collected in `failed_links`.
meta_category = ["cyst", "cest", "csst"]

failed_links = {}
for category in meta_category:
    failed_links[category] = []
    with open(f"trjlseng/{category}.txt", "r") as f:
        # Strip newlines and drop blank lines: an empty "link" has an empty
        # basename, which would make the download path point at the directory.
        links = [line.strip() for line in f if line.strip()]

    # Both directories are the same for every link of a category, so create
    # them once instead of on every iteration.
    download_dir = os.path.join("trjlseng", category, "raw_zips")
    extract_dir = os.path.join("trjlseng", category, "unzip")
    os.makedirs(download_dir, exist_ok=True)
    os.makedirs(extract_dir, exist_ok=True)

    for link in tqdm(links):
        try:
            file_name = os.path.basename(link)
            download_path = os.path.join(download_dir, file_name)
            print(download_path)

            download_file(link, download_path)
            extract_rar(download_path, extract_dir)
        except Exception as e:
            # Deliberately best-effort: one bad link must not stop the batch.
            print(f"Failed to process {link}: {e}")
            failed_links[category].append(link)