In [1]:
import csv
import xml.etree.ElementTree as ET
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def fetch_sitemap(url, timeout=30):
    """Download a sitemap and return its body as text.

    Args:
        url: Address of the sitemap to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole notebook.

    Returns:
        The response body as a string, or ``None`` when the request fails
        (any ``requests.RequestException``, including the HTTP error raised
        by ``raise_for_status``). The failure is reported on stdout.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP error statuses into exceptions so they take the same
        # failure path as connection problems.
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Không thể tải sitemap: {e}")
        return None

def save_to_csv(filename, data, fieldnames):
    """Write an iterable of dict rows to a UTF-8 CSV file with a header row.

    Args:
        filename: Path of the file to create; an existing file is overwritten.
        data: Iterable of dicts, one per output row.
        fieldnames: Column names, in the order they should appear.
    """
    with open(filename, mode="w", encoding="utf-8", newline="") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        for record in data:
            csv_writer.writerow(record)

def read_from_csv(filename):
    """Load a UTF-8 CSV file with a header row into a list of dicts.

    The file is opened with ``newline=""`` so the ``csv`` module sees the raw
    line endings. Without it, universal-newline translation rewrites ``\\r\\n``
    or ``\\r`` inside quoted fields to ``\\n``, silently altering
    multi-line values.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        One dict per data row, keyed by the column names in the header row.
    """
    with open(filename, "r", newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))


In [2]:
# 1. Crawl the sitemap to obtain the list of journal URLs.
sitemap_url = "https://link.springer.com/sitemap-springer-journals.xml"
sitemap_content = fetch_sitemap(url=sitemap_url)

def parse_sitemap(xml_content):
    """Extract every ``<loc>`` URL from a sitemap XML document.

    Args:
        xml_content: The sitemap XML as a string.

    Returns:
        List of URLs in document order, with surrounding whitespace removed.
        Empty ``<loc>`` elements are skipped. If the document is not
        well-formed XML, the error is reported on stdout and an empty list
        is returned.
    """
    loc_tag = "{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError as e:
        print(f"Lỗi phân tích sitemap: {e}")
        return []

    urls = []
    for loc in root.iter(loc_tag):
        # ``loc.text`` is None for an empty element such as <loc/>; calling
        # .strip() on it directly would raise AttributeError.
        text = (loc.text or "").strip()
        if text:
            urls.append(text)
    return urls

# Only parse and persist the journal list when the sitemap download succeeded.
if sitemap_content:
    journal_urls = parse_sitemap(sitemap_content)
    journal_csv = "journals.csv"
    save_to_csv(
        journal_csv,
        data=[{"journal_url": url} for url in journal_urls],
        fieldnames=["journal_url"],
    )
    print(f"Đã lưu {len(journal_urls)} journal URLs vào {journal_csv}.")


Đã lưu 2960 journal URLs vào journals.csv.


In [None]:
# 2. Read the journal list back from the CSV file.
journal_csv = "journals.csv"
journals = read_from_csv(filename=journal_csv)

def fetch_article_links(journal_url, timeout=30):
    """Collect article links from every listing page of a journal.

    Pages are requested as
    ``<journal_url>/articles?filterOpenAccess=false&page=N`` starting at
    N=1. Crawling stops at the first page that contains no
    ``<h3 class="app-card-open__heading">`` element, or at the first
    request error.

    Args:
        journal_url: Base URL of the journal. A trailing slash is stripped
            so the listing URL never contains ``//articles``.
        timeout: Seconds to wait for each page before giving up, so a
            stalled connection cannot hang the crawl.

    Returns:
        List of article URLs in page order, each resolved against the
        listing page URL so relative hrefs become absolute. If a request
        fails, the links gathered so far are returned.
    """
    all_links = []
    base_url = journal_url.rstrip("/")
    page = 1
    while True:
        url = f"{base_url}/articles?filterOpenAccess=false&page={page}"
        print(f"Đang crawl trang: {url}")
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Lỗi khi tải trang {url}: {e}")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.find_all("h3", {"class": "app-card-open__heading"})
        if not articles:  # Stop once a page has no article headings.
            break

        for article in articles:
            anchor = article.find("a")
            # .get() returns None for an anchor without href, where
            # anchor["href"] would raise KeyError and abort the crawl.
            href = anchor.get("href") if anchor is not None else None
            if href:
                all_links.append(urljoin(url, href))
        page += 1
    return all_links

# Crawl the article links of every journal, remembering which journal each
# link came from.
all_articles = []
journals_to_crawl = list(journals)
for journal in journals_to_crawl:
    articles = fetch_article_links(journal["journal_url"])
    all_articles.extend(
        {"article_url": article, "journal_url": journal["journal_url"]}
        for article in articles
    )

# Save the collected article links to a CSV file.
articles_csv = "article_links.csv"
save_to_csv(
    articles_csv,
    data=all_articles,
    fieldnames=["article_url", "journal_url"],
)
print(f"Đã lưu {len(all_articles)} bài viết từ journal vào {articles_csv}.")


In [None]:
# 3. Read the article link list back from the CSV file.
articles_csv = "article_links.csv"
articles = read_from_csv(filename=articles_csv)

def fetch_article_data(article_url, timeout=30):
    """Download one article page and extract its title and abstract.

    The title is the text of ``<h1 class="c-article-title">`` and the
    abstract is the text of the first ``<p>`` inside
    ``<div id="Abs1-section">``. Either field falls back to ``"N/A"`` when
    its element is missing.

    Args:
        article_url: URL of the article page.
        timeout: Seconds to wait for the page before giving up, so a
            stalled connection cannot hang the crawl.

    Returns:
        Dict with keys ``"url"``, ``"title"`` and ``"abstract"``, or
        ``None`` when the request fails (the failure is reported on stdout).
    """
    try:
        response = requests.get(article_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Lỗi khi tải bài viết {article_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Look the title up once instead of searching the document twice.
    title_tag = soup.find("h1", {"class": "c-article-title"})
    title = title_tag.get_text(strip=True) if title_tag is not None else "N/A"

    # The abstract section may exist without a <p>; guard both lookups so a
    # single odd page cannot raise AttributeError and abort the whole crawl.
    abstract_section = soup.find("div", {"id": "Abs1-section"})
    abstract_paragraph = (
        abstract_section.find("p") if abstract_section is not None else None
    )
    abstract = (
        abstract_paragraph.get_text(strip=True)
        if abstract_paragraph is not None
        else "N/A"
    )

    return {
        "url": article_url,
        "title": title,
        "abstract": abstract,
    }

# Crawl the detailed data of each article, skipping pages that failed to load.
articles_data = []
for article in articles:
    article_data = fetch_article_data(article["article_url"])
    if not article_data:
        continue
    articles_data.append(article_data)

# Save the detailed article data to a CSV file.
articles_data_csv = "articles_data.csv"
save_to_csv(
    articles_data_csv,
    data=articles_data,
    fieldnames=["url", "title", "abstract"],
)
print(f"Đã lưu dữ liệu chi tiết của bài viết vào {articles_data_csv}.")
