In [15]:
import re
import requests
from bs4 import BeautifulSoup

In [16]:
import spacy
# Tải mô hình ngôn ngữ tiếng Anh từ spaCy
nlp = spacy.load("en_core_web_lg")

In [13]:
def extract_text(url, news_name):
    result = ""
    try:
        response =requests.get(url)
        response.raise_for_status()  # Trả ra ngoại lệ nếu gặp lỗi.
        html_content = response.text # Lấy nội dung HTML phản hồi.
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None

    soup = BeautifulSoup(html_content, 'html.parser')
    if news_name == "CNN": # Nội dung bài báo của CNN được lưu trong các thẻ <p> trong thẻ <main>
        main_content = soup.find('main')
        if main_content:
            paragraphs = main_content.find_all('p')
            for p in paragraphs:
                result += p.get_text()
            return result
        else:
            print("Can't extract text from this news")
            return None
        
    elif news_name == "DailMail": # Nội dung bài báo của Daily Mail được lưu trong thẻ <div itemprop='articleBody'>
        article_body = soup.find('div', {'itemprop': 'articleBody'})
        if article_body:
            paragraphs = article_body.find_all('p')
            for p in paragraphs:
                result += p.get_text()
            return result
        else:
            print("Can't extract text from this news")
            return None

In [17]:
# Biểu thức chính quy để xác định các ký tự đặc biệt nằm giữa hai số
pattern = r'(?<!\d)[^\w\s%](?!\d)'

def preprocessing(sentence):
    doc = nlp(sentence)
    lemmatized_text = " ".join([token.lemma_ for token in doc]).lower().strip()

    # Xóa các ký tự đặc biệt không thuộc trường hợp đã nêu
    lemmatized_text = re.sub(fr'(?<!\d)[^a-zA-Z0-9\s]|[^a-zA-Z0-9\s%](?!\d)|{pattern}', '', lemmatized_text)
    lemmatized_text = re.sub(r'\s+', ' ', lemmatized_text)

    return lemmatized_text

In [18]:
# Ví dụ sử dụng hàm
url_example = "https://edition.cnn.com/2023/12/14/world/ukraines-european-allies-are-letting-it-down/index.html"
html_content = extract_text(url_example, "CNN")

if html_content:
    print("Nội dung HTML:")
    print(html_content)  # In ra 500 ký tự đầu tiên cho mục đích kiểm tra
else:
    print("Không thể lấy nội dung từ URL.")

Nội dung HTML:

      Before the war between Israel and Hamas took over virtually all of the international community’s attention, the Russian invasion of Ukraine was the most pressing crisis diplomats were seeking to solve.
  
      In Brussels, this week had been slated as a big moment for the Europification of Ukraine, as EU member states meet for their final council summit of the year to green light both more funding for Kyiv and, finally, the opening of negotiations for Ukrainian membership of the bloc.
  
      All of that has been thrown into question by one member state: Hungary.
  
      Hungary’s populist PM Viktor Orban has been Western Europe’s number one problem child for some time. Even before the war in Ukraine, Hungary had form for holding the rest of the EU to ransom. Most of the big decisions made by Brussels require unanimous approval from all 27 member states. This means that every member state effectively has a veto it can use to block core EU policy, like sending b

In [19]:
preprocessing(html_content)

'before the war between israel and hamas take over virtually all of the international community s attention the russian invasion of ukraine be the most pressing crisis diplomat be seek to solve in brussels this week have be slate as a big moment for the europification of ukraine as eu member state meet for their final council summit of the year to green light both more funding for kyiv and finally the opening of negotiation for ukrainian membership of the bloc all of that have be throw into question by one member state hungary hungary s populist pm viktor orban have be western europe s number one problem child for some time even before the war in ukraine hungary have form for hold the rest of the eu to ransom most of the big decision make by brussels require unanimous approval from all 27 member state this mean that every member state effectively have a veto it can use to block core eu policy like send billion of euro to a war tear country or let a country into the club in theory this 

In [14]:
# Ví dụ sử dụng hàm
url_example = "https://www.dailymail.co.uk/debate/article-12865975/JAN-MOIR-no-wonder-Rebekah-Vardy-wants-sweep-Wagatha-trial-tatty-carpet-shame.html"
html_content = extract_text(url_example, "DailMail")

if html_content:
    print("Nội dung HTML:")
    print(html_content)  # In ra 500 ký tự đầu tiên cho mục đích kiểm tra
else:
    print("Không thể lấy nội dung từ URL.")

Nội dung HTML:
When Rebekah Vardy left court after losing the libel case she brought against Coleen Rooney last year, she said that the judge 'got it wrong'. And that the English legal system let her down.Hmm. An interesting position for the losing party, I thought. A bracing take on a thumping defeat; like a boxer flat out on the canvas after a knockout punch, counted out, covered in blood but still claiming: 'Look, mate, the ref got it wrong.'See also: the umpire is a liar, the magistrate is out of date, the arbitrator is too arbitrary, the dog ate my homework and the moon is a balloon.You could say that on the pitch, football's VAR system is unpopular and much mocked, while off the pitch the chaotic VARDY system suffers much the same fate.For in bringing the infamous Wagatha Christie case and then losing it, in suing over accusations she had leaked details of Mrs Rooney's private life to newspapers and then failing miserably to show it was not true, we must all agree that Rebekah sc