In [1]:
!pip install requests beautifulsoup4 lxml spacy
!python -m spacy download en_core_web_trf
!pip install transformers
!pip install torch torchvision torchaudio
!pip install tensorflow
!pip install requests beautifulsoup4 lxml spacy
!python -m spacy download en_core_web_sm
!pip install hf_xet


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-curated-transformers<1.0.0,>=0.2.2 (from en-core-web-trf==3.8.0)
  Downloading spacy_curated_transformers-0.3.0-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_tokenizers-0.0.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.12.0->spacy-curated-transformers<

In [5]:
import requests
from bs4 import BeautifulSoup
import spacy
from functools import lru_cache
import os
import re
from transformers import pipeline # For zero-shot, summarization, text2text-generation, and NER

# --- Configuration ---
# Name of the spaCy pipeline package passed to spacy.load() below; the load step
# reports an error if this package is not installed.
SPACY_MODEL_TO_LOAD = "en_core_web_trf" # Using a consistent model name
# HTTP headers sent by fetch_html_content with every page request
# (a desktop-browser User-Agent string).
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Directory that generate_safe_filename prefixes to every output path.
OUTPUT_DIR = "processed_articles" # Directory for output files

# --- 1. Load English spaCy model ---
# The dependency parser and spaCy's own NER component are disabled: sentence
# boundaries come from the rule-based "sentencizer" added right after loading,
# and named entities are produced by the Hugging Face NER pipeline instead.
nlp_en = None
try:
    nlp_en = spacy.load(SPACY_MODEL_TO_LOAD, disable=["parser", "ner"]) # spaCy NER is disabled as we will use HF NER
    nlp_en.add_pipe("sentencizer")
    print(f"English spaCy model ({SPACY_MODEL_TO_LOAD}) loaded successfully.")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    print(f"ERROR: English spaCy model ({SPACY_MODEL_TO_LOAD}) not installed. Please install it with the command:")
    print(f"python -m spacy download {SPACY_MODEL_TO_LOAD}")
    print("The program will exit as the model is not installed.")
    # Raise SystemExit explicitly instead of calling exit(): exit() is the helper
    # meant for the interactive interpreter, it reports success (status 0) even
    # though loading failed, and in a notebook it asks the kernel to shut down.
    # SystemExit(1) stops execution here with a failing status.
    raise SystemExit(1)

# --- 2. Load Hugging Face Pipelines ---
def _load_hf_pipeline(loading_msg, loaded_msg, failure_prefix, unavailable_msg, task, **pipeline_kwargs):
    """Build one Hugging Face pipeline and report progress on stdout.

    Returns the pipeline object, or None when anything in the attempt raised.
    On failure the error and the feature that becomes unavailable are printed
    and the exception is not propagated, so the remaining pipelines still load.
    """
    try:
        print(loading_msg)
        loaded = pipeline(task, **pipeline_kwargs)
        print(loaded_msg)
        return loaded
    except Exception as e:
        print(f"{failure_prefix}: {e}")
        print(unavailable_msg)
        return None


# Every handle starts as None so later code can test "is this feature available?".
zero_shot_classifier = None
summarizer_pipeline = None
title_generator_pipeline = None
ner_pipeline = None

zero_shot_classifier = _load_hf_pipeline(
    "Loading Zero-shot classification pipeline...",
    "Zero-shot classification pipeline loaded successfully.",
    "ERROR: Failed to load the zero-shot classification pipeline",
    "Categorization will not be available.",
    "zero-shot-classification",
    model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli",
)

summarizer_pipeline = _load_hf_pipeline(
    "Loading Summarization pipeline...",
    "Summarization pipeline loaded successfully.",
    "ERROR: Failed to load the summarization pipeline",
    "Summarization will not be available.",
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
)

title_generator_pipeline = _load_hf_pipeline(
    "Loading Text-to-Text Generation pipeline for titles...",
    "Text-to-Text Generation pipeline (for titles using t5-large) loaded successfully.",
    "ERROR: Failed to load the Text-to-Text Generation pipeline (t5-large)",
    "Title generation will not be available.",
    "text2text-generation",
    model="t5-large",
)

# aggregation_strategy="simple" makes the pipeline return grouped entities
# (dicts carrying 'entity_group' and 'word') rather than one item per token.
ner_pipeline = _load_hf_pipeline(
    "Loading NER pipeline (dslim/bert-base-NER)...",
    "NER pipeline (dslim/bert-base-NER) loaded successfully.",
    "ERROR: Failed to load the NER pipeline (dslim/bert-base-NER)",
    "Named Entity Recognition will not be available.",
    "ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
    aggregation_strategy="simple",
)


# --- Helper Functions ---
def fetch_html_content(url):
    """Download a page and return its body as text.

    The request uses REQUEST_HEADERS, a 20 second timeout and follows redirects.
    The response is decoded with the encoding detected from its content, or
    UTF-8 when detection yields nothing.

    Returns None (after printing the error) when requests raises a
    RequestException, which includes the HTTP error statuses turned into
    exceptions by raise_for_status().
    """
    try:
        looks_like_google_redirect = "news.google.com/read" in url
        if looks_like_google_redirect:
            # Only a warning: the fetch is still attempted.
            for notice in (
                f"Warning: The URL '{url}' appears to be a Google News redirector. ",
                "Attempting to fetch, but the content might be from the redirect page, not the final article.",
                "For best results, try to use the direct article URL.",
            ):
                print(notice)

        page = requests.get(url, headers=REQUEST_HEADERS, timeout=20, allow_redirects=True)
        page.raise_for_status()
        page.encoding = page.apparent_encoding or 'utf-8'
        return page.text
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching URL '{url}': {e}")
        return None

def extract_text_from_html(html_string, main_content_selectors=None, selectors_to_exclude=None):
    """
    Turns an HTML string into whitespace-normalised plain text.

    - <script> and <style> elements are always removed.
    - main_content_selectors: CSS selectors tried in order; the first one that
      matches narrows extraction to that element. Without a match the whole
      document is used.
    - selectors_to_exclude: CSS selectors whose matches are removed from the
      chosen scope.
    - When the whole document is used, common boilerplate tags (header, footer,
      nav, aside, form, figure, figcaption) are removed as well.

    Returns "" for empty or missing input.
    """
    if not html_string:
        return ""

    soup = BeautifulSoup(html_string, 'lxml')
    for node in soup(["script", "style"]):
        node.decompose()

    # Narrow the scope to the first selector that matches, if any.
    content_scope = soup
    for css in (main_content_selectors or ()):
        hit = soup.select_one(css)
        if hit:
            content_scope = hit
            break

    if selectors_to_exclude and content_scope:
        for css in selectors_to_exclude:
            for unwanted in content_scope.select(css):
                unwanted.decompose()

    # No dedicated content block was found: strip typical page chrome instead.
    if content_scope == soup:
        for tag_name in ("header", "footer", "nav", "aside", "form", "figure", "figcaption"):
            for boilerplate in content_scope.find_all(tag_name):
                boilerplate.decompose()

    extracted = content_scope.get_text(separator=' ', strip=True) if content_scope else ""
    # Collapse every run of whitespace into a single space.
    return ' '.join(extracted.split())

# --- NLP Processing Functions ---
@lru_cache(maxsize=128)
def get_tokenized_text_spacy(text_to_process):
    """Return the text's spaCy tokens, whitespace tokens excluded, joined by single spaces.

    Returns "" when the spaCy model is unavailable or the text is empty.
    Results are memoised per input string (up to 128 entries).
    """
    if not (nlp_en and text_to_process):
        return ""
    return " ".join(tok.text for tok in nlp_en(text_to_process) if not tok.is_space)

@lru_cache(maxsize=128)
def get_lemmatized_text_spacy(text_to_process):
    """Lemmatize the text with spaCy.

    Returns a pair (lemmas, joined): the list of lower-cased lemmas and the same
    lemmas joined by single spaces. Punctuation, whitespace, tokens with an
    empty lemma and tokens that are neither alphabetic nor number-like are
    left out. Returns ([], "") when the spaCy model is unavailable or the text
    is empty. Results are memoised per input string (up to 128 entries).
    """
    if not (nlp_en and text_to_process):
        return [], ""

    kept = []
    for tok in nlp_en(text_to_process):
        if tok.is_punct or tok.is_space or not tok.lemma_:
            continue
        # A pronoun whose lemma is the "-PRON-" placeholder keeps its surface form.
        uses_placeholder = tok.pos_ == "PRON" and tok.lemma_ == "-PRON-"
        candidate = (tok.text if uses_placeholder else tok.lemma_).lower()
        if candidate and (tok.is_alpha or tok.like_num):
            kept.append(candidate)
    return kept, " ".join(kept)

# --- Categorization Function ---
def categorize_text_zero_shot(text_to_categorize, categories, classifier_pipeline, max_chars=2048, multi_label=False):
    """
    Categorizes the text using a zero-shot method.

    Args:
        text_to_categorize: Text to classify.
        categories: Candidate labels handed to the classifier.
        classifier_pipeline: Callable invoked as
            classifier_pipeline(text, candidate_labels=..., multi_label=...).
        max_chars: Only the first max_chars characters of the text are sent to
            the classifier (default 2048, the limit that used to be hard-coded).
            Pass None to send the whole text.
        multi_label: Forwarded to the classifier; the default False keeps the
            original single-label behaviour.

    Returns:
        Whatever the classifier returns, or None when the classifier is missing,
        the text or the category list is empty, or the classifier raised.
    """
    if not classifier_pipeline:
        print("Warning: Zero-shot classifier not loaded. Skipping categorization.")
        return None
    if not text_to_categorize or not categories:
        print("Warning: No text or categories provided for categorization.")
        return None

    try:
        # Character-based cut to keep the input short; it is not a token-level limit.
        truncated_text = text_to_categorize[:max_chars]
        return classifier_pipeline(truncated_text, candidate_labels=categories, multi_label=multi_label)
    except Exception as e:
        # Best effort: one failed classification must not abort the article run.
        print(f"Error occurred during text categorization: {e}")
        return None

# --- Summarization Function ---
def generate_summary_abstractive(text_to_summarize, summarizer, max_input_chars=4000, max_length=150, min_length=40):
    """
    Generates an abstractive summary of the text.

    Args:
        text_to_summarize: Source text.
        summarizer: Callable summarization pipeline; expected to return a list
            whose first item is a dict containing 'summary_text'.
        max_input_chars: Only the first max_input_chars characters are
            summarized (default 4000, the limit that used to be hard-coded).
        max_length: Forwarded to the summarizer (default 150).
        min_length: Forwarded to the summarizer (default 40).

    Returns:
        The summary string, or None when the summarizer is missing, the text is
        empty, the output has an unexpected shape, or the summarizer raised.
    """
    if not summarizer:
        print("Warning: Summarizer pipeline not loaded. Skipping summarization.")
        return None
    if not text_to_summarize:
        print("Warning: No text provided for summarization.")
        return None

    try:
        input_text = text_to_summarize[:max_input_chars]
        # The character cut above does not bound the number of tokens, so the
        # pipeline is also asked to truncate at the model's own input limit;
        # otherwise an over-long article makes the call fail and the summary is
        # lost.
        summary = summarizer(
            input_text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )

        if summary and isinstance(summary, list) and len(summary) > 0 and 'summary_text' in summary[0]:
            return summary[0]['summary_text']

        print("Warning: Summarizer did not return the expected output format.")
        return None

    except Exception as e:
        # Best effort: a failed summary must not abort the article run.
        print(f"Error occurred during text summarization: {e}")
        return None

# --- Title Generation Function ---
def generate_title_t5(text_to_get_title_from, title_generator, max_input_chars=2048):
    """
    Asks a text-to-text generator for a headline.

    The generator receives the prompt "generate headline: " followed by the first
    max_input_chars characters of the text, and is expected to return a list
    whose first item is a dict containing 'generated_text'.

    Returns the generated string, or None when the generator is missing, the
    text is empty, the output has an unexpected shape, or the generator raised.
    """
    if not title_generator:
        print("Warning: Title generator pipeline not loaded. Skipping title generation.")
        return None
    if not text_to_get_title_from:
        print("Warning: No text provided for title generation.")
        return None

    try:
        prompt = "generate headline: " + text_to_get_title_from[:max_input_chars]
        generation_options = {
            "max_length": 30,
            "min_length": 5,
            "num_beams": 4,
            "early_stopping": True,
        }
        outputs = title_generator(prompt, **generation_options)

        has_expected_shape = (
            outputs
            and isinstance(outputs, list)
            and len(outputs) > 0
            and 'generated_text' in outputs[0]
        )
        if not has_expected_shape:
            print("Warning: Title generator did not return the expected output format.")
            return None
        return outputs[0]['generated_text']

    except Exception as e:
        print(f"Error occurred during title generation: {e}")
        return None

# --- NER Post-processing Helper Function (NEW) ---
def post_process_ner_entities(raw_entities_list, score_threshold=0.70):
    """
    Post-processes a list of raw NER entities.

    Each entity is a dict with the keys 'text', 'type' and 'score'. Neither the
    input list nor its dicts are modified; the result contains new dicts.

    Steps, in order:
    1. Re-attach fragments (basic): an entity whose text starts with "##" is
       appended to the entity listed directly before it when both share the
       same type; the merged score is the average of the two scores.
    2. Drop entities whose (possibly merged) score is below score_threshold.
    3. Drop known false positives and very short generic words.
    4. Keep one entity per (stripped text, type) pair: the highest-scoring one.
    5. Sort by score, descending.

    Merging runs BEFORE the score filter on purpose. Filtering first removed a
    low-confidence head (e.g. "Liver") while keeping its confident fragment
    ("##pool"), which was then glued onto whatever unrelated entity happened to
    precede it in the filtered list (e.g. "Arsenal" -> "Arsenalpool").

    Returns:
        A list of entity dicts; [] for empty input.
    """
    if not raw_entities_list:
        return []

    # 1. Re-attach "##" fragments while list neighbours are still the original
    #    neighbours. This is a simplified merge: fragments without a same-type
    #    predecessor are kept as they are.
    merged_entities = []
    for entity in raw_entities_list:
        previous = merged_entities[-1] if merged_entities else None
        if (previous is not None
                and entity['text'].startswith("##")
                and previous['type'] == entity['type']
                and not previous['text'].endswith(" ")):
            previous['text'] += entity['text'].replace("##", "")
            previous['score'] = (previous['score'] + entity['score']) / 2  # Average score
            continue
        merged_entities.append(dict(entity))  # Copy so the caller's dicts stay untouched

    # 2. Filter by score
    confident_entities = [ent for ent in merged_entities if ent['score'] >= score_threshold]

    # 3. Filter known false positives or very short/generic entities
    #    (Customize these collections based on observed errors)
    known_false_positives_text_type = {
        ("Piano", "PER"),
        ("Man", "PER"),  # "Man" alone as PER is usually wrong
        ("In", "ORG"),
        ("Pop", "PER"),  # "Pop" alone as PER is usually wrong
    }
    short_stopwords = {"he", "she", "it", "we", "us", "an", "in", "on", "at", "of", "to", "is", "a"}

    plausible_entities = []
    for ent in confident_entities:
        if (ent['text'], ent['type']) in known_false_positives_text_type:
            continue
        text_lower = ent['text'].lower()
        # Empty text or a single non-alphanumeric character (e.g. lone punctuation).
        if len(text_lower) < 2 and not text_lower.isalnum():
            continue
        # All-lowercase one/two-letter stopwords tagged as ORG or PER
        # (e.g. "an" as ORG); anything containing a capital letter is kept
        # because it may be a real name or abbreviation.
        if (len(text_lower) < 3
                and ent['type'] in ('ORG', 'PER')
                and not any(char.isupper() for char in ent['text'])
                and text_lower in short_stopwords):
            continue
        plausible_entities.append(ent)

    # 4. Collect unique entities (text, type), keeping the one with the highest score
    best_by_key = {}
    for ent in plausible_entities:
        stripped_text = ent['text'].strip()
        key = (stripped_text, ent['type'])
        if key not in best_by_key or ent['score'] > best_by_key[key]['score']:
            ent_copy = dict(ent)
            ent_copy['text'] = stripped_text
            best_by_key[key] = ent_copy

    # 5. Sort entities by score (descending) for consistent output
    return sorted(best_by_key.values(), key=lambda ent: ent['score'], reverse=True)

# --- NER Function (MODIFIED to include post-processing) ---
def extract_named_entities_bert(text_to_process, ner_model_pipeline, max_chars=4000):
    """
    Runs a Hugging Face NER pipeline on the text and post-processes the result.

    Only the first max_chars characters are analysed. Each pipeline item is
    expected to provide 'word', 'entity_group' and 'score'; items are converted
    to {'text', 'type', 'score'} dicts and cleaned by post_process_ner_entities
    with a score threshold of 0.65.

    Returns a pair (entities, report):
    - (list of entity dicts, one "Entity: ..." line per entity) on success,
    - ([], "") when there is no text or nothing survives post-processing,
    - (None, "") when the pipeline is missing or an error occurred.
    """
    if not ner_model_pipeline:
        print("Warning: NER pipeline not loaded. Skipping entity extraction.")
        return None, ""
    if not text_to_process:
        print("Warning: No text provided for entity extraction.")
        return [], ""

    try:
        pipeline_items = ner_model_pipeline(text_to_process[:max_chars])

        candidates = [
            {
                "text": item['word'],
                "type": item['entity_group'],
                "score": float(item['score']),
            }
            for item in (pipeline_items or [])
        ]

        # The threshold can be adjusted here if needed for different contexts.
        cleaned = post_process_ner_entities(candidates, score_threshold=0.65) if candidates else []
        if not cleaned:
            return [], ""

        report_lines = []
        for found in cleaned:
            report_lines.append(
                f"Entity: {found['text']}, Type: {found['type']} (Score: {found['score']:.4f})"
            )
        return cleaned, "\n".join(report_lines)

    except Exception as e:
        print(f"Error occurred during Named Entity Recognition: {e}")
        return None, ""  # None tells the caller that an error happened

# --- File Handling Functions ---
def save_text_to_file(text_content, filepath):
    """Saves text content to a UTF-8 file, creating missing parent directories.

    Args:
        text_content: String to write; an existing file is overwritten.
        filepath: Destination path. It may be a bare filename, in which case the
            file is written to the current working directory.

    Errors raised by the file system are printed, not propagated.
    """
    try:
        parent_dir = os.path.dirname(filepath)
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError - which previously meant the file was never written.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(text_content)
        print(f"   File saved successfully: {filepath}")
    except OSError as e:  # IOError is an alias of OSError in Python 3
        print(f"   Error writing file ({filepath}): {e}")

def generate_safe_filename(url_or_title, suffix=""):
    """Builds an output path inside OUTPUT_DIR from a URL or a title.

    The name is derived from the last non-empty path segment (for a URL) or from
    the whole string (for a title without '/'):
    - everything from the first '?' on is dropped,
    - trailing '/' characters are ignored,
    - every run of characters outside [a-zA-Z0-9_-] becomes a single '_',
    - the result is limited to 50 characters.
    When nothing usable remains, "article" is used as the name.

    Args:
        url_or_title: URL or free-text title; may be empty or None.
        suffix: Appended to the name after an underscore.

    Returns:
        os.path.join(OUTPUT_DIR, "<name>_<suffix>.txt")
    """
    base_name = f"article_{suffix}"
    if url_or_title:
        # Remove the query string first so that a '/' inside it cannot be
        # mistaken for a path separator.
        without_query = url_or_title.split('?')[0]
        # Ignore trailing slashes: otherwise every URL ending in '/' has an empty
        # last segment and all such articles share the same "article_..." file.
        name_part = without_query.rstrip('/').split('/')[-1]
        safe_name = re.sub(r'[^a-zA-Z0-9_-]+', '_', name_part).strip('_')
        safe_name = safe_name[:50]  # Limit length
        if safe_name:
            base_name = f"{safe_name}_{suffix}"
    return os.path.join(OUTPUT_DIR, f"{base_name}.txt")

# --- Main Processing Function ---
def process_article_url(article_url, main_content_selectors=None, selectors_to_exclude=None):
    """
    Executes the full processing pipeline for a given URL.

    Steps (each one prints its progress):
      1. Download the page with fetch_html_content.
      2. Extract plain text with extract_text_from_html and save it.
      3. Tokenize with spaCy and save the result (skipped without nlp_en).
      4. Lemmatize with spaCy and save the result (skipped without nlp_en).
      5. Predict a category with the zero-shot classifier (printed only, not saved).
      6. Generate a summary and save it.
      7. Generate a title, cut it after the first period, capitalize it and save it.
      8. Extract named entities, print up to ten of them and save the full list.

    Steps 5-8 are skipped individually when their pipeline is not loaded.
    Output paths come from generate_safe_filename(article_url, <step suffix>).

    Args:
        article_url: URL of the article to process.
        main_content_selectors: Passed through to extract_text_from_html.
        selectors_to_exclude: Passed through to extract_text_from_html.

    Returns:
        None. Processing stops early (also returning None) when the output
        directory cannot be created, the download fails, or no text is extracted.
    """
    # Only a notice: the run continues and steps 3-4 are skipped further below.
    if not nlp_en: # spaCy for tokenization/lemmatization
        print("spaCy model not available. Some NLP processing (tokenization, lemmatization) cannot be started.")

    print(f"Processing started: {article_url}")

    # Make sure the output directory exists before any step tries to write to it.
    if not os.path.exists(OUTPUT_DIR):
        try:
            os.makedirs(OUTPUT_DIR)
            print(f"Output directory created: {OUTPUT_DIR}")
        except OSError as e:
            print(f"Error creating output directory ({OUTPUT_DIR}): {e}")
            return

    print(f"\n1. Fetching HTML content...")
    html_code = fetch_html_content(article_url)
    if not html_code:
        print("   Failed to download HTML content. Processing stopped.")
        return

    print("\n2. Extracting text from HTML...")
    raw_cleaned_text = extract_text_from_html(html_code,
                                               main_content_selectors=main_content_selectors,
                                               selectors_to_exclude=selectors_to_exclude)
    if not raw_cleaned_text:
        print("   Failed to extract text from HTML. Processing stopped.")
        return
    print(f"   Raw text extracted (first 200 characters): '{raw_cleaned_text[:200]}...'")
    raw_text_filepath = generate_safe_filename(article_url, "raw_specific")
    save_text_to_file(raw_cleaned_text, raw_text_filepath)

    # Steps 3 and 4 both need the spaCy model.
    if nlp_en:
        print("\n3. Tokenizing text with spaCy...")
        tokenized_text = get_tokenized_text_spacy(raw_cleaned_text)
        if tokenized_text:
            tokenized_text_filepath = generate_safe_filename(article_url, "tokenized_specific")
            save_text_to_file(tokenized_text, tokenized_text_filepath)
        else:
            print("   Failed to tokenize text.")

        print("\n4. Lemmatizing text with spaCy...")
        # The lemma list (first element) is not needed here, only the joined string.
        _, lemmatized_text_string = get_lemmatized_text_spacy(raw_cleaned_text) # We need the string output here
        if lemmatized_text_string:
            lemmatized_text_filepath = generate_safe_filename(article_url, "lemmatized_specific")
            save_text_to_file(lemmatized_text_string, lemmatized_text_filepath)
        else:
            print("   Failed to lemmatize text.")
    else:
        print("\nWarning: spaCy model not loaded, skipping tokenization and lemmatization.")

    print("\n5. Categorizing text (zero-shot)...")
    if raw_cleaned_text and zero_shot_classifier:
        # Candidate labels offered to the zero-shot classifier.
        candidate_article_categories = [
            "sport", "football", "politics", "business", "finance", "technology", "science",
            "health", "medicine", "education",  "music", "movie","world news", "culture", "art", "travel", "food", "lifestyle", "environment",
            "social issues", "mental health", "entertainment"
        ]
        category_results = categorize_text_zero_shot(raw_cleaned_text, candidate_article_categories, zero_shot_classifier)

        # The first label/score pair is reported as the prediction; nothing is written to disk.
        if category_results and category_results['labels'] and category_results['scores']:
            predicted_label = category_results['labels'][0]
            predicted_score = category_results['scores'][0]
            print(f"   Predicted category: {predicted_label} (Score: {predicted_score:.4f})")
        else:
            print("   Could not determine a clear category or an error occurred.")
    elif not zero_shot_classifier:
        print("   Skipping categorization because the zero-shot classifier is not loaded.")
    else:
        print("   No text available for categorization.")

    print("\n6. Generating text summary...")
    if raw_cleaned_text and summarizer_pipeline:
        article_summary = generate_summary_abstractive(raw_cleaned_text, summarizer_pipeline)
        if article_summary:
            print(f"   Generated summary: {article_summary}")
            summary_filepath = generate_safe_filename(article_url, "summary_specific")
            save_text_to_file(article_summary, summary_filepath)
        else:
            print("   Failed to generate summary.")
    elif not summarizer_pipeline:
        print("   Skipping summarization because the summarizer pipeline is not loaded.")
    else:
        print("   No text available for summarization.")

    print("\n7. Generating title...")
    if raw_cleaned_text and title_generator_pipeline:
        generated_raw_title = generate_title_t5(raw_cleaned_text, title_generator_pipeline)
        processed_title = ""
        if generated_raw_title:
            generated_raw_title = generated_raw_title.strip()
            # Keep only the text up to and including the first period, if there is one.
            first_period_index = generated_raw_title.find('.')
            if first_period_index != -1:
                processed_title = generated_raw_title[:first_period_index + 1]
            else:
                processed_title = generated_raw_title
            # Upper-case the first character and leave the rest unchanged.
            if processed_title:
                processed_title = processed_title[0].upper() + processed_title[1:]
            print(f"   Generated title: {processed_title}")
            title_filepath = generate_safe_filename(article_url, "title_specific")
            save_text_to_file(processed_title, title_filepath)
        else:
            print("   Failed to generate title (empty result from generator).")
    elif not title_generator_pipeline:
        print("   Skipping title generation because the title generator pipeline is not loaded.")
    else:
        print("   No text available for title generation.")

    # --- 8. Extracting Named Entities (BERT with Post-processing) ---
    print("\n8. Extracting Named Entities (BERT with Post-processing)...")
    if raw_cleaned_text and ner_pipeline:
        # extract_named_entities_bert returns post-processed entities plus a printable report
        processed_entities_list, processed_entities_string = extract_named_entities_bert(raw_cleaned_text, ner_pipeline)

        if processed_entities_list is not None: # None signals that NER processing encountered an error
            if processed_entities_list: # Check if the list is not empty after post-processing
                print(f"   Extracted {len(processed_entities_list)} entities after post-processing.")
                # Print only the first 10 post-processed entities (the index i is not used)
                for i, entity in enumerate(processed_entities_list[:10]):
                    print(f"     - {entity['text']} ({entity['type']}, Score: {entity['score']:.3f})")
                if len(processed_entities_list) > 10:
                    print(f"     ... and {len(processed_entities_list) - 10} more.")

                # Save the full post-processed entity report (not just the first 10) to a file
                entities_filepath = generate_safe_filename(article_url, "entities_processed_specific")
                save_text_to_file(processed_entities_string, entities_filepath)
            else:
                print("   No entities found in the text after post-processing.")
        else:
            # This case handles if extract_named_entities_bert returned None (due to an internal error)
            print("   Failed to extract entities or an error occurred during NER processing.")
    elif not ner_pipeline:
        print("   Skipping Named Entity Recognition because the NER pipeline is not loaded.")
    else:
        print("   No text available for Named Entity Recognition.")

    print(f"\nProcessing finished for: {article_url}")


# --- Main execution block ---
if __name__ == "__main__":
    # Report every component that is missing. None of them is mandatory here:
    # process_article_url skips the steps whose model is unavailable.
    for loaded_component, missing_warning in (
        (nlp_en, "Warning: The spaCy NLP model (for tokenization/lemmatization) did not load. Some functionalities will be skipped."),
        (zero_shot_classifier, "Warning: Zero-shot classification pipeline did not load. Categorization will be skipped."),
        (summarizer_pipeline, "Warning: Summarization pipeline did not load. Summarization will be skipped."),
        (title_generator_pipeline, "Warning: Title generation pipeline did not load. Title generation will be skipped."),
        (ner_pipeline, "Warning: NER pipeline did not load. Named Entity Recognition will be skipped."),
    ):
        if not loaded_component:
            print(missing_warning)

    url_to_process = "https://www.goal.com/en-sa/lists/winners-losers-2024-25-premier-league-season/blt2a1709cc33840e04"
    # Example with more prominent entities:
    # url_to_process = "https://www.reuters.com/world/europe/putin-xi-map-out-new-era-press-conference-after-kremlin-talks-2023-03-21/"

    # No site-specific selectors: the whole page is used and common boilerplate
    # tags are stripped by extract_text_from_html.
    main_selectors_for_billboard = None
    exclude_selectors_for_billboard = None

    process_article_url(
        url_to_process,
        main_content_selectors=main_selectors_for_billboard,
        selectors_to_exclude=exclude_selectors_for_billboard,
    )

    # Example of processing another URL with potentially different selectors
    # url_reuters = "https://www.reuters.com/technology/musk-says-xaitests-grok-chatbot-with-more-users-after-political-bias-concerns-2024-03-15/"
    # main_selectors_reuters = ['article[data-testid="ArticlePage-article-body"]']
    # exclude_selectors_reuters = ['div[data-testid="Paywall-Container"]', 'div[data-testid="AdditionalCoverage-Container"]']
    # process_article_url(url_reuters, main_selectors_reuters, exclude_selectors_reuters)

English spaCy model (en_core_web_trf) loaded successfully.
Loading Zero-shot classification pipeline...


Device set to use cuda:0


Zero-shot classification pipeline loaded successfully.
Loading Summarization pipeline...


Device set to use cuda:0


Summarization pipeline loaded successfully.
Loading Text-to-Text Generation pipeline for titles...


Device set to use cuda:0


Text-to-Text Generation pipeline (for titles using t5-large) loaded successfully.
Loading NER pipeline (dslim/bert-base-NER)...


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


NER pipeline (dslim/bert-base-NER) loaded successfully.
Processing started: https://www.goal.com/en-sa/lists/winners-losers-2024-25-premier-league-season/blt2a1709cc33840e04

1. Fetching HTML content...

2. Extracting text from HTML...
   Raw text extracted (first 200 characters): 'Arne Slot, Sir Jim Ratcliffe and the biggest winners and losers of the 2024-25 Premier League season | Goal.com English Saudi Arabia Getty Images Mark Doyle Winners & Losers Premier League Liverpool A...'
   File saved successfully: processed_articles/blt2a1709cc33840e04_raw_specific.txt

3. Tokenizing text with spaCy...
   File saved successfully: processed_articles/blt2a1709cc33840e04_tokenized_specific.txt

4. Lemmatizing text with spaCy...
   File saved successfully: processed_articles/blt2a1709cc33840e04_lemmatized_specific.txt

5. Categorizing text (zero-shot)...
   Predicted category: football (Score: 0.4574)

6. Generating text summary...
   Generated summary:  Liverpool won the Premier League title 