IMPORTS IMPLEMENTATION

In [51]:
from bertopic import BERTopic
from bs4 import BeautifulSoup
from collections import Counter, deque
import csv
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import os
import pandas as pd
import random
import re 
import requests
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import time
import token 
from urllib.parse import urljoin, urlparse, unquote

In [52]:
# Download the NLTK resources used by this notebook (packages already present are reported as up-to-date)
nltk.download('punkt')
nltk.download('punkt_tab') # Required for new versions of NLTK
nltk.download('wordnet')
nltk.download('omw-1.4')   # Required for the lemmatiser
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

IMPLEMENTATION OF CONSTANTS

In [53]:
# --- CONFIGURATION OF COLORS ---
# ANSI escape sequences wrapped around console messages; RESET ends the coloured span.

GREEN = "\033[92m"
RED = "\033[91m"
YELLOW = "\033[93m"
BLUE = "\033[94m"
RESET = "\033[0m"

In [54]:
# --- CONFIGURATION OF HEADER ---
# Browser-like HTTP headers installed on `session` below, so they accompany every request made through it.

HEADER_BROWSER = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/"}

# One shared Session object: fetch_url() sends all its requests through it
session = requests.Session()
session.headers.update(HEADER_BROWSER)

In [55]:
# --- CONFIGURATION OF FILTERS ---
# URL fragments that mark a link as noise. The crawlers discard any link whose URL CONTAINS one of these
# strings (plain substring test), so an entry also rejects longer matches:
# "/tag" rejects "/tags/...", and "x.com" rejects every URL containing "x.com".

filters_noise = [
    # E-commerce
    "/product", "/products", "/shop", "/store", "/cart", "/checkout",
    # User
    "/account", "/login", "/register", "/profile", 
    # Legal and Administrative pages
    "/about", "/contact", "/privacy", "/terms", "/policy", "/legal", "/accessibility", 
    "/editorial-process", "/data-collection", "/disclaimer", "/cookies", "/sponsor",
    "/advertise", "/jobs", "/faq", "/help", "/wc", 
    # Site Structure
    "/tag", "/search", "/author",
    # Non-HTTP schemes and image files
    "mailto:", "tel:", "javascript:", ".jpg", ".png",
    # Social networks and marketplaces
    "youtube.com", "instagram.com", "facebook.com", "twitter.com", "x.com",
    "pinterest.com", "linkedin.com", "tiktok.com", "amazon.com"
]

In [56]:
# --- CONFIGURATION OF SEED LINKS FOR WIKIPEDIA ---
# Hand-picked Wikipedia articles used as the starting list of the Wikipedia crawl
# (passed to crawl_wiki_seed_links through the "wiki" entry of `sources`).

wiki_seed_pages = [ # Our selection of a few pages = starting list
    "https://en.wikipedia.org/wiki/Lifestyle",
    "https://en.wikipedia.org/wiki/Lifestyle_trends_and_media",
    "https://en.wikipedia.org/wiki/Self-care",
    "https://en.wikipedia.org/wiki/Physical_fitness",
    "https://en.wikipedia.org/wiki/Healthy_diet",
    "https://en.wikipedia.org/wiki/Travel",
    "https://en.wikipedia.org/wiki/Outdoor_recreation",
    "https://en.wikipedia.org/wiki/Sustainable_living",
    "https://en.wikipedia.org/wiki/Fashion",
]

IMPLEMENTATION OF DEFINITIONS

In [57]:
def get_filenames(source_name):
    """Return the standard output paths of one source, creating the 'data' folder if needed.

    Args:
        source_name: Short name of the source (e.g. "MBG", "wiki"), used as the file-name prefix.
            It is assumed to contain no spaces; otherwise add
            source_name = source_name.replace(" ", "_") before building the paths.

    Returns:
        dict mapping "raw", "corpus", "cleaned", "norm", "tokenized", "nodes" and "edges"
        to a CSV path inside the 'data' folder.
    """
    output_dir = "data"

    # First use: the output folder is missing, so create it and report it
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"{GREEN}Created directory : {output_dir}{RESET}")

    # Dictionary key -> file-name suffix placed between "<source_name>_" and ".csv"
    suffix_by_key = {
        "raw": "raw_html",
        "corpus": "corpus",
        "cleaned": "cleaned",
        "norm": "corpus_norm",
        "tokenized": "corpus_tokenized",
        "nodes": "nodes",
        "edges": "edges",
    }

    return {key: os.path.join(output_dir, f"{source_name}_{suffix}.csv")
            for key, suffix in suffix_by_key.items()}

In [58]:
def fetch_url(url) :
    """Fetch a URL through the shared `session`, after a short random pause.

    Args:
        url: Address of the page to download.

    Returns:
        The response object when the server answers with status code 200,
        otherwise None (non-200 status or network error). Every failure is
        reported on the console.
    """
    time.sleep(random.uniform(1, 3)) # Random delay between 1 and 3 seconds to avoid overloading the server

    try:
        response = session.get(url, timeout=10) # Timeout set to 10s to prevent the crawler from hanging on slow pages
    except requests.RequestException as error: # Network-related errors (DNS failure, connection refused, timeout, etc.)
        # Report the failure instead of dropping the URL silently, like the non-200 case below
        print(f"Failed to fetch the url: {url} ({error})")
        return None

    if response.status_code == 200:
        return response

    # Any other status code is treated as a failure
    print(f"Failed to fetch the url: {url} with status code {response.status_code}")
    return None

In [59]:
def to_soup(url):
    """Download `url` with fetch_url and parse its HTML.

    Returns:
        A BeautifulSoup object built with the 'html.parser' backend, or None
        when fetch_url did not return a usable response.
    """
    response = fetch_url(url)

    # Nothing usable came back: there is nothing to parse
    if not response:
        return None

    return BeautifulSoup(response.text, 'html.parser')

In [60]:
def extract_links_breadth_first(url, max_levels=1):
    """Crawl a website level by level (Breadth-First Search) starting from `url`.

    Pages at levels 0 .. max_levels-1 are fetched and their links extracted.
    Pages found at level `max_levels` are only recorded, not fetched.

    Args:
        url: Starting page (level 0).
        max_levels: Level at which the crawl stops expanding pages.

    Returns:
        Tuple (visited_links, last_level_links, dico):
        - visited_links: every URL discovered, in discovery order (starts with `url`);
        - last_level_links: URLs sitting exactly at level `max_levels`;
        - dico: adjacency list {parent_url: [child_url, ...]}, one entry per link found
          on the page (a child repeated on the page is repeated in the list).

    Note:
        URLs are normalised by dropping the query string and fragment and by
        lower-casing the whole URL, which also lower-cases the path.
    """
    queue = deque([(url, 0)]) # BFS queue of (url, level)
    visited_links = [url] # Discovery order, returned to the caller
    seen = {url} # Same content as visited_links, kept as a set for fast membership tests
    last_level_links = [] # Links found at the maximum level
    dico = {} # Graph structure {parent : [child1, child2]}

    while queue:
        current_url, current_level = queue.popleft()

        if current_level >= max_levels:
            # Limit reached: the page is recorded (when exactly at the limit) but never fetched.
            # Each URL enters the queue at most once, so no duplicate check is needed here.
            if current_level == max_levels:
                last_level_links.append(current_url)
            continue

        soup = to_soup(current_url) # Fetching and parsing
        if not soup:
            continue

        dico.setdefault(current_url, [])

        content_area = soup.body # Broad extraction scope: the whole <body>, to retrieve as many links as possible
        if not content_area:
            continue

        # All <a href=...> tags, sorted by href so that the crawl order is reproducible
        anchors = sorted(content_area.find_all('a', href=True), key=lambda anchor: anchor.get('href'))

        for anchor in anchors:
            full_url = urljoin(current_url, anchor.get('href')) # Convert relative paths to absolute URLs
            full_url = full_url.split('?')[0].split('#')[0].lower() # Drop query string and fragment, then lower-case

            if any(noise in full_url for noise in filters_noise):
                continue

            dico[current_url].append(full_url) # Add edge to the adjacency list (Parent -> Child)

            if full_url not in seen: # New link: remember it and schedule it for the next level
                seen.add(full_url)
                visited_links.append(full_url)
                queue.append((full_url, current_level + 1))

    return visited_links, last_level_links, dico

In [61]:
def extract_links_depth_first(start_url, visited_links, dico, max_depth=3, max_links_per_page=10, current_depth=0):
    """Crawl a website recursively (Depth-First Search) starting from `start_url`.

    `visited_links` and `dico` are updated IN PLACE, so the caller sees the
    result even when ignoring the return value.

    Args:
        start_url: Page to fetch at this recursion level.
        visited_links: List of URLs already known; new URLs are appended to it.
        dico: Adjacency list {parent_url: [child_url, ...]}; new edges are appended to it.
        max_depth: Recursion depth at which pages are no longer fetched.
        max_links_per_page: Maximum number of NEW links followed from one page.
        current_depth: Depth of `start_url` (0 for the initial call).

    Returns:
        Tuple (visited_links, dico), the same objects that were passed in,
        on every path, including the early exits.
    """
    if current_depth >= max_depth:
        return visited_links, dico

    soup = to_soup(start_url) # Fetching and parsing
    if not soup:
        return visited_links, dico

    dico.setdefault(start_url, []) # Ensure the current URL exists in the adjacency list

    content_area = soup.body # Broad extraction scope: the whole <body>, to retrieve as many links as possible
    if not content_area:
        return visited_links, dico

    links_followed = 0 # Number of new links followed from this page, compared to max_links_per_page

    # All <a href=...> tags, sorted by href so that the crawl order is reproducible
    anchors = sorted(content_area.find_all('a', href=True), key=lambda anchor: anchor.get('href'))

    for anchor in anchors:
        if links_followed >= max_links_per_page: # Quota of followed links reached for this page
            break

        full_url = urljoin(start_url, anchor.get('href')) # Convert relative paths to absolute URLs
        full_url = full_url.split('?')[0].split('#')[0].lower() # Drop query string and fragment, then lower-case

        if any(noise in full_url for noise in filters_noise):
            continue

        dico[start_url].append(full_url) # Add edge to the adjacency list (Parent -> Child)

        if full_url not in visited_links: # New link: explore it deeply before moving to the next one
            visited_links.append(full_url)
            links_followed += 1

            # Recursive call: dive deeper into this branch
            extract_links_depth_first(start_url=full_url, visited_links=visited_links, dico=dico,
                                      max_depth=max_depth, max_links_per_page=max_links_per_page,
                                      current_depth=current_depth + 1)

    return visited_links, dico


In [62]:
def extract_links_depth_from_list(start_links, visited_links=None, dico=None, max_depth=3, max_links_per_page=10):
    """Launch one DFS crawl per seed URL, sharing the visited list and the graph.

    Args:
        start_links: Seed URLs. They are processed in sorted order so that the
            same input always gives the same crawl.
        visited_links: List of URLs already known (updated in place). A new
            list is created when None.
        dico: Adjacency list {parent_url: [child_url, ...]} (updated in place).
            A new dict is created when None; it is then not returned, so pass
            your own dict if you need the edges.
        max_depth: Forwarded to extract_links_depth_first.
        max_links_per_page: Forwarded to extract_links_depth_first.

    Returns:
        The list of visited links (seeds included).
    """
    if visited_links is None:
        visited_links = []
    if dico is None:
        dico = {}

    sorted_links = sorted(start_links) # Sorted because same input order = same results

    # Mark every seed as visited first, so that no DFS re-follows another seed
    for link in sorted_links:
        if link not in visited_links:
            visited_links.append(link)

    # One DFS per seed; every seed is crawled (no cap on the number of seeds)
    for start_url in sorted_links:
        extract_links_depth_first(start_url=start_url, visited_links=visited_links,
                                  dico=dico, max_depth=max_depth,
                                  max_links_per_page=max_links_per_page, current_depth=0)

    return visited_links

In [63]:
def crawl_MBG_seed_links(url):
    """Crawl a site in two phases: a wide BFS, then a DFS from the BFS frontier.

    Args:
        url: Starting page of the crawl.

    Returns:
        Tuple (all_links, dico):
        - all_links: every URL discovered, without duplicates, in discovery
          order (BFS results first, then the URLs added by the DFS), so that
          a later slice such as links[:n] is reproducible from run to run;
        - dico: adjacency list {parent_url: [child_url, ...]} filled by both phases.
    """
    # Fixed parameters
    MAX_BREADTH_LEVEL = 2
    MAX_DEPTH = 2
    MAX_LINKS_PER_PAGE = 5 # Number of new links followed per page during the DFS

    # Breadth-First Search (wide exploration)
    breadth_links, last_level_links, dico = extract_links_breadth_first(url, max_levels=MAX_BREADTH_LEVEL)

    # Depth-First Search from the BFS frontier (works on a copy, so breadth_links stays untouched)
    depth_links = extract_links_depth_from_list(start_links=last_level_links,
                                                visited_links=breadth_links.copy(),
                                                dico=dico, max_depth=MAX_DEPTH,
                                                max_links_per_page=MAX_LINKS_PER_PAGE)

    # Union of BFS and DFS results; dict keys remove duplicates while keeping first-seen order
    all_links = list(dict.fromkeys([*breadth_links, *depth_links]))

    return all_links, dico

In [64]:
def extract_wikipedia_links(url):
    """Extract internal Wikipedia article links from a Wikipedia page.

    Two areas of the page are scanned:
    - paragraphs (<p>) inside the divs whose class is "mw-content-ltr mw-parser-output";
    - list items (<li>) inside the 'div-col' divs, which the original author
      targets as the "See also" section.

    Only hrefs starting with "/wiki/" and containing no ":" are kept.

    Args:
        url: Address of the Wikipedia page to scan.

    Returns:
        List of absolute URLs without duplicates, in order of first appearance
        on the page. An empty list when the page could not be fetched.
    """
    soup = to_soup(url)
    if not soup:
        return []

    # (class of the container div, tag holding the links inside it)
    # NOTE(review): the first class string holds two class names; confirm it still matches the live markup.
    scanned_areas = (
        ("mw-content-ltr mw-parser-output", 'p'), # Main content paragraphs (article body)
        ('div-col', 'li'),                        # "See also" section (related articles)
    )

    wiki_links = {} # dict used as an insertion-ordered set

    for div_class, holder_tag in scanned_areas:
        for div in soup.find_all('div', class_=div_class):
            for holder in div.find_all(holder_tag):
                for anchor in holder.find_all('a', href=True):
                    href = anchor.get('href')
                    if href.startswith("/wiki/") and ":" not in href: # Keep links "/wiki/..." and exclude pages containing ":"
                        wiki_links[urljoin(url, href)] = None

    return list(wiki_links)

In [65]:
def crawl_wiki_seed_links(wiki_seed_pages):
    """Collect the Wikipedia links found on each seed page.

    Args:
        wiki_seed_pages: List of seed page URLs.

    Returns:
        Tuple (wiki_all_links, dico_wiki_links):
        - wiki_all_links: the seeds followed by every link found on them,
          without duplicates and in discovery order, so that a later slice
          such as links[:n] is reproducible from run to run;
        - dico_wiki_links: adjacency list {seed_url: links found on that seed}.
    """
    wiki_all_links = dict.fromkeys(wiki_seed_pages) # dict used as an insertion-ordered set
    dico_wiki_links = {}

    for url in wiki_seed_pages:
        crawl_links = extract_wikipedia_links(url)
        dico_wiki_links[url] = crawl_links
        wiki_all_links.update(dict.fromkeys(crawl_links)) # Only adds new links; known ones keep their position

    return list(wiki_all_links), dico_wiki_links

In [66]:
def save_to_csv(data, filename):
    """Write a list of dictionaries to a CSV file, with every field quoted.

    Args:
        data: List of dictionaries (e.g. the content built by get_html_content).
            The keys of the first dictionary become the column headers.
        filename: Path of the CSV file, overwritten if it already exists.

    Returns:
        None. Nothing is written when `data` is empty. Errors raised while
        writing are caught and reported on the console.
    """
    if not data:
        print(f"{RED}No data to save.{RESET}")
        return

    # Warn before overwriting an existing file
    file_already_there = os.path.exists(filename)
    if file_already_there:
        print(f"{BLUE}Note: '{filename}' already exists and will be overwritten{RESET}")

    try:
        columns = data[0].keys() # Columns detected from the first row

        # utf-8 encoding; newline='' leaves line endings to the csv module
        with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
            dict_writer = csv.DictWriter(csv_file, fieldnames=columns, quoting=csv.QUOTE_ALL)
            dict_writer.writeheader() # Column headers
            dict_writer.writerows(data) # One line per dictionary

        print(f"{GREEN}Success! Data saved to: {RESET}{os.path.abspath(filename)}")

    except Exception as e:
        print(f"{RED}Error saving file: {e}{RESET}")

In [67]:
def get_html_content(links, output_csv, max=None):
    """Fetch the raw HTML of a list of URLs and optionally save it to a CSV.

    Args:
        links: List of URLs to download.
        output_csv: Path of the CSV to write; nothing is saved when it is empty/None.
        max: Only the first `max` links are fetched (None = all of them).

    Returns:
        Tuple (content, failed_links):
        - content: list of {'url': ..., 'html': ...} dictionaries, one per fetched page;
        - failed_links: URLs for which fetch_url returned nothing usable.
    """
    content, failed_links = [], []

    for link in links[:max]:
        response = fetch_url(link)
        if not response:
            failed_links.append(link) # Keep track of failures for reporting
            continue
        content.append({'url': link, 'html': response.text}) # Raw HTML kept as text for later extraction

    print(f"{GREEN}Fetched: {len(content)}{RESET}")
    print(f"{RED}Failed: {len(failed_links)}{RESET}")

    # Save only when a destination was given and at least one page was fetched
    if content and output_csv:
        save_to_csv(content, output_csv)

    return content, failed_links

Préparation au link analysis

In [68]:
def create_nodes_csv(all_links, output_file):
    """Build a 'nodes' CSV from an adjacency list (graph dictionary).

    Every key of `all_links` and every element of its values is a node.
    Each node is written once, in order of first appearance (keys first, then
    targets), so the file is identical from one run to the next.

    Args:
        all_links: Adjacency list {source_url: [target_url, ...]}.
        output_file: Path of the CSV to create (or overwrite). It contains a
            single column named "node_id".
    """
    nodes = dict.fromkeys(all_links) # dict used as an insertion-ordered set: each key is a node

    for targets in all_links.values():
        nodes.update(dict.fromkeys(targets)) # Each target is a node too; known nodes keep their position

    with open(output_file, mode="w", newline="", encoding="utf-8") as f: # Create (or overwrite) the csv file
        writer = csv.writer(f)
        writer.writerow(["node_id"]) # Header
        writer.writerows([node] for node in nodes) # One row per node

In [69]:
def create_edges_csv(links, output_file):
    """Build an 'edges' CSV from an adjacency list (directed graph).

    Args:
        links: Adjacency list {source_url: [target_url, ...]}.
        output_file: Path of the CSV to create (or overwrite), with the columns
            "source" and "target".

    One row is written per (source, target) pair, in the order of the
    dictionary; self-loops (A -> A) are skipped, repeated targets are kept.
    """
    with open(output_file, mode="w", newline="", encoding="utf-8") as edge_file:
        edge_writer = csv.writer(edge_file)
        edge_writer.writerow(["source", "target"]) # Column headers

        edge_writer.writerows(
            [source, target]
            for source, targets in links.items()
            for target in targets
            if target != source # Avoid self-loops (A -> A)
        )

CONFIGURATION

In [None]:
# One entry per source processed by the pipeline:
#   "name"         -> prefix of the output files (see get_filenames)
#   "start_url"    -> argument handed to the crawler (a single URL or a list of seed URLs)
#   "crawler_func" -> function called with start_url; it returns (links, adjacency dict)
sources = [
    {"name": "MBG",
     "start_url": "https://www.mindbodygreen.com/",
     "crawler_func": crawl_MBG_seed_links}, # The function object itself is stored, it is not called here (no parentheses)

    {"name": "wiki",
     "start_url": wiki_seed_pages,
     "crawler_func": crawl_wiki_seed_links}
    ]

EXECUTION

In [None]:
"""========================== SOURCES PIPELINE =========================="""

"""=== SETUP ==="""
# For every configured source: crawl the links, scrape a sample of pages, then export the link graph.

MAX_PAGES_TO_SCRAPE = 15 # Only the first N crawled links are downloaded during scraping

for source in sources:
    source_name = source["name"]
    start_point = source["start_url"]

    files = get_filenames(source_name) # Build all standardized output paths for this source 

    print(f"=== SOURCE PROCESSING : {source_name} ===")
    
    print("\n=== CRAWLING ===")

    print(f"{BLUE}[{source_name}] Starting the crawling...{RESET}")    
    crawled_links, dico_links = source["crawler_func"](start_point) # Ex : crawl_MBG_seed_links("https://...") -> (links, adjacency dict)
    print(f"-> Found {len(crawled_links)} links")

    print("\n=== SCRAPING ===")
    
    print(f"{BLUE}[{source_name}] Starting the scraping...{RESET}")
    # Fetch the HTML of the first crawled URLs; get_html_content saves them as a CSV (url, html) when at least one page was fetched
    scraped_pages, failed_links = get_html_content(crawled_links, files['raw'], max=MAX_PAGES_TO_SCRAPE)
    if scraped_pages:
        print(f"{GREEN}Raw HTML data saved to {files['raw']}{RESET}")
    else: # Nothing was fetched, so no file was written: do not claim success
        print(f"{RED}No page could be fetched, nothing saved to {files['raw']}{RESET}")

    print("\n=== CREATE CSV OF NODES ===")

    print(f"{BLUE}[{source_name}] Creating nodes CSV...{RESET}")
    create_nodes_csv(dico_links, files['nodes']) # Every key and every target of the adjacency dict becomes a node
    print(f"{GREEN}Nodes CSV saved to {files['nodes']}{RESET}")

    print("\n=== CREATE CSV OF EDGES ===\n")
    print(f"{BLUE}[{source_name}] Creating edges CSV...{RESET}")
    create_edges_csv(dico_links, files['edges']) # Export directed edges (source -> target) from the adjacency list
    print(f"{GREEN}Edges CSV saved to {files['edges']}{RESET}") 

In [None]:
def filter_links(links, required_keywords=None, domain=None, already_seen=None):
    """Keep only the links that pass the keyword, domain and already-seen filters.

    Args:
        links: URLs to filter.
        required_keywords: When given and non-empty, a link is kept only if it
            contains at least one of these keywords (case-insensitive).
        domain: When given, a link is kept only if its network location
            (urlparse(...).netloc) is exactly this value.
        already_seen: Collection of links to reject.

    Returns:
        List of the links that passed every filter, in their original order.
        Duplicates inside `links` are not removed.
    """
    keywords = [] if required_keywords is None else required_keywords
    seen = set() if already_seen is None else already_seen

    def _is_kept(link):
        lowered = link.lower() # Keyword comparison ignores case
        if keywords and all(keyword.lower() not in lowered for keyword in keywords):
            return False # None of the required keywords is in the url
        if domain and urlparse(link).netloc != domain:
            return False # Link outside the requested domain
        return link not in seen # Reject links that are already known

    return [link for link in links if _is_kept(link)]

In [None]:
# Draft of a third entry for `sources` ("blogs"); as written it is a bare expression, not appended to the list.
# NOTE(review): crawl_feedspot_seed_links is not defined in the visible part of this notebook
# (only extract_feedspot_seed_pages is), and "output_file" is not a key of the other `sources` entries.
{"name": "blogs",
     "start_url": "https://bloggers.feedspot.com/lifestyle_blogs/?fbclid=IwY2xjawPEh9BleHRuA2FlbQIxMQBzcnRjBmFwcF9pZAEwAAEeokJD-GCqzqIYQFsjgINZ-moY9eFBlfazeS-PqjVE0WpSysKa_blRr5IkTts_aem_iiBbwTznbHBpSm1tmO_7Eg",
     "crawler_func": crawl_feedspot_seed_links,
     "output_file": "data/blogs.csv"}

In [None]:
def extract_feedspot_seed_pages(url):
    """Collect the external blog links listed on a Feedspot page.

    A link is kept when, once made absolute, it:
    - points to another domain than `url`;
    - contains none of the `filters_noise` fragments;
    - contains neither "mindbodygreen.com" nor "feedspot";
    - contains "http".

    Args:
        url: Address of the Feedspot listing page.

    Returns:
        Set of absolute URLs. An empty set when the page could not be fetched.
    """
    soup = to_soup(url)

    if not soup:
        print(f"{RED}Error: Could not fetch main blogs page: {url}{RESET}")
        return set()

    source_domain = urlparse(url).netloc
    links_blogs = set()

    for anchor in soup.find_all('a', href=True):
        # urljoin leaves absolute URLs unchanged and resolves relative ones against `url`,
        # so no prefix test is needed before calling it
        href = urljoin(url, anchor.get('href'))

        if urlparse(href).netloc == source_domain: # Internal link of the listing site
            continue
        if any(noise in href for noise in filters_noise):
            continue
        if "mindbodygreen.com" in href:
            continue
        if "http" in href and "feedspot" not in href:
            links_blogs.add(href)

    return links_blogs

In [None]:
# === Exemple d'utilisation ===
# === Usage example ===
# Collect the external blog links of the Feedspot listing page, then show how many were found and which ones
url_test = "https://bloggers.feedspot.com/lifestyle_blogs/?fbclid=IwY2xjawPEh9BleHRuA2FlbQIxMQBzcnRjBmFwcF9pZAEwAAEeokJD-GCqzqIYQFsjgINZ-moY9eFBlfazeS-PqjVE0WpSysKa_blRr5IkTts_aem_iiBbwTznbHBpSm1tmO_7Eg"

crawl = extract_feedspot_seed_pages(url_test)
print(len(crawl))
print(crawl)

In [None]:
# === Exemple d'utilisation ===
# === Usage example: BFS then DFS on two of the blogs collected in `crawl` ===
url_test = list(crawl)[:2]

for url in url_test:

    # Breadth-first crawl: returns the visited links, the links of the last level and the adjacency dict
    breath_links, last_visited, blog_dico = extract_links_breadth_first(url, max_levels=1)
    print("Breath Links:", len(breath_links))
    print(breath_links)
    print("Last Level Links:", len(last_visited))
    print(last_visited)

    # Depth-first crawl from the last BFS level; it needs the adjacency dict to record its edges
    depth_links = extract_links_depth_from_list(
        start_links=last_visited, visited_links=breath_links.copy(),  # copy so that the BFS list is not modified
        dico=blog_dico, max_depth=1, max_links_per_page=1)
    print("Depth Links:", len(depth_links))
    print(depth_links)