IMPORT IMPLEMENTATION

In [248]:
from bertopic import BERTopic
from bs4 import BeautifulSoup
from collections import Counter, deque
import csv
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import os
import pandas as pd
import random
import re 
import requests
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import time
import token 
from urllib.parse import urljoin, urlparse, unquote

In [310]:
# Download the NLTK resources (tokenizer models, WordNet data, stop-word lists).
nltk.download('punkt')
nltk.download('punkt_tab') # Required by recent NLTK versions
nltk.download('wordnet')
nltk.download('omw-1.4')   # Often required by the lemmatizer
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

IMPLEMENTATION OF CONSTANTS

In [249]:
# --- CONFIGURATION OF COLORS ---
# ANSI escape sequences used to colour the messages printed by the notebook.
# RESET must be emitted after a coloured message to restore the default colour.

GREEN = "\033[92m"
RED = "\033[91m"
YELLOW = "\033[93m"
BLUE = "\033[94m"
RESET = "\033[0m"

In [250]:
# --- CONFIGURATION OF HEADERS ---
# Browser-like request headers sent with every request made through `session`.

HEADER_BROWSER = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/"}

# Single shared session: fetch_url() sends all its requests through it.
session = requests.Session()
session.headers.update(HEADER_BROWSER)

In [None]:
# --- CONFIGURATION OF FILTERS ---
# Substrings that mark a URL as noise: the crawlers drop any link whose URL
# contains one of these fragments (plain substring test, not a path match).
# NOTE(review): "wc" matches every URL containing those two letters anywhere - confirm this is intended.
# NOTE(review): "#" cannot match in the breadth/depth crawlers because they strip the fragment
# before filtering; it only has an effect where the raw href is tested.

filters_noise = [
    "/product", "/products", "/shop", "/store", "/cart", "/checkout",
    "/account", "/login", "/register", "/tag", "/search",
    "/about", "contact", "privacy", "terms", "#",
    "mailto:", "tel:", "javascript:",
    ".jpg", ".png", "accessibility", "editorial-process",
    "data-collection", "disclaimer", "wc", "sponsor",
    "youtube.com", "instagram.com", "facebook.com", "twitter.com",
    "pinterest.com", "linkedin.com", "tiktok.com", "amazon.com"
]

In [252]:
# --- CONFIGURATION OF SEED LINKS FOR WIKIPEDIA ---
# Hand-picked Wikipedia articles used as the starting points of the Wikipedia crawl.

wiki_seed_pages = [ # Our selection of a few pages = starting list
    "https://en.wikipedia.org/wiki/Lifestyle",
    "https://en.wikipedia.org/wiki/Lifestyle_trends_and_media",
    "https://en.wikipedia.org/wiki/Self-care",
    "https://en.wikipedia.org/wiki/Physical_fitness",
    "https://en.wikipedia.org/wiki/Healthy_diet",
    "https://en.wikipedia.org/wiki/Travel",
    "https://en.wikipedia.org/wiki/Outdoor_recreation",
    "https://en.wikipedia.org/wiki/Sustainable_living",
    "https://en.wikipedia.org/wiki/Fashion",
]

IMPLEMENTATION OF DEFINITIONS

In [None]:
def get_filenames(source_name):
    """Build the output file paths used by the pipeline for one source.

    Creates the ``data`` directory when it does not exist yet, and prints an
    informational note when at least one of the target files is already on
    disk (a later write will overwrite it).

    Args:
        source_name: Label of the source (e.g. "MBG"). Spaces are replaced by
            underscores to form the file-name prefix.

    Returns:
        dict mapping the keys "raw", "corpus", "cleaned", "norm", "tokenized",
        "nodes" and "edges" to file paths inside the ``data`` directory.
    """
    directory_name = "data"
    if not os.path.exists(directory_name):
        os.makedirs(directory_name, exist_ok=True)
        print(f"{GREEN}Created directory : {directory_name}{RESET}")

    base_name = source_name.replace(" ", "_")

    suffixes = {"raw":       "raw_html.csv",
                "corpus":    "corpus.csv",
                "cleaned":   "cleaned.csv",
                "norm":      "corpus_norm.csv",
                "tokenized": "corpus_tokenized.csv",
                "nodes":     "nodes.csv",
                "edges":     "edges.csv"}
    files = {key: os.path.join(directory_name, f"{base_name}_{suffix}")
             for key, suffix in suffixes.items()}

    # Overwrite warning (information only): look at the real output paths,
    # not at a path named after the source itself.
    if any(os.path.exists(path) for path in files.values()):
        print(f"{BLUE}Note : The file is overwritten{RESET}")

    return files

In [254]:
def fetch_url(url) :
    """Fetch a URL through the shared session, politely and defensively.

    Waits a random 1-3 seconds before each request, then performs a GET with
    a 10 second timeout.

    Args:
        url: Address to request.

    Returns:
        The response object when the status code is 200; None when the status
        code differs or when the request raised a requests.RequestException.
        In both failure cases a message is printed.
    """
    # Random pause between requests to avoid hammering the target servers.
    time.sleep(random.uniform(1, 3))

    try:
        response = session.get(url, timeout=10)
    except requests.RequestException as error:
        # Report network-level failures instead of dropping them silently.
        print(f"Failed to fetch the url: {url} ({error})")
        return None

    if response.status_code == 200:
        return response

    print(f"Failed to fetch the url: {url} with status code {response.status_code}")
    return None

In [255]:
def to_soup(url):
    """Download `url` and parse it with BeautifulSoup's 'html.parser'.

    Returns the parsed document, or None when fetch_url() gave no usable response.
    """
    page = fetch_url(url)
    return BeautifulSoup(page.text, 'html.parser') if page else None

In [None]:
def extract_links_breadth_first(url, max_levels=1):
    """Crawl outgoing links level by level, starting from `url`.

    Pages at a level strictly below `max_levels` are downloaded and their
    links collected; pages sitting exactly at `max_levels` are recorded but
    not downloaded.

    Args:
        url: Start page.
        max_levels: Number of link levels to expand.

    Returns:
        A tuple (visited_links, last_level_links, dico) where
        - visited_links is the list of every URL discovered, in discovery
          order (the start URL first),
        - last_level_links is the list of URLs located exactly at `max_levels`,
        - dico maps each downloaded page to the list of kept links found on
          it (duplicates on a page are kept).
    """
    queue = deque([(url, 0)])
    visited_links = [url]   # list, to keep the discovery order
    seen = {url}            # same content as visited_links, for O(1) membership tests
    last_level_links = []
    dico = {}

    while queue:
        current_url, current_level = queue.popleft()

        if current_level >= max_levels:
            # A URL is queued at most once, so no duplicate check is needed here.
            if current_level == max_levels:
                last_level_links.append(current_url)
            continue

        soup = to_soup(current_url)
        if not soup:
            continue

        dico.setdefault(current_url, [])

        content_area = soup.body  # Stay large (here) to get more links
        if not content_area:
            continue

        # Sort by href so that the crawl order does not depend on page layout.
        links = sorted(content_area.find_all('a', href=True), key=lambda x: x.get('href'))

        for item in links:
            href = item.get('href')

            # Absolute URL without query string or fragment, lower-cased.
            full_url = urljoin(current_url, href)
            full_url = full_url.split('?')[0].split('#')[0]
            full_url = full_url.lower()

            if any(noise in full_url for noise in filters_noise):
                continue

            dico[current_url].append(full_url)

            # Newly discovered link: remember it and schedule it for the next level.
            if full_url not in seen:
                seen.add(full_url)
                visited_links.append(full_url)
                queue.append((full_url, current_level + 1))

    return visited_links, last_level_links, dico

In [None]:
def extract_links_depth_first(start_url, visited_links, dico, max_depth=3, max_links_per_page=10, current_depth=0):
    """Recursively follow links from `start_url`, depth first.

    `visited_links` and `dico` are modified in place so that the state is
    shared across the recursive calls.

    Args:
        start_url: Page to download and expand.
        visited_links: List of URLs already discovered; new URLs are appended.
        dico: Mapping page -> list of kept links found on that page; updated
            for every page that is downloaded.
        max_depth: Recursion stops once `current_depth` reaches this value.
        max_links_per_page: Maximum number of new links followed per page.
        current_depth: Depth of `start_url` (0 for the initial call).

    Returns:
        The tuple (visited_links, dico), on every path, including the early
        exits (depth limit reached, page not fetched, page without body).
    """
    if current_depth >= max_depth:
        return visited_links, dico

    soup = to_soup(start_url)
    if not soup:
        return visited_links, dico

    dico.setdefault(start_url, [])
    links_followed = 0

    content_area = soup.body
    if not content_area:
        return visited_links, dico

    # Sort by href so that the crawl order does not depend on page layout.
    links = sorted(content_area.find_all('a', href=True), key=lambda x: x.get('href'))
    for item in links:
        if links_followed >= max_links_per_page:
            break

        href = item.get('href')

        # Absolute URL without query string or fragment, lower-cased.
        full_url = urljoin(start_url, href)
        full_url = full_url.split('?')[0].split('#')[0]
        full_url = full_url.lower()

        if any(noise in full_url for noise in filters_noise):
            continue

        dico[start_url].append(full_url)

        # Only links never seen before count towards the per-page budget.
        if full_url not in visited_links:
            visited_links.append(full_url)
            links_followed += 1
            extract_links_depth_first(start_url=full_url, visited_links=visited_links, dico=dico,
                                      max_depth=max_depth, max_links_per_page=max_links_per_page,
                                      current_depth=current_depth + 1)
    return visited_links, dico


SyntaxError: invalid syntax (1529110911.py, line 36)

In [258]:
def extract_links_depth_from_list(start_links, visited_links, dico, max_depth=3, max_links_per_page=10):
    """Run the depth-first crawl from several start pages.

    The start links are processed in sorted order. All of them are first
    registered in `visited_links`; then only the first `max_links_per_page`
    of them are handed to extract_links_depth_first().

    Returns the (possibly extended) `visited_links` list; a new list is
    created when None is passed.
    """
    visited_links = [] if visited_links is None else visited_links
    ordered_starts = sorted(start_links)

    # Register the start links as already visited
    for candidate in ordered_starts:
        if candidate not in visited_links:
            visited_links.append(candidate)

    for seed in ordered_starts[:max_links_per_page]:
        extract_links_depth_first(
            start_url=seed,
            visited_links=visited_links,
            dico=dico,
            max_depth=max_depth,
            max_links_per_page=max_links_per_page,
            current_depth=0,
        )

    return visited_links

In [259]:
def crawl_MBG_seed_links(url, max_breadth_level=2, max_depth=2, max_links_per_page=10):
    """Crawl a site breadth first, then depth first from the last breadth level.

    Args:
        url: Start page of the crawl.
        max_breadth_level: Number of levels expanded by the breadth-first pass.
        max_depth: Maximum depth of the depth-first pass.
        max_links_per_page: Maximum number of new links followed per page
            during the depth-first pass.

    Returns:
        A tuple (all_links, dico): the list of unique URLs in discovery order
        (breadth-first results first), and the mapping page -> links found on
        that page, filled by both passes.
    """
    # Breadth-first crawl
    breadth_links, last_level_links, dico = extract_links_breadth_first(url, max_levels=max_breadth_level)

    # Depth-first crawl from the links of the last level; a copy is passed so
    # that breadth_links itself is not modified.
    depth_links = extract_links_depth_from_list(start_links=last_level_links,
                                                visited_links=breadth_links.copy(),
                                                dico=dico,
                                                max_depth=max_depth,
                                                max_links_per_page=max_links_per_page)

    # De-duplicate while keeping the discovery order, so that the result is
    # the same on every run (a set would give an arbitrary order).
    all_links = list(dict.fromkeys([*breadth_links, *depth_links]))
    return all_links, dico

In [260]:
def extract_wikipedia_links(url):
    """Collect the internal article links of one Wikipedia page.

    Links are taken from the paragraphs of the main content container and
    from the list items of 'div-col' containers. Only hrefs starting with
    "/wiki/" and containing no ":" are kept.

    Args:
        url: Address of the Wikipedia page.

    Returns:
        Sorted list of unique absolute URLs; an empty list when the page
        could not be fetched.
    """
    soup = to_soup(url)
    if not soup:
        return []

    # (class of the container div, tag holding the links inside it)
    sections = [
        ("mw-content-ltr mw-parser-output", "p"),   # main content
        ("div-col", "li"),                          # "See also" section
    ]

    wiki_links = set()
    for css_class, holder_tag in sections:
        for div in soup.find_all('div', class_=css_class):
            for holder in div.find_all(holder_tag):
                for item in holder.find_all('a', href=True):
                    href = item.get('href')
                    if href.startswith("/wiki/") and ":" not in href:
                        wiki_links.add(urljoin(url, href))

    # Sorted so that the output is identical from one run to the next.
    return sorted(wiki_links)

In [261]:
def crawl_wiki_seed_links(wiki_seed_pages):
    """Expand each Wikipedia seed page by one level.

    Returns a tuple (links, outgoing): `links` is the list of unique URLs
    made of the seeds plus everything extract_wikipedia_links() found on
    them, and `outgoing` maps each seed to the links found on that seed.
    """
    collected = set(wiki_seed_pages)  # a set ignores URLs that are already known
    outgoing = {}

    for seed in wiki_seed_pages:
        found = extract_wikipedia_links(seed)
        outgoing[seed] = found
        collected.update(found)

    return list(collected), outgoing

In [None]:
def save_to_csv(data, filename):
    """Write a list of dictionaries to a CSV file (UTF-8, every field quoted).

    The column names are the keys of the first dictionary. An existing file
    is overwritten.

    Args:
        data: List of dictionaries sharing the same keys. When empty, nothing
            is written and a message is printed.
        filename: Path of the CSV file to create.

    Returns:
        None. Success or failure is reported with a printed message; errors
        raised while writing are caught and printed.
    """
    if not data:
        print(f"{RED}No data to save.{RESET}")
        return

    # Header columns come from the first record.
    fieldnames = list(data[0].keys())

    try:
        # newline='' lets the csv module control the line endings.
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
            writer.writeheader()
            writer.writerows(data)
        print(f"{GREEN}Success! Data saved to: {RESET}{os.path.abspath(filename)}")

    except Exception as e:
        print(f"{RED}Error saving file: {e}{RESET}")

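Note: remove `max=None` and `links[:max]` to run on all links (with the default `max=None`, `links[:max]` already covers the whole list; pass an integer only to limit a test run).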

In [263]:
def get_html_content(links, output_csv, max=None):
    """Download the HTML of the first `max` links (all of them when max is None).

    Prints how many pages were fetched and how many failed. When
    `output_csv` is set and at least one page was fetched, the pages are
    written with save_to_csv().

    Returns a tuple (content, failed_links): `content` is a list of
    {'url': ..., 'html': ...} dictionaries, `failed_links` the list of links
    for which fetch_url() returned nothing usable.
    """
    content, failed_links = [], []

    for link in links[:max]:
        response = fetch_url(link)
        if not response:
            failed_links.append(link)
            continue
        content.append({'url': link, 'html': response.text})

    print(f"{GREEN}Fetched: {len(content)}{RESET}")
    print(f"{RED}Failed: {len(failed_links)}{RESET}")

    if content and output_csv:
        save_to_csv(content, output_csv)

    return content, failed_links

Preparation for link analysis

In [264]:
def create_nodes_csv(all_links, output_file="nodes.csv"):
    """Write the node list of the link graph to a one-column CSV file.

    `all_links` maps a source URL to the URLs it links to. Every key and
    every target becomes one node; duplicates are removed. The file starts
    with the header "node_id".
    """
    nodes = set(all_links)  # the sources ...
    for source in all_links:
        nodes.update(all_links[source])  # ... plus every target they point to

    with open(output_file, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["node_id"])
        writer.writerows([node] for node in nodes)

In [265]:
def create_edges_csv(links, output_file="edges.csv"):
    """Write the edge list of the link graph to a two-column CSV file.

    `links` maps a source URL to the URLs it links to. One "source,target"
    row is written per link, in dictionary order; links from a page to
    itself are skipped and repeated links are kept.
    """
    with open(output_file, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["source", "target"])  # column headers

        for source, targets in links.items():
            # Iterate over set(targets) instead to drop repeated A -> B links.
            writer.writerows(
                [source, target]
                for target in targets
                if target != source  # no self-loop (A -> A)
            )

CONFIGURATION

In [266]:
# One entry per data source processed by the pipeline:
#   name         - label used in messages and as file-name prefix
#   start_url    - argument handed to crawler_func (a URL, or a list of seed URLs)
#   crawler_func - crawl function for this source; the function object itself is
#                  stored here, it is only called later by the pipeline
#   output_file  - output path (the pipeline currently builds its paths with get_filenames instead)
sources = [
    {"name": "MBG",
     "start_url": "https://www.mindbodygreen.com/",
     "crawler_func": crawl_MBG_seed_links,
     "output_file": "data/MBG.csv"},
    {"name": "wiki",
     "start_url": wiki_seed_pages,
     "crawler_func": crawl_wiki_seed_links,
     "output_file": "data/wiki.csv"}
    ]

EXECUTION

In [None]:
"""========================== SOURCES PIPELINE =========================="""

# For every configured source: crawl the links, download the raw HTML of each
# crawled link, then export the link graph as a nodes CSV and an edges CSV.

"""=== SETUP ==="""
for source in sources:
    #filename = source["output_file"]
    source_name = source["name"]
    start_point = source["start_url"]

    # Output paths for this source (also creates the data directory if needed)
    files = get_filenames(source_name)


    print(f"=== SOURCE TRAITEMENT : {source_name} ===")

    
    print("\n=== CRAWLING ===")

    print(f"{BLUE}[{source_name}] Starting the crawling...{RESET}")    
    # crawler_func returns (list of crawled links, {page: [links found on that page]})
    crawled_links, dico_links = source["crawler_func"](start_point) # Ex : crawled_links = extract_website_links(https://....)
    print(f"-> Found {len(crawled_links)} links")

   
    print("\n=== SCRAPING ===")
    
    print(f"{BLUE}[{source_name}] Starting the scraping...{RESET}")
    # NOTE(review): the message below is printed even when get_html_content saved nothing.
    get_html_content((crawled_links), files['raw'])
    print(f"{GREEN}Raw HTML data saved to {files['raw']}{RESET}")
    

    print("\n=== CREATE CSV OF NODES ===")

    print(f"{BLUE}[{source_name}] Creating nodes CSV...{RESET}")
    # dico_links is already a {source_url: [target_urls]} mapping, the shape create_nodes_csv reads
    create_nodes_csv(dico_links, output_file=files['nodes'])
    print(f"{GREEN}Nodes CSV saved to {files['nodes']}{RESET}")

    print("\n=== CREATE CSV OF EDGES ===")
    print(f"{BLUE}[{source_name}] Creating edges CSV...{RESET}")
    create_edges_csv(dico_links, output_file=files['edges'])
    print(f"{GREEN}Edges CSV saved to {files['edges']}{RESET}") 

=== SOURCE TRAITEMENT : MBG ===

=== CRAWLING ===
[94m[MBG] Starting the crawling...[0m
Failed to fetch the url: https://bit.ly/cawalnutsgalette with status code 404
Failed to fetch the url: https://bit.ly/cawalnutsswisschardsalad with status code 404
Failed to fetch the url: https://bit.ly/cawalnutswellington with status code 404
Failed to fetch the url: https://go.skimresources.com with status code 404
-> Found 1032 links

=== SCRAPING ===
[94m[MBG] Starting the scraping...[0m
Failed to fetch the url: https://podcasts.apple.com/us/podcast/id1246494475/episodes with status code 500
Failed to fetch the url: https://www.amjmed.com/article/s0002-9343(14)00138-7/fulltext with status code 403
Failed to fetch the url: https://onlinelibrary.wiley.com/doi/10.1111/sms.70113 with status code 403
Failed to fetch the url: https://pubmed.ncbi.nlm.nih.gov/28329045/ with status code 403
Failed to fetch the url: https://karger.com/ger/article/62/2/138/149063/kicking-back-cognitive-ageing-leg-powe

In [None]:
def filter_links(links, required_keywords=None, domain=None, already_seen=None):
    """Return the links that pass every active filter, in their original order.

    Args:
        links: URLs to filter.
        required_keywords: When given and non-empty, a link is kept only if it
            contains at least one keyword (case-insensitive comparison).
        domain: When given, a link is kept only if its network location is
            exactly this value.
        already_seen: Collection of links to exclude. It is only read, never
            updated, so duplicates inside `links` are kept.
    """
    keywords = [keyword.lower() for keyword in (required_keywords or [])]
    seen = set() if already_seen is None else already_seen

    def _is_kept(link):
        # Keyword filter: at least one keyword must occur in the lower-cased URL.
        if keywords:
            lowered = link.lower()
            if not any(keyword in lowered for keyword in keywords):
                return False
        # Domain filter: exact match on the network location.
        if domain and urlparse(link).netloc != domain:
            return False
        # Exclusion list.
        return link not in seen

    return [link for link in links if _is_kept(link)]

In [None]:
# Draft entry for a third source (lifestyle blogs listed on Feedspot).
# NOTE(review): this dictionary is a bare expression, it is not added to `sources`.
# NOTE(review): `crawl_feedspot_seed_links` is not defined in the visible part of the
# notebook (only `extract_feedspot_seed_pages` is) - define it before running this cell.
{"name": "blogs",
     "start_url": "https://bloggers.feedspot.com/lifestyle_blogs/?fbclid=IwY2xjawPEh9BleHRuA2FlbQIxMQBzcnRjBmFwcF9pZAEwAAEeokJD-GCqzqIYQFsjgINZ-moY9eFBlfazeS-PqjVE0WpSysKa_blRr5IkTts_aem_iiBbwTznbHBpSm1tmO_7Eg",
     "crawler_func": crawl_feedspot_seed_links,
     "output_file": "data/blogs.csv"}

In [129]:
def extract_feedspot_seed_pages(url):
    """Collect the external blog URLs listed on a Feedspot directory page.

    A link is kept when, once made absolute, it points to another network
    location than `url`, contains none of the `filters_noise` fragments,
    contains neither "mindbodygreen.com" nor "feedspot", and contains "http".

    Args:
        url: Address of the directory page.

    Returns:
        Set of kept URLs; an empty set (plus a printed error) when the page
        could not be fetched.
    """
    soup = to_soup(url)
    if not soup:
        print(f"{RED}Error: Could not fetch main blogs page: {url}{RESET}")
        return set()

    source_domain = urlparse(url).netloc
    links_blogs = set()

    for link in soup.find_all('a', href=True):
        # urljoin leaves absolute URLs untouched and resolves relative ones,
        # so no scheme test is needed beforehand.
        href = urljoin(url, link.get('href'))

        if urlparse(href).netloc == source_domain:
            continue  # internal link of the directory itself
        if any(noise in href for noise in filters_noise):
            continue
        if "mindbodygreen.com" in href or "feedspot" in href:
            continue
        if "http" in href:
            links_blogs.add(href)

    return links_blogs

In [133]:
# === Exemple d'utilisation ===
# Usage example: list the external blogs found on the Feedspot lifestyle page.
url_test = "https://bloggers.feedspot.com/lifestyle_blogs/?fbclid=IwY2xjawPEh9BleHRuA2FlbQIxMQBzcnRjBmFwcF9pZAEwAAEeokJD-GCqzqIYQFsjgINZ-moY9eFBlfazeS-PqjVE0WpSysKa_blRr5IkTts_aem_iiBbwTznbHBpSm1tmO_7Eg"

# `crawl` is a set of URLs; it is reused by the next example cell.
crawl = extract_feedspot_seed_pages(url_test)
print(len(crawl))
print(crawl)

99
{'http://www.mrpogitips.com/', 'https://theperennialstyle.com/', 'https://flow-spirit.com/lifestyle-blogs/', 'https://onbetterliving.com/', 'https://julieblanner.com/', 'https://joylynnlifestyle.com/', 'https://an-ideal-life.com/category/an-ideal-life/', 'https://freshexchange.com/blog/', 'https://www.thirteenthoughts.com/', 'https://terilynadams.com/', 'https://www.zoella.co.uk/', 'https://helloadamsfamily.com/', 'https://www.mrglitterati.com/', 'https://witanddelight.com/', 'https://www.elizabethrider.com/', 'https://www.primermagazine.com/', 'https://cupofjo.com/', 'https://witwhimsy.com/', 'https://socialifestylemag.com/', 'https://freshlandmag.com/', 'https://lifestyleandberries.com/category/lifestyle/', 'https://jaceyoutwest.com/the-blog/', 'https://hollybeetells.com/', 'https://www.theskinnyconfidential.com/', 'https://treasuresandtravelsblog.com/blog/', 'https://goddesslikebody.com/', 'https://www.sazan.me/blog', 'https://livinginyellow.com/', 'https://www.ahealthysliceoflif

In [None]:
# === Usage example ===
# Crawl two of the blogs found above. `crawl` is a set, so it is sorted
# first to select the same two URLs on every run.
url_test = sorted(crawl)[:2]

for url in url_test:

    # Breadth-first crawl (returns the visited links, the links of the last
    # level and the page -> links mapping)
    breath_links, last_visited, dico = extract_links_breadth_first(url, max_levels=1)
    print("Breath Links:", len(breath_links))
    print(breath_links)
    print("Last Level Links:", len(last_visited))
    print(last_visited)

    # Depth-first crawl; the list is copied so that breath_links is not modified
    depth_links = extract_links_depth_from_list(
        start_links=last_visited, visited_links=breath_links.copy(), dico=dico,
        max_depth=1, max_links_per_page=1)
    print("Depth Links:", len(depth_links))
    print(depth_links)

Breath Links: 117
['http://www.mrpogitips.com/', 'http://www.mrpogitips.com/feeds/comments/default', 'http://www.mrpogitips.com/feeds/posts/default', 'http://blogsngpinoy.com', 'http://feedjit.com/', 'http://ho.lazada.com.ph/shvukn', 'http://jasperroberts.com', 'http://mrpogitips.blogspot.com/p/blog-page.html', 'http://mrpogitips.blogspot.com/p/new-site.html', 'http://mrpogitips.blogspot.com/p/sample.html', 'http://www.addtoany.com/share_save', 'http://www.blogger.com', 'http://www.mrpogitips.com', 'http://www.mrpogitips.com/2013/', 'http://www.mrpogitips.com/2013/05/', 'http://www.mrpogitips.com/2013/06/', 'http://www.mrpogitips.com/2013/06/pigeon-powder-review.html', 'http://www.mrpogitips.com/2014/', 'http://www.mrpogitips.com/2014/07/', 'http://www.mrpogitips.com/2014/07/zen-zest-tea-tree-oil-review.html', 'http://www.mrpogitips.com/2014/10/', 'http://www.mrpogitips.com/2015/', 'http://www.mrpogitips.com/2015/03/', 'http://www.mrpogitips.com/2015/06/', 'http://www.mrpogitips.com/20