IMPORT IMPLEMENTATION

In [226]:
from bertopic import BERTopic
from bs4 import BeautifulSoup
from collections import Counter, deque
import csv
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import os
import pandas as pd
import random
import re 
import requests
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import time
import token 
from urllib.parse import urljoin, urlparse, unquote

In [227]:
nltk.download('punkt')
nltk.download('punkt_tab') # Nécessaire pour les nouvelles versions
nltk.download('wordnet')
nltk.download('omw-1.4')   # Souvent requis par le lemmatizer
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

IMPLEMENTATION OF CONSTANTS

In [228]:
# --- CONFIGURATION OF COLORS ---

GREEN = "\033[92m"
RED = "\033[91m"
YELLOW = "\033[93m"
BLUE = "\033[94m"
RESET = "\033[0m"

In [229]:
# --- CONFIGURATION OF HEADERS ---

HEADER_BROWSER = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/"}

session = requests.Session()
session.headers.update(HEADER_BROWSER)

In [230]:
# --- CONFIGURATION OF FILTERS ---

filters_noise = [r'/products?/', r'/shop/', r'/store/', r'/cart', r'/checkout', r'/account', 
           r'/login', r'/register', r'/tag', r'/search', r'/about', r'contact', r'privacy', r'terms', r'#', "youtube.com", 
           "instagram.com", "facebook.com", "twitter.com", "pinterest.com", "linkedin.com", "tiktok.com", 
           "amazon.com"]

In [231]:
# --- CONFIGURATION OF SEED LINKS FOR WIKIPEDIA ---

wiki_seed_pages = [ # Our selection of a few pages = starting list
    "https://en.wikipedia.org/wiki/Lifestyle",
    "https://en.wikipedia.org/wiki/Quality_of_life",
    "https://en.wikipedia.org/wiki/Standard_of_living",
    "https://en.wikipedia.org/wiki/Healthy_lifestyle",
    "https://en.wikipedia.org/wiki/Physical_fitness",
    "https://en.wikipedia.org/wiki/Well-being",
    "https://en.wikipedia.org/wiki/Mental_health",
    "https://en.wikipedia.org/wiki/Healthy_diet",
    "https://en.wikipedia.org/wiki/Nutrition",
    "https://en.wikipedia.org/wiki/Work%E2%80%93life_balance", 
    "https://en.wikipedia.org/wiki/Leisure",
    "https://en.wikipedia.org/wiki/Hobby",
    "https://en.wikipedia.org/wiki/Travel",
    "https://en.wikipedia.org/wiki/Outdoor_recreation",
    "https://en.wikipedia.org/wiki/Sustainable_living",
    "https://en.wikipedia.org/wiki/Sport",
    "https://en.wikipedia.org/wiki/Home",
    "https://en.wikipedia.org/wiki/Fashion",
    "https://en.wikipedia.org/wiki/Personal_care"]

IMPLEMENTATION OF DEFINITIONS

In [232]:
def get_filenames(source_name):
    directory_name = "data"
    if not os.path.exists(directory_name):
            os.makedirs(directory_name)
            print(f"{GREEN}Created directory : {directory_name}{RESET}")
    
    # Avertissement si écrasement (juste pour info)
    if os.path.exists(source_name):
        print(f"{BLUE}Note : The file is overwritten{RESET}") 
    
    base_name = source_name.replace(" ", "_")

    return {"raw":       os.path.join(directory_name, f"{base_name}_raw_html.csv"),
            "corpus":    os.path.join(directory_name, f"{base_name}_corpus.csv"),
            "cleaned":   os.path.join(directory_name, f"{base_name}_cleaned.csv"),
            "norm":      os.path.join(directory_name, f"{base_name}_corpus_norm.csv"),
            "tokenized": os.path.join(directory_name, f"{base_name}_corpus_tokenized.csv")}

In [233]:
def fetch_url(url) :

    try:
        response = session.get(url, timeout=10)
        if response.status_code == 200: 
            return response # If the status code is not OK (200) the function return none and an error message
        else:
            print(f"Failed to fetch the url: {url} with status code {response.status_code}")
            return None
    
    except requests.RequestException:
        return None

In [234]:
def to_soup(url):
    response = fetch_url(url)
    if response:  # If the response is not none, the function return the beautiful soup object
        return BeautifulSoup(response.text, 'html.parser') 
    else:
        return None

In [235]:
def extract_website_links(url, max_levels=3): 
    queue = deque([(url, 0)])
    visited_links = set([url])
    start_domain = urlparse(url).netloc

    is_feedspot = "feedspot.com" in start_domain

    while queue:
        current_url, current_level = queue.popleft()

        if current_level >= max_levels:
            continue
    
        soup = to_soup(current_url)
        if not soup:
            continue

        content_area = soup.body # Stay large (here) to get more links
        if not content_area:
            continue

        for item in content_area.find_all('a', href=True):
            href = item.get('href')
            if href.startswith('mailto:') or href.startswith('tel:') or href.startswith('javascript:'):
                continue

            full_url = urljoin(current_url, href)
            full_url = full_url.split('?')[0].split('#')[0]
            full_url = full_url.lower()

            # Si nouveau lien trouvé
            for filter in filters_noise:
                if filter in full_url:
                    break
            else:
                if full_url not in visited_links:
                    parsed_url = urlparse(full_url)
                    if is_feedspot: # Sur feedspot on prend uniquement les liens sortants
                        if parsed_url.netloc != start_domain:
                            visited_links.add(full_url)
                    else:
                        if parsed_url.netloc == start_domain: # Sur MBG on crawl uniquement les liens qui ont le même domaine
                            visited_links.add(full_url)
                            queue.append((full_url, current_level + 1))
                    
    return visited_links

In [236]:
def extract_wikipedia_links(url):
    soup = to_soup(url)

    if not soup:
        return set()

    wiki_links = set()

    # --- Main content ---
    to_visit = soup.find_all('div', class_="mw-content-ltr mw-parser-output") # Specific to wikipedia

    for div in to_visit:
        for item in div.find_all('a', href=True):
            href = item.get('href')
            if href.startswith("/wiki/") and ":" not in href:
                wiki_links.add(urljoin(url, href))

    return wiki_links

In [237]:
def extract_feedspot_seed_pages(url):
    
    soup = to_soup(url)
    links_blogs = set()

    if not soup:
        print(f"{RED}Error: Could not fetch main blogs page: {url}{RESET}")
        return set()

    blogs = soup.find_all('a', href=True) # and tag.get('class') and 'wb-ba' in tag.get('class') and any('ext' in c for c in tag.get('class')))

    for link in blogs:
        href = link.get('href') #if link.name == 'a' else link.text.strip()
        
        if not href.startswith('https?'):
            href = urljoin(url, href)

        if urlparse(href).netloc != urlparse(url).netloc:
            if any(filter in href for filter in filters_noise):
                continue
            if "mindbodygreen.com" in href:
                 continue
            if href not in links_blogs:
                links_blogs.add(href)
    
    return links_blogs

In [238]:
def crawl_wiki_seed_links(wiki_seed_pages):
    wiki_all_links = set(wiki_seed_pages) # Set() avoids duplication, unlike lists
    dico_wiki_links = {}

    for url in wiki_seed_pages:
        crawl_links = extract_wikipedia_links(url)
        dico_wiki_links[url] = crawl_links
        wiki_all_links.update(crawl_links) # Only add new links

    return wiki_all_links

In [239]:
def crawl_feedspot_seed_links(feedspot_seed_page):
    feedspot_all_links = set()
    dico_blogs_links = {}
    excluded = ["mindbodygreen.com", "www.mindbodygreen.com"]

    blogs_list = extract_feedspot_seed_pages(feedspot_seed_page)
    print(f" -> {len(blogs_list)} blogs trouvés sur Feedspot.")

    for url in blogs_list:
        if any(ex in url for ex in excluded):
            continue
        
        crawl_links = extract_website_links(url, max_levels=1)
        dico_blogs_links[url] = crawl_links
        feedspot_all_links.update(crawl_links)
    
    return feedspot_all_links

In [240]:
def wiki_url_to_label(url):
    title = urlparse(url).path.split("/")[-1]
    title = unquote(title)
    title = title.replace("_", " ")
    # ajout d'un suffixe si la page contient parenthèses
    if "(" in title:
        title = title.replace("(", " (").replace(")", ")")
    return title

In [241]:
def save_to_csv(data, filename):
    
    if not data:
        print(f"{RED}No data to save.{RESET}")
        return

    if not os.path.exists(filename):
        print(f"{RED} ERROR : File {filename} was not created")
 
    # Get the keys from the first dictionary for the CSV header from get_html_content
    fieldnames = data[0].keys() # Detection of the existing colons in the data file
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as f: # Opens the csv file as utf-8
            writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL) # Initialising a writer to write the dictionary into the csv file
            writer.writeheader() # writes the colons headers
            writer.writerows(data) # writes the rows
        print(f"{GREEN}Success! Data saved to: {RESET}{os.path.abspath(filename)}")
        
    except Exception as e:
        print(f"{RED}Error saving file: {e}{RESET}")

retirer le max=None et links[:max] pour run tout 

In [242]:
def get_html_content(links, output_csv, max=None):
    content = []
    failed_links = []

    for link in links[:max]:
        response = fetch_url(link)
        if response:
            content.append({'url': link, 'html': response.text})
        else:
            failed_links.append(link)

    print(f"{GREEN}Fetched: {len(content)}{RESET}")
    print(f"{RED}Failed: {len(failed_links)}{RESET}")

    if output_csv and content:
        save_to_csv(content, output_csv)

    return content, failed_links 

In [243]:
def clean_html(html):

    if html is None or pd.isna(html) or not isinstance(html, str):
        return ""
    
    soup = BeautifulSoup(html, 'html.parser')

    for tag in soup(['script', 'style', 'noscript']):  # Supress any unessecary tags
        tag.decompose()
    
    text = soup.get_text(separator=' ', strip=True)  # Collect all visible text
    text = re.sub(r'\s+', ' ', text)  # Supress any unecessary spaces

    return text

In [244]:
def clean_csv_file(input_csv, output_csv):
    df = pd.read_csv(input_csv)  # Take the csv file with the raw html as the input

    if 'corpus_text' not in df.columns:  # Verify that the html colon exists
        raise ValueError(f"The corpus column is missing in: {input_csv}")

    df['corpus_text'] = df['corpus_text'].fillna("")   
    df['cleaned_text'] = df['corpus_text'].apply(clean_html)  # Cleans the html column
    df = df[['url', 'cleaned_text']]  # Keep the url and text colon (not the raw html)

    df.to_csv(output_csv, index=False, encoding='utf-8')  # Creats a new csv file as the output of the function

    return df

In [245]:
def get_corpus(html):
    if not html: 
        return ""

    try:
        soup = BeautifulSoup(html, "lxml") #pip install lmxl requis
    except:
        soup = BeautifulSoup(html, "html.parser")

    unwanted_tags = ["script", "style", "nav", "noscript", "header", "footer", "aside", "form", "input", "button", "meta", "svg"]

    for tag in soup(unwanted_tags):
        tag.decompose()
    
    is_wikipedia = soup.find_all('div', class_="mw-content-ltr mw-parser-output")

    if not is_wikipedia: # For blogs
        regex_string = r"cookie-banner|popup|ad(vertisement)?|social|share|banner|newsletter|widget"
        unwanted_patterns = re.compile(regex_string, re.IGNORECASE)

        tags_to_remove = soup.find_all(attrs={"class": unwanted_patterns}) + soup.find_all(attrs={"id": unwanted_patterns})     

        for tag in tags_to_remove:
            tag.decompose()
        
    raw_text = soup.get_text(separator=' ', strip=True)
    text = ' '.join(raw_text.split())

    return text

In [246]:
def extract_corpus_csv_file(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    df['corpus_text'] = df['html'].apply(get_corpus)
    df[['url', 'corpus_text']].to_csv(output_csv, index=False)
    
    return df

In [247]:
def normalize_html(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # convert all letters to lowercase
    text = re.sub(r'\[\d+\]', ' ', text)  # remove reference numbers like [1], [2], etc.
    text = re.sub(r'\d+', ' ', text)  # remove all numbers
    text = re.sub(r'[^a-z\s]', ' ', text)  # keep only English letters and spaces
    text = re.sub(r'\s+', ' ', text)  # replace multiple spaces with a single space
    return text.strip()  # remove leading and trailing spaces

In [248]:
def normalize_csv_file(input_csv, output_csv):
    df = pd.read_csv(input_csv)  # Take the csv file with the cleaned text as the input

    if 'cleaned_text' not in df.columns:  # Verify that the cleaned text colon exists
        raise ValueError(f"The cleaned text is missing in: {input_csv}")

    df['cleaned_text'] = df['cleaned_text'].fillna("") # On force tout en string vide si c'est NaN
    df['normalized_text'] = df['cleaned_text'].apply(normalize_html)  # normalize the cleaned text
    df = df[['url', 'normalized_text']]  # keep the url and normalized text 

    df.to_csv(output_csv, index=False, encoding='utf-8')  # creats a new csv file as the output of the function

    return df

In [None]:
nltk.download('stopwords') 
stop_words = list(set(stopwords.words('english'))) + ["'s"]

stop_words = set(stopwords.words('english'))

#stem = nltk.stem.SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def tokenize_html(text):
    if not isinstance(text, str) or pd.isna(text):
        return [] # Return empty list for non-string or NaN inputs (= when text is missing)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in string.punctuation]  # remove punctuation
    tokens = [token for token in tokens if token not in stop_words]  # remove stopwords
    #tokens = [stem.stem(token) for token in tokens]  # apply stemming (racinisation)
    tokens = [token for token in tokens if len(token) > 1]  # Remove very short or meaningless tokens 
    tokens = [token for token in tokens if len(token) < 20]  # Remove very long or meaningless tokens 
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # apply lemmatization
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [250]:
def tokenize_csv_file(input_csv, output_csv):
    df = pd.read_csv(input_csv)  # Take the csv file with the normalized text as the input

    if 'normalized_text' not in df.columns:  # Verify that the normalized text colon exists
        raise ValueError(f"The normalized text is missing in: {input_csv}")

    df['normalized_text'] = df['normalized_text'].fillna("") # Replace holes by nothing to avoid error due to 'NaN'

    df['tokenized_text'] = df['normalized_text'].apply(tokenize_html)  # cleans the html colon
    df = df[['url', 'tokenized_text']]  # keep the url and text colon (not the raw html)

    df.to_csv(output_csv, index=False, encoding='utf-8')  # creats a new csv file as the output of the function
    print(df.head()) # Display the 5 first lines of the final dataframe

    return df

CONFIGURATION

In [251]:
sources = [
    {"name": "MBG",
     "start_url": "https://www.mindbodygreen.com/",
     "crawler_func": extract_website_links,
     "output_file": "data/MBG.csv"},  # We pass the specific function without calling it ()
    {"name": "wiki",
     "start_url": wiki_seed_pages,
     "crawler_func": crawl_wiki_seed_links,
     "output_file": "data/wiki.csv"},
    {"name": "blogs",
     "start_url": "https://bloggers.feedspot.com/lifestyle_blogs/?fbclid=IwY2xjawPEh9BleHRuA2FlbQIxMQBzcnRjBmFwcF9pZAEwAAEeokJD-GCqzqIYQFsjgINZ-moY9eFBlfazeS-PqjVE0WpSysKa_blRr5IkTts_aem_iiBbwTznbHBpSm1tmO_7Eg",
     "crawler_func": crawl_feedspot_seed_links,
     "output_file": "data/blogs.csv"}
    ]

EXECUTION

In [252]:
"""========================== SOURCES PIPELINE =========================="""

"""=== SETUP ==="""
for source in sources:
    filename = source["output_file"]
    source_name = source["name"]
    start_point = source["start_url"]

    files = get_filenames(source_name)

    print(f"=== SOURCE TRAITEMENT : {source_name} ===")

    print("\n=== CRAWLING ===")

    print(f"{BLUE}[{source_name}] Starting the crawling...{RESET}")    
    crawled_links = source["crawler_func"](start_point) # Ex : crawled_links = extract_website_links(https://....)
    print(f"-> Found {len(crawled_links)} links")

    print("\n=== SCRAPING ===")
    
    print(f"{BLUE}[{source_name}] Starting the scraping...{RESET}")
    get_html_content(list(crawled_links), files['raw'], max=10)
    print(f"{GREEN}Raw HTML data saved to {files['raw']}{RESET}")

    print("\n=== CORPUS EXTRACTION ===")

    print(f"{BLUE}[{source_name}] Extracting the corpus...{RESET}")
    extract_corpus_csv_file(files["raw"], files["corpus"])
    print(f"{GREEN}Corpus data saved to {files['corpus']}{RESET}")

    print("\n=== CLEANING ===")

    print(f"{BLUE}[{source_name}] Cleaning the data...{RESET}")
    clean_csv_file(files["corpus"], files["cleaned"])
    print(f"{GREEN}Cleaned text saved to {files['cleaned']}{RESET}")

    print("\n=== NORMALIZATION ===")

    print(f"{BLUE}[{source_name}] Normalizing the data...{RESET}")
    normalize_csv_file(files["cleaned"], files["norm"])
    print(f"{GREEN}Normalized text saved to {files['norm']}{RESET}")

    print("\n=== TOKENIZATION ===")

    print(f"{BLUE}[{source_name}] Tokenizing text...{RESET}")
    tokenize_csv_file(files["norm"], files["tokenized"])
    print(f"{GREEN}Tokenized data in {files['tokenized']}{RESET}\n")

print(f"\n=== MERGING DATASETS ===")
all_dfs = []
for source in sources:
    f_token = get_filenames(source["name"])["tokenized"]
    df = pd.read_csv(f_token)
    df['source_origin'] = source["name"] # On garde une trace de l'origine !
    all_dfs.append(df)

df_global = pd.concat(all_dfs, ignore_index=True)
df_global.to_csv("GLOBAL_dataset_tokenized.csv", index=False)
print(f"{GREEN}SUCCESS ! Final data in GLOBAL_dataset_tokenized.csv{RESET}\n")

=== SOURCE TRAITEMENT : MBG ===

=== CRAWLING ===
[94m[MBG] Starting the crawling...[0m
-> Found 4331 links

=== SCRAPING ===
[94m[MBG] Starting the scraping...[0m
[92mFetched: 10[0m
[91mFailed: 0[0m
[92mSuccess! Data saved to: [0mc:\Users\User\Desktop\Web Mining\Projet 2025 - Arshik, Violaine\Projet_Web_Mining\Data_Collection_Jupyter_Lab\data\MBG_raw_html.csv
[92mRaw HTML data saved to data\MBG_raw_html.csv[0m

=== CORPUS EXTRACTION ===
[94m[MBG] Extracting the corpus...[0m
[92mCorpus data saved to data\MBG_corpus.csv[0m

=== CLEANING ===
[94m[MBG] Cleaning the data...[0m
[92mCleaned text saved to data\MBG_cleaned.csv[0m

=== NORMALIZATION ===
[94m[MBG] Normalizing the data...[0m
[92mNormalized text saved to data\MBG_corpus_norm.csv[0m

=== TOKENIZATION ===
[94m[MBG] Tokenizing text...[0m
                                                 url  \
0  https://www.mindbodygreen.com/articles/solawav...   
1  https://www.mindbodygreen.com/articles/read-th...   
2  ht

In [253]:
def filter_links(links, required_keywords=None, domain=None, already_seen=None):
    if required_keywords is None:  # If no list of required keywords is given then we create an empty one
        required_keywords = []
    if already_seen is None:  # If no list of already seen links is given then we create an empty one
        already_seen = set()
    
    filtered = []
    for l in links:
        l_lower = l.lower()  # Transformation of capital letter into lower case letter
        if required_keywords and not any(keyword.lower() in l_lower for keyword in required_keywords):  # The function skip the url if no required keywords are in the url
            continue
        if domain and urlparse(l).netloc != domain:  # The function filter the links that are not in the domain
            continue
        if l in already_seen:  # The function filter the links already in the list of links
            continue
        filtered.append(l) # If the url past all the filter, it is added to the list of links
    return filtered