#### Libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datasets import load_dataset

#### First Website - labyrinthos

In [None]:
# Find interpretations
def get_section_text(start_tag, stop_tags=('h1', 'h2', 'h3', 'h4')):
    """Collect the paragraph text that follows ``start_tag`` up to the next heading.

    Walks ``start_tag.next_siblings`` in order. A ``<p>`` sibling contributes
    its own stripped text; a ``<div>`` sibling contributes the stripped text
    of every ``<p>`` found inside it. Iteration stops at the first sibling
    whose tag name is in ``stop_tags``.

    Args:
        start_tag: Element whose following siblings are scanned.
        stop_tags: Tag names that mark the end of the section.

    Returns:
        The non-empty paragraph texts joined by single spaces ("" if none).
    """
    texts = []
    for sibling in start_tag.next_siblings:
        tag_name = sibling.name
        if tag_name in stop_tags:
            break
        if tag_name == 'p':
            paragraphs = [sibling]
        elif tag_name == 'div':
            paragraphs = sibling.find_all('p')
        else:
            continue
        for paragraph in paragraphs:
            text = paragraph.get_text(strip=True)
            # Empty paragraphs are skipped in both branches so the joined
            # result never contains doubled or leading/trailing spaces.
            if text:
                texts.append(text)
    return " ".join(texts)

# Extract the name of a Tarot Card
def extract_card_name(soup):
    """Derive the card name from the page's ``<title>`` text.

    Keeps the part of the title before the first standalone word "Meaning",
    removes every "Tarot Card" occurrence and trims surrounding whitespace.
    Returns "Unknown Card" when no ``<title>`` tag is found.
    """
    title = soup.find('title')
    if title:
        before_meaning = re.split(r'\bMeaning\b', title.get_text(strip=True))[0]
        return before_meaning.strip().replace("Tarot Card", "").strip()
    return "Unknown Card"

def parse_card_page(url):
    """Download one card page and extract one interpretation per topic.

    For each topic (general, love, career, finance) the first h1-h4 heading
    whose text matches that topic's pattern is located, and the paragraph
    text following it is collected with ``get_section_text``.

    Args:
        url: Address of the card page to fetch.

    Returns:
        A DataFrame with columns ``source``, ``topic``, ``interpretation``
        and ``card``; one row per topic whose heading was found. The frame
        is empty when the page cannot be retrieved.
    """
    columns = ['source', 'topic', 'interpretation', 'card']
    topics_patterns = {
        'general': re.compile(
            r"(Upright.*Meaning)"
            r"|(.*Strengths)"  # (2) Suit of Cups Strengths
            r"|(.*Weaknesses)" # (3) Suit of Cups Weaknesses
        ),

        'love': re.compile(
            r"(Upright.*Meaning:\s*Love\s*$)"           # (1) "Upright ... Meaning: Love"
            r"|(Tarot Love Meaning\s*-\s*Upright.*)"    # (2) "Tarot Love Meaning - Upright ..."
            r"|(Upright.*Tarot Love Meaning.*)"         # (3) "Upright Ace of Wands Tarot Love Meaning"
            r"|(.*Love Meaning\s*\(Upright\).*)"        # (4) "Death Tarot Card Love Meaning (Upright)"
            r"|(.*in Love)"                             # (5) The Suit of Cups in Love
            r"|(Upright.*Love Meaning)"                 # (6) Upright Hierophant Love Meaning

        ),

        'career': re.compile(
            r"(Career Meaning\s*[–-]\s*Upright.*)" 
            r"|(Upright Career Meaning -.*)"            # (2) Upright Career Meaning - 2 of Cups
            r"|(.*in Career and Creativity)"            # (3) The Suit of Cups in Career and Creativity
        ),

        'finance': re.compile(
            r"(Finances Meaning\s*[–-]\s*Upright.*)"
            r"|(Upright Finances Meaning -.*)"          # (2) Upright Finances Meaning - 2 of Cups
            r"|(.*in Finances)"                         # (3) The Suit of Cups in Finances
        ),
    }

    try:
        # Without a timeout a single unresponsive server would block the
        # whole crawl indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Network failures are reported like HTTP errors instead of aborting
        # the crawl over the remaining pages.
        print(f"[WARNING] Unable to retrieve page: {url} ({exc})")
        return pd.DataFrame(columns=columns)
    if not response.ok:
        print(f"[WARNING] Unable to retrieve page: {url}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.text, 'html.parser')
    card_name = extract_card_name(soup)

    # Collect all headings (h1..h4)
    headings = soup.find_all(re.compile('^h[1-4]$'))
    data = []

    for topic, pattern in topics_patterns.items():
        # First heading (in document order) whose text matches the topic regex
        matched_tag = next(
            (heading for heading in headings
             if pattern.search(heading.get_text(strip=True))),
            None,
        )
        if matched_tag is None:
            print(f"[WARNING] No heading found for '{topic}' (pattern='{pattern.pattern}') on page: {url}")
            continue

        data.append({
            'source': url,
            'topic': topic,
            'interpretation': get_section_text(matched_tag, stop_tags=('h1','h2','h3','h4')),
            'card': card_name
        })

    return pd.DataFrame(data, columns=columns)


if __name__ == "__main__":
    # 1) Collect links to all cards from the main page
    MAIN_URL = "https://labyrinthos.co/blogs/tarot-card-meanings-list/"
    # A timeout keeps an unresponsive server from hanging the script.
    response_main = requests.get(MAIN_URL, timeout=30)
    if not response_main.ok:
        # An error page simply yields no card links below.
        print(f"[WARNING] Unable to retrieve page: {MAIN_URL}")
    soup_main = BeautifulSoup(response_main.text, 'html.parser')

    list_path = "/blogs/tarot-card-meanings-list/"
    unique_links = set()
    for a_tag in soup_main.find_all('a', href=True):
        href = a_tag['href']
        # Keep links below the list path, but not the list page itself
        if href.startswith(list_path) and href != list_path:
            unique_links.add("https://labyrinthos.co" + href)

    # Remove duplicates; sorting makes the crawl order reproducible
    card_links = sorted(unique_links)

    # Filter out redundant namings
    suit_words = ("the-suit-of-cups", "the-suit-of-pentacles",
                  "the-suit-of-swords", "the-suit-of-wands")

    filtered_card_links = [
        link for link in card_links
        if not any(suit_word in link.lower() for suit_word in suit_words)
    ]

    # Parse each card, then concatenate once: growing a DataFrame with
    # pd.concat inside the loop copies all accumulated rows on every pass.
    card_frames = [parse_card_page(url) for url in filtered_card_links]
    card_frames = [frame for frame in card_frames if not frame.empty]
    if card_frames:
        all_data = pd.concat(card_frames, ignore_index=True)
    else:
        all_data = pd.DataFrame(columns=['source','topic','interpretation','card'])
# Replace the per-page URL in 'source' with a fixed label for this website,
# and strip a leading "The " from card names.
all_data = (
    all_data
    .drop(columns="source")
    .assign(
        source='website_1',
        card=lambda frame: frame['card'].str.replace(r'^The\s+', '', regex=True),
    )
)

#### Second Dataset - Hugging Face

In [None]:
# Load the dataset
ds = load_dataset("quanquyt/Detailed_Tarot_meanings")

# Convert the "train" split to a DataFrame
df = ds["train"].to_pandas()

# Remove redundant text: drop rows whose query asks for the short meaning
# or the keywords meaning (missing queries count as "no match" and are kept)
queries = df['query']
mask_short = queries.str.startswith("The short meaning of", na=False)
mask_keywords = queries.str.startswith("the keywords meaning of", na=False)
df = df.loc[~mask_short & ~mask_keywords].copy()

# Define the topic
def get_topic(query: str) -> "str | None":
    """Classify a query string into one of the interpretation topics.

    The query is lower-cased and stripped, then checked in this order:
    ends with "career" -> "career"; ends with "love" -> "love"; ends with
    "finances" -> "finance"; contains "the general meaning of" -> "general".

    Args:
        query: Query text. Non-string values (e.g. NaN or None coming from a
            missing cell) are accepted and treated as unclassifiable.

    Returns:
        The topic label, or None when no rule matches.
    """
    # Missing cells reach this function as float NaN / None; calling
    # ``.lower()`` on them would raise AttributeError.
    if not isinstance(query, str):
        return None

    q_lower = query.lower().strip()
    suffix_topics = (
        ("career", "career"),
        ("love", "love"),
        ("finances", "finance"),
    )
    for suffix, topic in suffix_topics:
        if q_lower.endswith(suffix):
            return topic
    if "the general meaning of" in q_lower:
        return "general"
    return None

# Label every row with the topic derived from its query
df = df.assign(topic=df['query'].apply(get_topic))

# Extract the name of a Tarot Card
def extract_card_name(text: str) -> str:
    """Return the card name found in the part of ``text`` before the first colon.

    The leading part is trimmed, then a fixed sequence of case-insensitive
    clean-ups removes "the general meaning [of [the]]" and "the" from the
    start and " is" / " meaning" from the end.
    """
    # Applied in this exact order: longer leading phrases first, then the
    # trailing words.
    redundant_patterns = (
        r"(?i)^the general meaning of the\s*",
        r"(?i)^the general meaning of\s*",
        r"(?i)^the general meaning\s*",
        r"(?i)^the\s+",
        r"(?i)\s+is$",
        r"(?i)\s+meaning$",
    )

    name = text.partition(":")[0].strip()
    for pattern in redundant_patterns:
        name = re.sub(pattern, "", name)
    return name.strip()

# Derive the card name of every row from the start of its document text
df = df.assign(card_name=df['document'].apply(extract_card_name))

# Delete redundant text
def remove_prefix_before_colon(text: str) -> str:
    """Return the stripped text after the first colon, or ``text`` unchanged if it has none."""
    _, colon, remainder = text.partition(":")
    # Without a colon the input is returned as is (not stripped)
    return remainder.strip() if colon else text

# Pre-final dataset
df['document'] = df['document'].apply(remove_prefix_before_colon)

# Document length is the tie-breaker between rows sharing a card and topic.
# It is computed here unless the frame already carries a 'doc_len' column
# (NOTE(review): confirm whether the dataset ships one).
if 'doc_len' not in df.columns:
    df['doc_len'] = df['document'].str.len()

# `ascending` needs exactly one flag per sort key. Sorting doc_len in
# descending order puts the longest document first within each
# (card_name, topic) group, so keep="first" retains it.
df.sort_values(by=["card_name", "topic", "doc_len"], ascending=[True, True, False], inplace=True)
df.drop_duplicates(subset=["card_name", "topic"], keep="first", inplace=True)
df.drop(columns="doc_len", inplace=True)

# Final dataset
df = df[['topic', 'document', 'card_name']]
df = df.rename(columns={'document': 'interpretation', 'card_name': 'card'})
df['source'] = 'website_2'

#### Third Website – astrotalk

In [None]:
# Find interpretations
def get_text_until_next_h5(heading_tag):
    """Join the text of every ``<p>`` element that follows ``heading_tag``.

    Iterates ``heading_tag.next_elements`` and stops at the first element
    named 'h5'. Each ``<p>`` contributes its stripped text (empty strings
    included); the pieces are joined with a single space.
    """
    def _paragraph_texts():
        for node in heading_tag.next_elements:
            # The starting heading itself never ends the section
            if node is heading_tag:
                continue
            if node.name == 'h5':
                return
            if node.name == 'p':
                yield node.get_text(strip=True)

    return " ".join(_paragraph_texts())

# Extract the name of a tarot card
def extract_card_name(soup):
    """Return the stripped text of the ``<h1 class="main-heading">`` tag, or "Unknown Card" if it is not found."""
    main_heading = soup.find("h1", class_="main-heading")
    return main_heading.get_text(strip=True) if main_heading else "Unknown Card"

def parse_card_page(url):
    """Download one card page and extract the upright interpretations per topic.

    Every ``<h5>`` heading is tested against the topic patterns in the order
    general, love, finance, career; the first matching topic labels the
    heading, and the paragraph text up to the next ``<h5>`` is collected with
    ``get_text_until_next_h5``. Several headings may yield the same topic.

    Args:
        url: Address of the card page to fetch.

    Returns:
        A DataFrame with columns ``source``, ``topic``, ``interpretation``
        and ``card``; one row per matched heading. The frame is empty when
        the page cannot be retrieved.
    """
    columns = ['source', 'topic', 'interpretation', 'card']

    topics_patterns = {
        'general': re.compile(
            r"(upright.*tarot\s+card\s+meaning)"   # Upright ... Tarot Card meaning
            r"|(upright.*of\s+\w+\s+meaning)"  # Upright Eight of Wands Meaning
            r"|(Reversed Five of Cups tarot card meaning)", # mistake from the website
            re.IGNORECASE
        ),
        'love': re.compile(r"love.*\(upright\)", re.IGNORECASE),
        'finance': re.compile(r"financ.*\(upright\)", re.IGNORECASE),
        'career': re.compile(r"career.*\(upright\)", re.IGNORECASE),
    }

    try:
        # Without a timeout a single unresponsive server would block the
        # whole crawl indefinitely.
        resp = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Network failures are reported like HTTP errors instead of aborting
        # the crawl over the remaining pages.
        print(f"[WARNING] Unable to open page: {url} ({exc})")
        return pd.DataFrame(columns=columns)
    if not resp.ok:
        print(f"[WARNING] Unable to open page: {url}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(resp.text, 'html.parser')
    card_name = extract_card_name(soup)

    data = []
    for heading in soup.find_all('h5'):
        heading_text = heading.get_text(strip=True)
        # First topic (in dict order) whose pattern occurs in the heading
        matched_topic = next(
            (topic for topic, pattern in topics_patterns.items()
             if pattern.search(heading_text)),
            None,
        )
        if matched_topic is None:
            continue

        data.append({
            'source': url,
            'topic': matched_topic,
            'interpretation': get_text_until_next_h5(heading),
            'card': card_name
        })

    return pd.DataFrame(data, columns=columns)

# Get all possible links for different tarot cards
def get_all_astrotalk_card_links():
    """Collect the absolute URLs of all card pages linked from the tarot index.

    Fetches ``https://astrotalk.com/tarot`` and keeps every anchor whose
    ``href`` starts with "/tarot/" and has something after that prefix.

    Returns:
        A sorted list of unique absolute URLs; an empty list when the index
        page cannot be retrieved.
    """
    MAIN_URL = "https://astrotalk.com/tarot"
    try:
        # A timeout keeps an unresponsive server from hanging the script.
        resp = requests.get(MAIN_URL, timeout=30)
    except requests.RequestException as exc:
        print(f"[WARNING] Unable to open: {MAIN_URL} ({exc})")
        return []
    if not resp.ok:
        print(f"[WARNING] Unable to open: {MAIN_URL}")
        return []

    soup = BeautifulSoup(resp.text, 'html.parser')
    card_prefix = "/tarot/"
    links = set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith(card_prefix) and len(href) > len(card_prefix):
            links.add("https://astrotalk.com" + href)
    # Sorted so the crawl order (and resulting row order) is reproducible
    return sorted(links)

if __name__ == "__main__":
    all_card_urls = get_all_astrotalk_card_links()

    # Parse every card page, then concatenate once: growing a DataFrame with
    # pd.concat inside the loop copies all accumulated rows on every pass.
    third_frames = [parse_card_page(url) for url in all_card_urls]
    third_frames = [frame for frame in third_frames if not frame.empty]
    if third_frames:
        all_data_third = pd.concat(third_frames, ignore_index=True)
    else:
        all_data_third = pd.DataFrame(columns=['source','topic','interpretation','card'])

# Replace the per-page URL in 'source' with a fixed label for this website
all_data_third = all_data_third.drop(columns="source").assign(source='website_3')

# Normalise card names: strip a leading "The " and fix one lower-cased name
all_data_third['card'] = (
    all_data_third['card']
    .str.replace(r'^The\s+', '', regex=True)
    .replace("five of Pentacles", "Five of Pentacles")
)

#### JOIN parsed data-sets

In [None]:
# Stack the three sources and order rows by source, card and topic
# (original row indices are kept, so index values may repeat)
final_df = (
    pd.concat([all_data, df, all_data_third])
    .sort_values(by=['source', 'card', 'topic'])
)
final_df

Unnamed: 0,topic,interpretation,card,source
310,career,The upright Ace of Cups is a sign of both new ...,Ace of Cups,website_1
311,finance,The friendly and sociable quality of the Ace o...,Ace of Cups,website_1
308,general,Getting the Ace of Cups upright shows that it'...,Ace of Cups,website_1
309,love,One of happiest cards to get in a love tarot r...,Ace of Cups,website_1
254,career,"New opportunities are available to you now, th...",Ace of Pentacles,website_1
...,...,...,...,...
45,love,If you are in a relationship with someone or s...,Wheel of Fortune,website_3
308,career,The World might stand in for achieving your pr...,World,website_3
307,finance,The World suggests that wealth should be direc...,World,website_3
305,general,Getting The World (upright) in your tarot card...,World,website_3
