In [21]:
import getpass
import os

if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

Enter your Groq API key:  ········


In [22]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from bs4 import BeautifulSoup
import requests
from typing import List, Dict
import os
import pickle

class URLContentProcessor:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the URL content processor.
        Args:
            model_name: Name of the sentence-transformer model for embeddings
        """
        # Initialize embedding model
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        
        # Initialize text splitter for chunking
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )

    def _extract_main_content(self, url: str) -> Dict[str, str]:
        """
        Extract the main content and title from a URL.
        """
        try:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract title
            title = soup.title.string if soup.title else ''
            
            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'footer', 'header']):
                element.decompose()
            
            # Extract main content (focusing on article or main content areas)
            main_content = ""
            priority_elements = [
                soup.find('article'),
                soup.find('main'),
                soup.find(class_='content'),
                soup.find(class_='post-content'),
                soup.find(class_='article-content')
            ]
            
            for element in priority_elements:
                if element:
                    main_content = element.get_text(separator=' ', strip=True)
                    break
            
            # If no main content found, get all paragraph text
            if not main_content:
                paragraphs = soup.find_all('p')
                main_content = ' '.join(p.get_text(strip=True) for p in paragraphs)
            
            return {
                "title": title.strip(),
                "content": main_content.strip()
            }
            
        except Exception as e:
            print(f"Error processing URL {url}: {e}")
            return {"title": "", "content": ""}

    def process_urls(self, urls: List[str]) -> Dict[str, Dict]:
        """
        Process multiple URLs and store their content in vector stores.
        """
        url_data = {}
        
        for url in urls:
            print(f"Processing {url}...")
            
            # Extract content
            content_dict = self._extract_main_content(url)
            
            if content_dict["content"]:
                # Split content into chunks
                chunks = self.text_splitter.split_text(content_dict["content"])
                # Create vector store
                vectorstore = FAISS.from_texts(chunks, self.embeddings)
                
                # Store the processed data
                url_data[url] = {
                    "title": content_dict["title"],
                    "chunks": chunks,
                    "vectorstore": vectorstore
                }
                
                print(f"Successfully processed {url}")
                print(f"Title: {content_dict['title']}")
                print(f"Number of chunks: {len(chunks)}")
                print("-" * 50)
            else:
                print(f"No content extracted from {url}")
                print("-" * 50)
        
        return url_data

    def save_vectorstores(self, url_data: Dict[str, Dict], filepath: str):
        """
        Save the vector stores to disk.
        """
        try:
            # Create a new dictionary with just the vector stores
            vector_stores = {url: data["vectorstore"] for url, data in url_data.items()}
            
            # Save to disk
            with open(filepath, 'wb') as f:
                pickle.dump(vector_stores, f)
            
            print(f"Vector stores saved to {filepath}")
            
        except Exception as e:
            print(f"Error saving vector stores: {e}")

    def load_vectorstores(self, filepath: str) -> Dict[str, FAISS]:
        """
        Load vector stores from disk.
        """
        try:
            with open(filepath, 'rb') as f:
                vector_stores = pickle.load(f)
            
            print(f"Vector stores loaded from {filepath}")
            return vector_stores
            
        except Exception as e:
            print(f"Error loading vector stores: {e}")
            return {}


In [45]:
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from typing import List, Dict, Tuple
import numpy as np

class AnchorTextAnalyzer:
    def __init__(self, api_key: str):
        self.llm = ChatGroq(
            api_key=api_key,
            model_name="llama-3.3-70b-versatile",
            temperature=0,
            # max_tokens=1024
        )
        
        self.anchor_prompt = PromptTemplate(
            input_variables=["context", "main_text"],
            template="""
            Analyze the following main text and context to suggest natural anchor text opportunities from main text only.
            Consider semantic relevance and SEO best practices.
            
            CONTEXT FROM RELATED CONTENT:
            {context}
            
            MAIN TEXT TO ANALYZE:
            {main_text}
            
            Provide a list of potential anchor text word or phrases from main text in the following format:
            - Anchor Text: [suggested word or phrase]
            - Context: [brief explanation why this is relevant]
            - Target URL: [url where this anchor should point]
            
            Focus on natural, contextual word or phrases that would make sense to readers.
            """
        )

    def find_similar_chunks(self, 
                            vectorstore, 
                            query_text: str,
                            url: str,
                            similarity_threshold: float = 0.7,
                            k: int = 3) -> List[Tuple[str, float, str]]:
        """
        Find semantically similar chunks from the vector store.
        Returns: List of tuples (chunk_text, similarity_score, source_url)
        """
        results = vectorstore.similarity_search_with_score(query_text, k=k)
        filtered_results = [
            (doc.page_content, score, url)
            for doc, score in results
            if score >= similarity_threshold
        ]
        return filtered_results

    def suggest_anchor_text(self, 
                          main_text: str, 
                          loaded_vectorstores: Dict,
                          chunk_size: int = 500) -> List[Dict]:
        """
        Analyze main text and suggest anchor text opportunities using vector stores.
        """
        # Split main text into manageable chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=50
        )
        main_chunks = text_splitter.split_text(main_text)
        
        anchor_suggestions = []
        
        for chunk in main_chunks:
            # Find relevant content from vector stores
            relevant_contexts = []
            for url, vectorstore in loaded_vectorstores.items():
                similar_chunks = self.find_similar_chunks(vectorstore, chunk, url=url)
                relevant_contexts.extend(similar_chunks)
            
            # Sort by similarity score and take top matches
            relevant_contexts.sort(key=lambda x: x[1], reverse=True)
            top_contexts = relevant_contexts[:3]
            
            if not top_contexts:
                continue
                
            # Prepare context for LLM
            context_text = "\n".join([
                f"Related Content (from {url}):\n{text}\nSimilarity: {score:.2f}"
                for text, score, url in top_contexts
            ])
            
            # Generate anchor text suggestions using Groq
            prompt = self.anchor_prompt.format(
                context=context_text,
                main_text=chunk
            )
            
            response = self.llm.invoke(prompt)
            response_text = response.content
            
            # Parse and format suggestions
            suggestions = self._parse_suggestions(response_text, top_contexts)
            anchor_suggestions.extend(suggestions)
            
        return anchor_suggestions
    
    def _parse_suggestions(self, 
                         llm_response: str, 
                         context_chunks: List[Tuple[str, float, str]]) -> List[Dict]:
        """
        Parse LLM response into structured anchor text suggestions.
        """
        suggestions = []
        current_suggestion = {}
        
        for line in llm_response.split('\n'):
            line = line.strip()
            if not line:
                continue
                
            if line.startswith('- Anchor Text:'):
                if current_suggestion:
                    suggestions.append(current_suggestion)
                current_suggestion = {'anchor_text': line.split(':', 1)[1].strip()}
            elif line.startswith('- Context:'):
                current_suggestion['context'] = line.split(':', 1)[1].strip()
            elif line.startswith('- Target URL:'):
                current_suggestion['target_url'] = line.split(':', 1)[1].strip()
        
        if current_suggestion:
            suggestions.append(current_suggestion)
            
        return suggestions



In [46]:
new_blog = """What Is a Computerized Maintenance Management System (CMMS)?
A computerized maintenance management system (CMMS) is a software program that tracks the maintenance and repair activities of your equipment. The CMMS keeps track of all the equipment, including what it is and how much it costs, who owns it, who is responsible for its care, how old it is, when it was last serviced, what parts need to be replaced, how long it will take to complete the repair or service, and other information.

The CMMS software can also track the labor hours spent on each piece of equipment so you know how much time has been spent working on each unit. This helps you keep track of your employees' productivity levels and see which ones are doing their jobs well.

A CMMS may also allow you to create custom reports based on specific criteria so you can get the most out of your data collection efforts. For example, if you want to see which parts need replacement most often for a particular machine or group of machines then this feature would be very useful because it would give you immediate access to that information without having to wait until someone manually inputs data into another spreadsheet file or database system where it might take several days before those results become available.

Why Is Having a CMMS Important?
A CMMS is an essential tool for any business. It allows you to monitor your equipment, assets and maintenance projects, as well as keep track of your company's repair history and service records.

A CMMS can be a lifesaver for your company, especially when you are trying to keep track of all the maintenance needs of your facility. Without it, you will likely end up with too many issues that go unchecked, or even worse — you might not even know what the problems are in the first place!

With a CMMS, you will be able to keep track of things like:

What machines need repair and how long they have been out of service.
How often machines need repairs and how much money is being spent on them each year.
Which parts are failing most often on each machine and why they are failing so often (so that they can be replaced).
Generally, a CMMS can also help you:

Improve efficiency by using the latest technology.
Reduce costs by reducing waste and inefficiencies.
Save time by automating routine tasks and providing real-time updates on equipment performance.
Reduce risk by keeping track of all your critical assets with one easy-to-use tool that automatically updates in real-time so you always know what is going on at any given time!
How Do I Choose the Right CMMS for My Business?
Choosing the right CMMS for your business can be a difficult process, but it is important to get it right and to do your research so as to make sure you choose the right software for your particular needs. Here are some things to keep in mind when choosing a CMMS:

What kind of data do you need?
How much data do you have?
How many assets do you have?
What features are most important to you?"""

In [47]:
def main():
    # Previous code remains the same
    urls = [
        "https://www.xenia.team/articles/hotel-maintenance-management-software",
    ]
    
    processor = URLContentProcessor()
    url_data = processor.process_urls(urls)
    processor.save_vectorstores(url_data, "vectorstores.pkl")
    loaded_vectorstores = processor.load_vectorstores("vectorstores.pkl")
    
    # Initialize anchor text analyzer
    analyzer = AnchorTextAnalyzer(api_key=os.environ['GROQ_API_KEY'])
    
    
    # Get anchor text suggestions
    suggestions = analyzer.suggest_anchor_text(
        main_text=new_blog,
        loaded_vectorstores=loaded_vectorstores
    )
    
    # Print suggestions
    for suggestion in suggestions:
        print(f"\nSuggested Anchor Text: {suggestion['anchor_text']}")
        print(f"Context: {suggestion['context']}")
        print(f"Target URL: {suggestion['target_url']}")
    
    return suggestions

if __name__ == "__main__":
    main()

Processing https://www.xenia.team/articles/hotel-maintenance-management-software...
Successfully processed https://www.xenia.team/articles/hotel-maintenance-management-software
Title: 19 Best Hotel Maintenance Management Software
Number of chunks: 117
--------------------------------------------------
Vector stores saved to vectorstores.pkl
Vector stores loaded from vectorstores.pkl

Suggested Anchor Text: Custom Reports
Context: The main text mentions the ability to create custom reports based on specific criteria, making this a relevant anchor text for a URL that provides more information on report creation.
Target URL: https://www.xenia.team/articles/hotel-maintenance-management-software (or a specific section on report creation)

Suggested Anchor Text: Data Collection Efforts
Context: The main text highlights the importance of getting the most out of data collection efforts, making this a relevant anchor text for a URL that provides more information on data collection and analysis.