This code extracts the title, headings, and domain name of a website from its URL.

In [None]:
!pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.2-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.2-py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.2


In [None]:
import requests
from bs4 import BeautifulSoup
import tldextract
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_website_info(url, timeout=10):
    """Fetch a web page and extract its title, headings, and domain name.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``(title, headings, domain_name)`` tuple. ``title`` is the text of
        the ``<title>`` element, or ``'No title found'`` when the element is
        missing or contains no visible text. ``headings`` is a list of the
        stripped heading texts grouped by level (every ``h1`` first, then
        every ``h2``, and so on up to ``h6``). ``domain_name`` is the
        ``domain`` field of ``tldextract.extract(url)``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    # Without an explicit timeout, requests waits indefinitely on a host that
    # accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Turn 4xx/5xx answers into exceptions

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # ``soup.title.string`` is None when <title> is empty or holds nested
    # markup, so read the text with get_text() and fall back explicitly.
    title_text = soup.title.get_text() if soup.title else ''
    title = title_text if title_text.strip() else 'No title found'

    # Collect headings level by level (h1 to h6), preserving that grouping
    headings = []
    for level in range(1, 7):
        headings.extend(tag.get_text().strip() for tag in soup.find_all(f'h{level}'))

    # Extract the domain name
    domain_name = tldextract.extract(url).domain

    return title, headings, domain_name

# Example usage: fetch one page and show what was extracted from it
url = "https://bekushal.com"
query = "bekushal"

title, headings, domain_name = get_website_info(url)

# One print call, one field per output line
print(
    f"Title: {title}",
    f"Headings: {headings}",
    f"Domain Name: {domain_name}",
    sep="\n",
)

Title: BEkushal
Headings: ['BEkushal.', 'Mind Body Mentor', 'About Us', 'Frequently Asked Questions', '1.\n\t\t    Does Yoga truly help in improving the quality of life?', '2.\n                    Is Yoga just another form of exercise?', '3.\n                    What if I am still unable to make progress even after practicing Yoga for some time?', '4.\n                    Do you have any community of like minded people that I can join?', 'Videos on Practical Spirituality', 'Blog on Indian Philosophy', 'IKS Texts GitHub Repo', 'Personalised Yoga Protocol']
Domain Name: bekushal


This code computes the cosine similarity between the query input and the page's title and each of its headings.

In [None]:
def calculate_cosine_similarity(query, title, headings):
    """Score how similar the query is to the title and to each heading.

    The query, the title, and all headings are vectorised together with a
    single ``TfidfVectorizer`` so they share one vocabulary, then the query
    vector is compared with every other vector by cosine similarity.

    Args:
        query: Search text to compare against the page.
        title: Page title; ``None`` is treated as an empty string.
        headings: Iterable of heading strings; may be empty.

    Returns:
        A ``(title_similarity, heading_similarities)`` tuple: a single float
        for the title and a 1-D NumPy array with one score per heading, in
        the same order as ``headings`` (empty when there are no headings).

    Raises:
        ValueError: Propagated from ``TfidfVectorizer`` when none of the
            documents produces any token (empty vocabulary).
    """
    # TfidfVectorizer cannot process None, so map it to an empty document.
    documents = ["" if text is None else text for text in (query, title, *headings)]

    # Row 0 is the query, row 1 the title, rows 2+ the headings.
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)

    # Compare the query with the title and headings in one call. The right-hand
    # side always contains at least the title row, so an empty ``headings``
    # no longer hands cosine_similarity a zero-row matrix. The sparse matrix is
    # used directly; slicing with 0:1 keeps the query row two-dimensional.
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    title_similarity = similarities[0]
    heading_similarities = similarities[1:]

    return title_similarity, heading_similarities


# Score the query against the page title and every heading
title_similarity, heading_similarities = calculate_cosine_similarity(query, title, headings)

# Report the query, the title score, then one numbered line per heading
print(f"\nQuery: {query}")
print(f"Similarity with Title: {title_similarity:.4f}")

scored_headings = zip(headings, heading_similarities)
for i, (heading, similarity) in enumerate(scored_headings, start=1):
    print(f"Similarity with Heading {i}: {heading} - {similarity:.4f}")



Query: bekushal
Similarity with Title: 1.0000
Similarity with Heading 1: BEkushal. - 1.0000
Similarity with Heading 2: Mind Body Mentor - 0.0000
Similarity with Heading 3: About Us - 0.0000
Similarity with Heading 4: Frequently Asked Questions - 0.0000
Similarity with Heading 5: 1.
		    Does Yoga truly help in improving the quality of life? - 0.0000
Similarity with Heading 6: 2.
                    Is Yoga just another form of exercise? - 0.0000
Similarity with Heading 7: 3.
                    What if I am still unable to make progress even after practicing Yoga for some time? - 0.0000
Similarity with Heading 8: 4.
                    Do you have any community of like minded people that I can join? - 0.0000
Similarity with Heading 9: Videos on Practical Spirituality - 0.0000
Similarity with Heading 10: Blog on Indian Philosophy - 0.0000
Similarity with Heading 11: IKS Texts GitHub Repo - 0.0000
Similarity with Heading 12: Personalised Yoga Protocol - 0.0000


The following code checks whether a robots.txt file exists for a website, given its URL.

In [None]:
import requests
from urllib.parse import urlparse

def get_base_url(url):
    """Return the ``scheme://netloc`` portion of ``url``.

    Path, query string, and fragment are discarded. The scheme and network
    location are taken as-is from ``urlparse``, so an input without a scheme
    yields ``"://"``.
    """
    parts = urlparse(url)
    return "://".join((parts.scheme, parts.netloc))

# Directive names recognised as evidence of a real robots.txt, lower-cased
# because robots.txt field names are matched case-insensitively.
_ROBOTS_DIRECTIVES = frozenset(
    ("user-agent", "disallow", "allow", "sitemap", "crawl-delay", "host")
)


def is_valid_robots_content(content):
    """Return True if ``content`` looks like a robots.txt file.

    The text is accepted as soon as one line has the form
    ``<directive>: ...`` where ``<directive>`` is one of User-agent, Disallow,
    Allow, Sitemap, Crawl-delay, or Host, compared case-insensitively and
    ignoring surrounding whitespace. Requiring the colon keeps ordinary prose
    or HTML that merely starts with such a word (e.g. "Hostile ...") from
    being mistaken for a directive.

    Args:
        content: Decoded body of the candidate robots.txt response.

    Returns:
        ``True`` when at least one directive line is found, else ``False``.
    """
    # A leading byte-order mark is not whitespace, so strip() would not remove
    # it and the first directive would go unrecognised.
    for line in content.lstrip("\ufeff").splitlines():
        name, separator, _ = line.partition(":")
        if separator and name.strip().casefold() in _ROBOTS_DIRECTIVES:
            return True
    return False

def check_robots_txt(url, timeout=10):
    """Check whether the site serving ``url`` exposes a usable robots.txt.

    Args:
        url: Any URL on the site; only its scheme and host are used.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        ``True`` if ``/robots.txt`` answers 200 and its body contains at least
        one recognised directive, ``False`` if it answers 200 with other
        content or the request itself fails, or one of the strings
        ``"robots.txt file not found"`` (404),
        ``"access to robots.txt is forbidden"`` (403), ``"server error"``
        (any 5xx status), or ``"unexpected status code: <code>"``.
    """
    base_url = get_base_url(url)
    robots_url = base_url + "/robots.txt"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    try:
        # Send the request with a browser-like User-Agent. The timeout keeps
        # an unresponsive host from blocking the caller indefinitely.
        response = requests.get(
            robots_url,
            headers=headers,
            allow_redirects=True,
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"Error checking robots.txt for {base_url}: {e}")
        return False

    status = response.status_code
    if status == 200:
        # A 200 alone is not enough: many sites answer with an HTML page, so
        # the body must also look like robots.txt.
        return is_valid_robots_content(response.text)
    if status == 404:
        return "robots.txt file not found"
    if status == 403:
        return "access to robots.txt is forbidden"
    if 500 <= status < 600:
        # Every 5xx status is a server-side failure, not only the
        # non-standard 529.
        return "server error"
    return f"unexpected status code: {status}"

if __name__ == "__main__":
    website_url = "https://bekushal.com"
    exists = check_robots_txt(website_url)

    # Map each recognised outcome of check_robots_txt to its report line;
    # every other result (False, unexpected status codes) uses the fallback.
    outcome_messages = {
        True: f"robots.txt file exists for {website_url} and it's valid.",
        "server error": f"Server error while checking {website_url}",
        "robots.txt file not found": f"robots.txt file does not exist for {website_url}",
        "access to robots.txt is forbidden": f"Access to robots.txt is forbidden for {website_url}",
    }
    fallback_message = f"robots.txt file does not exist or is not valid for {website_url}"

    print(outcome_messages.get(exists, fallback_message))


robots.txt file does not exist for https://bekushal.com
