In [3]:
import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable
import urllib.robotparser


# Home pages of the college websites to crawl, visited in this order.
college_urls = [
    "http://www.coep.org.in",
    "https://www.viit.ac.in",
    "http://www.mitpune.edu.in",
    "https://www.pict.edu",
    "https://www.vit.edu",
    "https://www.dypcoeakurdi.ac.in",
    "https://www.sitpune.edu.in",
    "https://www.pvgcoet.ac.in",
    "https://www.isquareit.edu.in",
]


def can_fetch(url):
    """Return whether robots.txt lets a generic crawler ("*") fetch ``url``.

    The robots.txt location is resolved against the root of the site that
    ``url`` belongs to, so a trailing slash or a path in ``url`` does not
    produce a malformed or non-root robots.txt address.

    Args:
        url: Absolute URL of the page that is about to be requested.

    Returns:
        True if the parsed robots.txt allows user agent "*" to fetch ``url``.
        False if it disallows it, or if robots.txt could not be downloaded
        or decoded (the error is printed and the site is treated as
        off-limits instead of aborting the whole crawl).
    """
    from urllib.parse import urljoin

    robot_parser = urllib.robotparser.RobotFileParser()
    # A leading "/" makes urljoin replace any path in `url` with the root.
    robot_parser.set_url(urljoin(url, "/robots.txt"))
    try:
        # read() only handles HTTP status errors itself; connection/DNS
        # failures (URLError is an OSError) and bad URLs or undecodable
        # content (ValueError) would otherwise propagate to the caller.
        robot_parser.read()
    except (OSError, ValueError) as e:
        print(f"Could not read robots.txt for {url}: {e}")
        return False
    return robot_parser.can_fetch("*", url)


def fetch_page_content(url, timeout=10):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a server that
            never answers, stalling the whole crawl.

    Returns:
        A ``BeautifulSoup`` object built with the 'html.parser' backend when
        the response status is 200; ``None`` when the request fails or the
        server answers with any other status (a message is printed in both
        cases).
    """
    # Only the network call sits inside the try so that the handler reports
    # genuine request failures rather than masking unrelated errors.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error occurred while fetching {url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve {url} - Status Code: {response.status_code}")
        return None

    return BeautifulSoup(response.text, 'html.parser')


# Maps each successfully fetched site URL to its list of "N: href" strings.
# Filled in by fetch_college_data() and read when the summary table is built.
index = {}


def fetch_college_data():
    """Crawl every URL in ``college_urls`` and record its links in ``index``.

    For each site, robots.txt is consulted first; sites that disallow access
    or whose page cannot be retrieved are reported and skipped. For the
    others, every non-empty ``href`` of an ``<a>`` tag is numbered from 1,
    printed, and stored as ``"N: href"`` under the site's URL in ``index``.
    """
    for site in college_urls:
        print(f"\nChecking robots.txt for: {site}")
        if not can_fetch(site):
            print(f"Access to {site} is disallowed by robots.txt")
            continue

        print(f"Fetching data from: {site}")
        soup = fetch_page_content(site)
        if not soup:
            print(f"Could not retrieve content from {site}")
            continue

        anchors = soup.find_all('a')
        print(f"\nLinks found on {site}:")

        # Anchors with a missing or empty href are dropped before numbering,
        # so the numbers stay consecutive.
        targets = filter(None, (anchor.get('href') for anchor in anchors))
        numbered = []
        for position, target in enumerate(targets, start=1):
            entry = f"{position}: {target}"
            numbered.append(entry)
            print(entry)

        index[site] = numbered


# Run the crawl first so that `index` is populated before it is rendered.
fetch_college_data()

# Two-column table with one row per (site, numbered link) pair.
table = PrettyTable(["College Website", "Indexed Links"])
for college, links in index.items():
    for link in links:
        table.add_row([college, link])

# Emit the heading followed by the rendered table.
print("\nIndex of all links found:")
print(table)



Checking robots.txt for: http://www.coep.org.in
Fetching data from: http://www.coep.org.in
Failed to retrieve http://www.coep.org.in - Status Code: 406
Could not retrieve content from http://www.coep.org.in

Checking robots.txt for: https://www.viit.ac.in
Access to https://www.viit.ac.in is disallowed by robots.txt

Checking robots.txt for: http://www.mitpune.edu.in
Fetching data from: http://www.mitpune.edu.in

Links found on http://www.mitpune.edu.in:
1: #primary-nav
2: #lqd-site-content
3: https://mitpune.edu.in/
4: https://mitpune.edu.in/
5: #
6: /maeers-boards-and-bodies/
7: /vision-mission/
8: /history/
9: /our-founder-legacy/
10: https://mitpune.edu.in/our-universities/
11: https://mitpune.edu.in/affiliated-colleges-2/
12: https://mitpune.edu.in/hospitals/
13: https://mitpune.edu.in/junior-colleges/
14: https://mitpune.edu.in/schools/
15: /social-initiatives/
16: /rankings/
17: /recognition-and-accreditation/
18: /research-incubation/
19: /research-and-innovations/
20: https://