### Indian Gov. Websites

https://igod.gov.in/sectors

In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time
import logging
from bs4 import BeautifulSoup
import requests
import urllib3

In [None]:
# Suppress urllib3's warnings (presumably the ones triggered by the
# verify=False request made in get_sector_links).
urllib3.disable_warnings()

# Root logger: INFO and above, each record prefixed with time and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
def get_sector_links(url, output_path='../data/in_sector_links.csv', timeout=30):
    """Scrape the sector index page and save the sector links as CSV.

    Downloads ``url``, finds every ``<a class="sector-box">`` anchor and
    records its ``title`` attribute, its ``href`` and the ``src`` of the
    first ``<img>`` nested inside it.

    Args:
        url: Address of the page that lists the sectors.
        output_path: CSV file the resulting table is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame with the columns ``title``, ``url`` and
        ``image_url``, one row per sector box found (possibly zero rows).

    Raises:
        requests.exceptions.RequestException: If the request times out,
            the connection fails, or the server answers with an HTTP
            error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0.4472.124 Safari/537.36'}
    # NOTE(review): verify=False skips TLS certificate validation; kept
    # because the original relied on it, but confirm it is still needed.
    # The timeout prevents the call from blocking forever on a stalled server.
    response = requests.get(url, headers=headers, verify=False, timeout=timeout)
    # Fail before touching the CSV so an error page cannot overwrite a
    # previously saved list with an empty table.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    sectors = []
    for sector_box in soup.find_all('a', class_='sector-box'):
        # An anchor without an <img> must not abort the whole scrape.
        image = sector_box.find('img')
        sectors.append({
            'title': sector_box.get('title'),
            'url': sector_box.get('href'),
            'image_url': image.get('src') if image is not None else None
        })

    # Explicit columns keep the frame (and CSV header) well-formed even
    # when no sector boxes were found.
    df = pd.DataFrame(sectors, columns=['title', 'url', 'image_url'])
    df.to_csv(output_path, index=False)
    return df

def extract_websites_from_page(url, sector_title, scroll_pause=2):
    """Collect the site links listed on one sector page.

    Opens ``url`` in a headless Firefox session, repeatedly scrolls to the
    bottom of the page until the document height stops changing, and after
    every scroll reads each ``search-result-row`` element's
    ``search-title`` child.

    Args:
        url: Address of the sector page to load.
        sector_title: Label stored in the ``sector`` field of every result.
        scroll_pause: Seconds to wait after the initial load and after
            each scroll so the page can render further rows.

    Returns:
        List of dicts with the keys ``sector``, ``name`` and ``url``,
        de-duplicated and in first-seen order. ``url`` is the element's
        ``href`` when the title element is an ``<a>``, otherwise ``''``.
        An empty list is returned if any error interrupts the scrape.
    """
    # Function-scope import keeps this block self-contained; selenium is
    # already a dependency of this file.
    from selenium.common.exceptions import WebDriverException

    options = Options()
    options.add_argument('--headless')

    # Sentinel so the finally clause knows whether a browser was started.
    driver = None
    try:
        driver = webdriver.Firefox(options=options)
        driver.get(url)
        time.sleep(scroll_pause)

        prev_height = 0
        # Keyed by (name, url): the same rows are re-read after every
        # scroll, and a dict drops the repeats while keeping a stable order.
        found = {}

        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)

            new_height = driver.execute_script("return document.body.scrollHeight")

            for row in driver.find_elements(By.CLASS_NAME, 'search-result-row'):
                try:
                    link = row.find_element(By.CLASS_NAME, 'search-title')
                    name = link.text.strip()
                    href = link.get_attribute('href') if link.tag_name == 'a' else ''
                except WebDriverException:
                    # Row without a title element, or one that went stale
                    # while the page re-rendered: skip just this row.
                    continue
                found.setdefault((name, href), {
                    'sector': sector_title,
                    'name': name,
                    'url': href
                })

            # Height unchanged after a scroll means nothing more was loaded.
            if new_height == prev_height:
                break

            prev_height = new_height

        sites = list(found.values())
        logging.info(f"Extracted {len(sites)} links from {sector_title}")
        return sites

    except Exception as e:
        # Best-effort: one failing sector must not stop the overall run.
        logging.error(f"Error processing {url}: {str(e)}")
        return []

    finally:
        if driver is not None:
            driver.quit()

In [None]:
def main():
    """Scrape every sector page and gather all discovered sites.

    Reads the sector list from https://igod.gov.in/sectors, visits each
    sector URL in turn, and rewrites ``../data/in_gov_domain_list.csv``
    after every sector with everything gathered up to that point.

    Returns:
        pandas.DataFrame built from all collected site records.
    """
    sectors = get_sector_links('https://igod.gov.in/sectors')
    logging.info(f"Found {len(sectors)} sectors")

    collected = []
    for _, sector in sectors.iterrows():
        collected += extract_websites_from_page(sector['url'], sector['title'])

        # Write the whole accumulated table each time so the file on disk
        # always reflects the sectors processed so far.
        snapshot = pd.DataFrame(collected)
        snapshot.to_csv('../data/in_gov_domain_list.csv', index=False)
        logging.info(f"Saved {len(snapshot)} organizations so far")

        # Short pause between sector pages.
        time.sleep(1)

    result = pd.DataFrame(collected)
    print(f"\nTotal organizations found: {len(result)}")
    return result

# Entry point: run the full scrape and keep the resulting DataFrame in the
# module-level name `df`.
if __name__ == "__main__":
    df = main()