In [None]:
# Automated grant search (specifically for start-ups in the pre-seed/ideation/MVP stage, in Jordan or internationally where applicable).
# The first task for this project is to implement code that searches for and gathers data about these grants, and saves the date of the search in a notepad (text) file.


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd
from datetime import datetime
import os


In [None]:

# ChromeDriver path. The CHROMEDRIVER_PATH environment variable overrides the
# hard-coded location so the script can run on machines other than the original one.
chrome_driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\Syndictech\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe",
)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no browser window)
chrome_options.add_argument('--disable-blink-features=AutomationControlled')

# All user input is collected BEFORE the browser is launched, so that an
# aborted or invalid prompt cannot leave a Chrome process running.

# Search for grants specific to startups in Jordan or internationally
search_query = "grants for startups in pre-seed ideation MVP stage in Jordan"
apply_website_filter = input("Do you want to find in a specific website (yes/no)? ").strip().lower()

if apply_website_filter in ('yes', 'y'):
    # The search engine's `site:` operator expects a domain, so the example in
    # the prompt shows a full domain rather than a bare site name.
    website_filter = input("Enter the website domain (ex: linkedin.com): ").strip()
    if website_filter:
        search_query += " site:" + website_filter
        print("New search query:", search_query)
    else:
        # An empty answer would otherwise produce a dangling "site:" term.
        print("No website entered; searching without a site filter.")

# User input for number of links to process. Re-prompt until a non-negative
# whole number is entered instead of crashing with ValueError.
while True:
    raw_link_count = input("Enter the number of links to process: ").strip()
    try:
        num_links_to_process = int(raw_link_count)
    except ValueError:
        print("Please enter a whole number.")
        continue
    if num_links_to_process >= 0:
        break
    print("Please enter a number that is zero or greater.")

# Setting up the webdriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

def apply_filter(href):
    """
    Filter links based on certain criteria to avoid unwanted or irrelevant content.

    A link is rejected when it is missing/empty, is a relative '/search' link,
    points at google.com, or contains one of the unwanted keywords.

    Args:
    href (str | None): The URL to be checked.

    Returns:
    bool: True if the link is considered valid, False otherwise. Always a real
    bool, including for a None or empty href.
    """
    # Guard first: a bare `href and ...` chain would return None or '' here.
    if not href:
        return False

    if href.startswith('/search') or 'google.com' in href:
        return False

    # NOTE: this is plain substring matching, so e.g. 'ads' also rejects URLs
    # containing 'downloads' or 'uploads'.
    unwanted_keywords = ('products', 'ads', 'jobs', 'signup')
    return not any(keyword in href for keyword in unwanted_keywords)

def save_to_notepad(data, file_name="grant_search_results.txt"):
    """
    Append one record to a UTF-8 text file, followed by a newline.

    The file is opened in append mode, so earlier content is kept and the file
    is created if it does not exist yet.

    Args:
    data (str): The text to append.
    file_name (str): Path of the target file. Default is "grant_search_results.txt".
    """
    with open(file_name, mode='a', encoding='utf-8') as notepad:
        record = data + "\n"
        notepad.write(record)

def extract_links_and_info(page_number):
    """
    Visit the filtered links on the current page and save each one's title and text.

    Every anchor on the page currently loaded in the module-level `driver` is
    checked with `apply_filter`. Each accepted link that has not been seen yet
    on this page is opened in a temporary tab, parsed with BeautifulSoup, and
    its title and paragraph text are appended to the results file through
    `save_to_notepad`.

    The module-level `num_links_to_process` is treated as the remaining budget
    for the whole search: it is decremented for every link saved, so the
    caller's `num_links_to_process <= 0` check can stop pagination once enough
    links have been processed.

    Args:
    page_number (int): The page number from which links are being extracted
    (used only in the progress message).

    Returns:
    int: The number of links saved from this page.
    """
    global num_links_to_process

    # Remember the results window so it can always be restored, even on failure.
    main_window = driver.current_window_handle
    links = driver.find_elements(By.XPATH, '//a[@href]')
    processed_count = 0
    seen_hrefs = set()  # the same URL can appear in several anchors on one page

    for link in links:
        if num_links_to_process <= 0:
            break

        href = link.get_attribute('href')
        if not apply_filter(href) or href in seen_hrefs:
            continue
        seen_hrefs.add(href)

        try:
            # Open a blank tab and navigate with driver.get(): the URL is never
            # spliced into a JavaScript string (a quote in the URL cannot break
            # or inject script), and get() waits for the page load before
            # page_source is read.
            driver.switch_to.new_window('tab')
            driver.get(href)

            # Parse the loaded page
            soup = BeautifulSoup(driver.page_source, 'lxml')

            # <title> may be absent or empty; fall back to a placeholder
            title = soup.title.get_text(strip=True) if soup.title else ''
            if not title:
                title = 'No title found'

            # Join the text of all paragraphs
            text_content = "\n".join(p.get_text() for p in soup.find_all('p'))

            # Prepare and save the record
            data_to_save = f"Link: {href}\nTitle: {title}\nText Content:\n{text_content}\n{'='*80}\n"
            save_to_notepad(data_to_save)

            processed_count += 1
            num_links_to_process -= 1

        except Exception as e:
            # Best effort per link: report the failure and move on to the next one.
            print(f"Could not extract info from {href}: {e}")

        finally:
            # Always close whatever tab(s) this iteration opened and return to
            # the results window, so a failed link cannot leave the driver
            # focused on the wrong tab for later links or pagination.
            for handle in driver.window_handles:
                if handle != main_window:
                    driver.switch_to.window(handle)
                    driver.close()
            driver.switch_to.window(main_window)

    print(f"\nPage {page_number} links and info extracted.\n")
    return processed_count

# Searching.
# The whole browser session runs inside try/finally so the Chrome process is
# shut down even when the search page, the first results page, or pagination
# raises unexpectedly.
try:
    driver.get('https://www.google.com/')
    search_box = driver.find_element(By.NAME, 'q')
    search_box.send_keys(search_query)
    search_box.send_keys(Keys.RETURN)

    # Wait for the page to load to take the links
    time.sleep(6)

    # Save the date of the search
    current_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    save_to_notepad(f"Search Date: {current_date}\nQuery: {search_query}\n{'='*80}")

    # Page number counter
    page_number = 1

    # Extract links and info from the first page
    extract_links_and_info(page_number)

    # Going to the next page till the last page or until enough links are processed
    while True:
        try:
            if num_links_to_process <= 0:
                break

            # Check if there is a next button (located by its element ID)
            next_button = driver.find_elements(By.ID, 'pnnext')
            if not next_button:
                print("You're at the last page; 'Next' button not found.")
                break

            next_button[0].click()
            page_number += 1

            # Give the next results page time to load
            time.sleep(6)

            # Extract links and info from the next page
            extract_links_and_info(page_number)
        except Exception as e:
            # Any failure while paging ends the crawl; results saved so far are kept.
            print("No more pages:", e)
            break
finally:
    # Close the browser
    driver.quit()

# Reached only when the session above finished without an unhandled error.
print("Results saved in 'grant_search_results.txt'")