# School Search and Niche Scraper
This notebook searches for schools and scrapes their Niche pages using Firecrawl, then summarizes with Gemini Pro.

In [None]:
# Install required packages
!pip install requests beautifulsoup4 firecrawl-py google-generativeai python-dotenv

In [7]:
import pandas as pd
import re
import time

from firecrawl import FirecrawlApp
import google.generativeai as genai
from dotenv import load_dotenv
import os
from utils.gsearch import SchoolSearcher

# Load environment variables; FIRECRAWL_API_KEY and GEMINI_API_KEY are read
# from the environment via os.getenv in later cells.
# NOTE(review): this loads '.env.example', which by convention is a committed
# template rather than the private '.env' — confirm no real keys live in it.
load_dotenv(dotenv_path='.env.example')

True

In [3]:
from utils.gsearch import SchoolSearcher

# Searcher used by the exploratory lookup below.
searcher = SchoolSearcher()

# Active path: look up the Niche link(s) for one school. Later cells read the
# first hit as res[0]['url'].
res = searcher.find_niche_link('Thoreau School Concord')

# Alternative lookups kept for reference (all disabled):
# Method 1: DuckDuckGo (recommended)
# results = searcher.search_school_comprehensive("Thoreau School Concord niche", "duckduckgo")
# Method 2: Direct URL construction
# niche_urls = searcher.direct_niche_search("Stanford University")
# Method 3: SerpAPI (if you have API key)
# results = searcher.search_school_comprehensive("MIT", "serpapi", "your_api_key")
# results


In [23]:
class NicheScraper:
    """Thin wrapper around a Firecrawl client for fetching Niche school pages."""

    def __init__(self, firecrawl_api_key):
        """Build the underlying ``FirecrawlApp`` client from the given API key."""
        self.firecrawl = FirecrawlApp(api_key=firecrawl_api_key)

    def scrape_niche_page(self, niche_url):
        """Fetch a single Niche page in markdown and HTML form.

        Args:
            niche_url: URL of the Niche page to fetch.

        Returns:
            A dict with keys ``'url'``, ``'content'`` (the markdown),
            ``'html'`` and ``'metadata'``; or ``None`` when any step raised
            (the error is printed rather than propagated).
        """
        try:
            print(f"Scraping: {niche_url}")

            # Ask Firecrawl for both renderings of the page in one request.
            wanted_formats = ['markdown', 'html']
            page = self.firecrawl.scrape(niche_url, formats=wanted_formats)

            scraped = dict(
                url=niche_url,
                content=page.markdown,
                html=page.html,
                metadata=page.metadata,
            )
            return scraped
        except Exception as err:
            # Best-effort: callers treat None as "scrape failed".
            print(f"Error scraping {niche_url}: {err}")
            return None

In [18]:
# Ad-hoc scrape of the first search hit, calling the Firecrawl client directly.
firecrawl = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))

print(f"Scraping: {res[0]['url']}")

# Request both the markdown and HTML renderings of the page.
result = firecrawl.scrape(res[0]['url'], formats=['markdown', 'html'])

            


Scraping: https://www.niche.com/k12/thoreau-school-concord-ma/


In [22]:
# Display the metadata attached to the direct Firecrawl scrape result.
# NOTE: `result` is rebound by the workflow test cell further down, so this
# cell must run before that one.
result.metadata

DocumentMetadata(title='Thoreau School in West Concord, MA - Niche', description='Thoreau School is a highly rated, public school located in CONCORD, MA. It has 436 students in grades PK, K-5 with a student-teacher ratio of 10 to 1. According to state test scores, 69% of students are at least proficient in math and 69% in reading.', url='https://www.niche.com/k12/thoreau-school-concord-ma/', language='en', keywords='Thoreau School, West Concord, Massachusetts, MA, Public School, Thoreau School ratings, school ratings, school statistics, school information, PK, K-5', robots=None, og_title='Thoreau School in West Concord, MA', og_description='Rankings, stats, and reviews on academics, teachers, student life, and more.', og_url='https://www.niche.com/k12/thoreau-school-concord-ma/', og_image='https://d33a4decm84gsn.cloudfront.net/social-share/niche-k-12-1910px.png', og_audio=None, og_determiner=None, og_locale='en_US', og_locale_alternate=None, og_site_name='Niche', og_video=None, favicon

In [24]:
# Run the same scrape through the NicheScraper wrapper; the returned dict
# (or None on failure) is displayed as the cell output.
nscraper = NicheScraper(os.getenv('FIRECRAWL_API_KEY'))
nscraper.scrape_niche_page(res[0]['url'])

Scraping: https://www.niche.com/k12/thoreau-school-concord-ma/


{'url': 'https://www.niche.com/k12/thoreau-school-concord-ma/',
 'content': "[Skip to Main Content](https://www.niche.com/k12/thoreau-school-concord-ma/#maincontent)\n\n![profile header](<Base64-Image-Removed>)[© Mapbox](https://www.mapbox.com/about/maps/)\xa0/\xa0[© OpenStreetMap](http://www.openstreetmap.org/copyright)\n\nPublic School\n\n## Report Card\n\ngrade\xa0A\n\nOverall Niche Grade\n\n[How are grades calculated?](https://www.niche.com/about/where-niche-grades-come-from/) [Data Sources](https://www.niche.com/about/data/#data-k12)\n\n1. Academics\n\n\n\ngrade\xa0A\n\n2. Diversity\n\n\n\ngrade\xa0B\n\n3. Teachers\n\n\n\ngrade\xa0A+\n\n\nThoreau School is a highly rated, public school located in CONCORD, MA. It has 436 students in grades PK, K-5 with a student-teacher ratio of 10 to 1. According to state test scores, 69% of students are at least proficient in math and 69% in reading.\n\n[Compare Thoreau School to Other Schools](https://www.niche.com/k12/compare/?type=school&schoo

Scraping: https://www.niche.com/k12/thoreau-school-concord-ma/
Error scraping https://www.niche.com/k12/thoreau-school-concord-ma/: FirecrawlClient.scrape() got an unexpected keyword argument 'params'


In [None]:
class GeminiSummarizer:
    """Summarize scraped school content with a Gemini generative model."""

    def __init__(self, api_key, model_name='gemini-pro'):
        """Configure the Gemini SDK and create the model handle.

        Args:
            api_key: API key passed to ``genai.configure``.
            model_name: Model identifier passed to ``genai.GenerativeModel``.
                Defaults to ``'gemini-pro'``, the previously hard-coded value.
                NOTE(review): confirm this model name is still served by the
                API; pass a current one if it is not.
        """
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)

    def summarize_school_data(self, school_name, scraped_content, max_chars=8000):
        """Generate a markdown summary of a school's scraped Niche content.

        Args:
            school_name: Name of the school, interpolated into the prompt.
            scraped_content: Scraped page text. ``None`` is treated as empty
                text instead of raising.
            max_chars: Maximum number of characters of ``scraped_content``
                included in the prompt (keeps the request within token
                limits). ``None`` disables the limit.

        Returns:
            The model's response text, or the string
            ``"Error generating summary for <school_name>"`` if the model call
            raised (the error is printed).
        """
        # Truncate outside the template. A "# Limit content ..." note placed
        # inside the f-string is literal text and would be sent to the model.
        excerpt = (scraped_content or "")[:max_chars]

        prompt = f"""
        Please analyze the following information about {school_name} from their Niche page and provide a comprehensive summary.
        
        Focus on:
        1. Overall rating and ranking
        2. Academic programs and strengths
        3. Campus life and student experience
        4. Admission requirements and statistics
        5. Tuition and financial aid
        6. Notable features or achievements
        7. Student reviews highlights
        
        Content to analyze:
        {excerpt}
        
        Please provide a well-structured summary in markdown format.
        """

        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            # Best-effort: report the failure and hand back a placeholder summary.
            print(f"Error generating summary: {e}")
            return f"Error generating summary for {school_name}"

In [None]:
# Main workflow function
def process_school(school_name, firecrawl_api_key, gemini_api_key):
    """Complete workflow: search -> scrape -> summarize.

    Args:
        school_name: School to look up.
        firecrawl_api_key: API key handed to ``NicheScraper``.
        gemini_api_key: API key handed to ``GeminiSummarizer``.

    Returns:
        On success, a dict with keys ``'school_name'``, ``'niche_url'``,
        ``'scraped_content'`` (a preview of at most 1000 characters, followed
        by ``'...'`` only when the content was actually cut) and ``'summary'``.
        On failure, a plain ``str`` message: no Niche result was found, or the
        scrape failed or returned no content.
    """
    preview_limit = 1000

    # Step 1: Search for school and get Niche link
    searcher = SchoolSearcher()
    search_results = searcher.search_school_comprehensive(school_name)

    if not search_results['niche_results']:
        return f"No Niche page found for {school_name}"

    niche_url = search_results['niche_results'][0]['url']
    print(f"Found Niche URL: {niche_url}")

    # Step 2: Scrape the Niche page
    scraper = NicheScraper(firecrawl_api_key)
    scraped_data = scraper.scrape_niche_page(niche_url)

    # A scrape that "succeeds" with no markdown is as useless as a failed one:
    # bail out here instead of slicing None or summarizing empty text.
    content = scraped_data.get('content') if scraped_data else None
    if not content:
        return f"Failed to scrape Niche page for {school_name}"

    # Step 3: Generate summary with Gemini
    summarizer = GeminiSummarizer(gemini_api_key)
    summary = summarizer.summarize_school_data(school_name, content)

    # Only mark the preview as truncated when something was actually dropped.
    preview = content[:preview_limit]
    if len(content) > preview_limit:
        preview += '...'

    return {
        'school_name': school_name,
        'niche_url': niche_url,
        'scraped_content': preview,
        'summary': summary
    }

In [None]:
# Configuration - API keys are taken from the environment (.env file); the
# placeholder defaults below mark a key as "not configured".
FIRECRAWL_API_KEY = os.getenv('FIRECRAWL_API_KEY', 'your_firecrawl_api_key_here')
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', 'your_gemini_api_key_here')

# Smoke-test the complete workflow, but only once both keys are real.
if FIRECRAWL_API_KEY == 'your_firecrawl_api_key_here' or GEMINI_API_KEY == 'your_gemini_api_key_here':
    print("Please set your FIRECRAWL_API_KEY and GEMINI_API_KEY in the .env file or in the cell above")
else:
    test_school = "Stanford University"
    result = process_school(test_school, FIRECRAWL_API_KEY, GEMINI_API_KEY)

    # process_school returns a plain message string on failure, a dict on success.
    if not isinstance(result, dict):
        print(result)
    else:
        print(f"\n=== Summary for {result['school_name']} ===")
        print(f"Niche URL: {result['niche_url']}")
        print(f"\nSummary:\n{result['summary']}")

In [None]:
# Batch processing function for multiple schools
def process_multiple_schools(school_list, firecrawl_api_key, gemini_api_key, delay=2):
    """Process multiple schools with rate limiting.

    Args:
        school_list: Iterable of school names, processed in order.
        firecrawl_api_key: Forwarded to ``process_school``.
        gemini_api_key: Forwarded to ``process_school``.
        delay: Seconds to pause between consecutive schools. No pause is made
            before the first school or after the last one.

    Returns:
        A list with one ``process_school`` result per input school, in input
        order (empty when ``school_list`` is empty).
    """
    results = []
    banner = '=' * 50

    for index, school in enumerate(school_list):
        # Rate limiting: only wait *between* requests, so a batch of N schools
        # pays for N-1 pauses and never sleeps after the final one.
        if index > 0:
            time.sleep(delay)

        print(f"\n{banner}")
        print(f"Processing: {school}")
        print(f"{banner}")

        results.append(process_school(school, firecrawl_api_key, gemini_api_key))

    return results

# Example usage
# schools_to_process = [
#     "Harvard University",
#     "MIT",
#     "Stanford University"
# ]
# 
# batch_results = process_multiple_schools(schools_to_process, FIRECRAWL_API_KEY, GEMINI_API_KEY)