In [1]:
from bs4 import BeautifulSoup
import requests

from urllib.parse import urljoin, urlparse
import pandas as pd

In [2]:
def crawl_and_fetch_text(base_url, timeout=30):
    """Collect the visible text of ``base_url`` and of the same-host pages it links to.

    The crawl is one level deep: the start page is downloaded, every
    ``<a href>`` found on it is resolved against ``base_url``, and each link
    whose host (netloc) equals the start page's host is requested once.

    Args:
        base_url: URL of the start page.
        timeout: Seconds passed to every ``requests.get`` call. Without a
            timeout a single unresponsive server blocks the crawl indefinitely.

    Returns:
        dict mapping each successfully fetched URL to its extracted text
        (text pieces stripped and joined with newlines). An empty dict if the
        start page cannot be fetched or answers with an HTTP error status.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

        # URLs already attempted (successfully or not), so none is requested twice
        visited = set()
        text_data = {}

        # Fetch the base page
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")

        # Add the base URL to visited and fetch its text
        visited.add(base_url)
        text_data[base_url] = soup.get_text(separator="\n", strip=True)

        # Host of the start page; computed once instead of once per link
        base_netloc = urlparse(base_url).netloc

        for link in soup.find_all("a", href=True):
            # Resolve relative URLs
            href = urljoin(base_url, link['href'])

            # Skip if already attempted or if the link is external
            if href in visited or urlparse(href).netloc != base_netloc:
                continue

            # Record the attempt before fetching: a link that fails or times out
            # must not be retried for every further anchor pointing at it.
            visited.add(href)

            try:
                # Fetch and process the page
                page_response = requests.get(href, headers=headers, timeout=timeout)
                page_response.encoding = page_response.apparent_encoding
                page_soup = BeautifulSoup(page_response.text, "html.parser")

                # Extract text and store it
                text_data[href] = page_soup.get_text(separator="\n", strip=True)
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch {href}: {e}")

        return text_data

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return {}



# # Print results (text from each page)
# for url, text in text_by_page.items():
#     print(f"URL: {url}\n")
#     print(f"Text: {text[:500]}...\n")  # Print first 500 characters of each page's text


In [None]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin, urlparse

def crawl_and_fetch_text(base_url, timeout=30):
    """Collect the visible text of ``base_url`` and of the same-host pages it links to.

    The crawl is one level deep: the start page is downloaded, every
    ``<a href>`` on it is resolved against ``base_url`` (with any ``#fragment``
    removed), and each resulting URL on the same host (netloc) is requested
    at most once. Progress and failures are reported with ``print``.

    Args:
        base_url: URL of the start page.
        timeout: Seconds passed to every ``requests.get`` call.

    Returns:
        dict mapping URL -> extracted text for every page that was fetched
        with a non-error status and parsed. Empty if the start page times
        out, cannot be fetched, answers with an HTTP error status, or cannot
        be parsed.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

        # URLs already attempted (successfully or not), so none is requested twice
        visited = set()
        text_data = {}

        # Fetch the base page with timeout
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        response.encoding = response.apparent_encoding

        try:
            # Try parsing the page with BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")

            # Add the base URL to visited and fetch its text
            visited.add(base_url)
            text_data[base_url] = soup.get_text(separator="\n", strip=True)

            # Print 'done' for the base URL
            print(f"Done for URL: {base_url}")

            # Host of the start page; computed once instead of once per link
            base_netloc = urlparse(base_url).netloc

            for link in soup.find_all("a", href=True):
                raw_href = link['href']
                try:
                    # Resolve relative URLs and drop the fragment: "#section"
                    # variants address the same document and must not cause
                    # another download (or re-fetch the start page itself).
                    href = urljoin(base_url, raw_href).split("#", 1)[0]

                    # Skip if already attempted or if the link is external
                    if href in visited or urlparse(href).netloc != base_netloc:
                        continue

                    # Record the attempt before fetching so a link that fails
                    # or times out is not retried (up to `timeout` seconds
                    # each) for every further anchor pointing at it.
                    visited.add(href)

                    # Fetch and process the page with timeout
                    page_response = requests.get(href, headers=headers, timeout=timeout)
                    # Same rule as for the start page: do not store the body
                    # of an HTTP error response as if it were page content.
                    page_response.raise_for_status()
                    page_response.encoding = page_response.apparent_encoding
                    page_soup = BeautifulSoup(page_response.text, "html.parser")

                    # Extract text and store it
                    text_data[href] = page_soup.get_text(separator="\n", strip=True)

                    # Print 'done' for the second-level URL
                    print(f"Done for URL: {href}")

                except requests.exceptions.Timeout:
                    print(f"Request timed out for URL: {href}")
                except requests.exceptions.RequestException as e:
                    print(f"Failed to fetch {href}: {e}")
                except Exception as link_error:
                    # A malformed href or an unparsable sub-page affects only
                    # this link; the remaining links are still crawled.
                    print(f"Skipping link {raw_href!r}: {link_error}")

        except Exception as soup_error:
            print(f"Parser rejected markup for {base_url}: {soup_error}")

        return text_data

    except requests.exceptions.Timeout:
        print(f"Request timed out for URL: {base_url}")
        return {}
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return {}


In [None]:
# Load the kindergarten list; the first CSV column is used as the DataFrame index.
skolky = pd.read_csv("input_skolky.csv", index_col=0)
# Bare last expression so the notebook renders the frame as the cell output.
skolky

Unnamed: 0,Plný název,WWW
0,Mateřská škola sv. Voršily v Praze,https://mssvvorsily.cz/
1,Církevní mateřská škola Studánka,http://www.cms-studanka.cz
2,Modrý klíč - základní škola speciální a mateřs...,https://www.webskoly.cz/modryklic
3,Církevní mateřská škola Srdíčko,https://www.cms-srdicko.cz
4,Bilingvální mateřská škola pro sluchově postiž...,http://www.pipan.cz
...,...,...
5455,Lesní mateřská škola sv. Františka,http://www.pardubice.charita.cz
5457,"Lesní mateřská škola Lesní děti, z.ú.",https://www.lesnideti.cz
5458,"Lesní mateřská škola Malala, z.ú.",https://www.malala.cz
5460,Maple Bear mateřská škola Olomouc s.r.o.,https://www.olomouc.maplebear.cz/


In [4]:
# Maximum number of rows per batch
chunk_size = 500

# Consecutive, non-overlapping row slices of `skolky`; the last slice holds
# whatever rows remain and may therefore be shorter than chunk_size.
skolky_chunks = [
    skolky.iloc[start:start + chunk_size]
    for start in range(0, len(skolky), chunk_size)
]




In [None]:
# NOTE(review): absolute, user-specific output location. Kept unchanged so the
# files land where they did before, but it should become a configurable path.
output_root = "/Users/annabrazdova/default/kindergartens"

for i, chunk in enumerate(skolky_chunks):
    # Crawl every website in this batch. crawl_and_fetch_text returns an empty
    # dict when the start page cannot be fetched, and an empty dict is falsy.
    crawled = chunk['WWW'].apply(crawl_and_fetch_text)
    has_text = crawled.apply(bool)

    # Build a new frame with .assign instead of writing into `chunk`: every
    # chunk is an .iloc slice of `skolky`, and assigning a column on such a
    # slice triggers pandas' SettingWithCopyWarning.
    chunk_with_text = chunk.assign(
        text_data=crawled.apply(lambda x: "invalid url" if not x else x)
    )
    # Keep the crawled text reachable through skolky_chunks, as it was when the
    # column was added to the chunk directly.
    skolky_chunks[i] = chunk_with_text

    # Save valid and invalid separately. The split uses the boolean mask rather
    # than comparing each stored dict against the "invalid url" marker string.
    chunk_valid = chunk_with_text[has_text]
    chunk_invalid = chunk_with_text[~has_text]

    # NOTE(review): "01_ivalid_url_skolky" looks like a typo of "invalid"; left
    # as-is because renaming it would change where the files are written.
    chunk_valid.to_csv(f"{output_root}/01_text_skolky/extracted_text_{i}.csv", index=False)
    chunk_invalid.to_csv(f"{output_root}/01_ivalid_url_skolky/invalid_url_{i}.csv", index=False)


KeyboardInterrupt: 