In [1]:
import requests
from bs4 import BeautifulSoup
import os

In [2]:
# Address of the web page whose PDF links we want to download
url = "https://www.kmk.org/service/ferien/archiv-der-ferientermine.html"

In [3]:
# Target folder for the downloaded files; create it (including parents) if it
# does not exist yet. `exist_ok=True` avoids the check-then-create race of a
# separate `os.path.exists` test and raises if the path exists but is a file.
folder_path = './data/ferien'
os.makedirs(folder_path, exist_ok=True)

In [4]:
def download_pdf(link, filename):
    """Download `link` and store the response body as `filename` in `folder_path`.

    Args:
        link: URL of the file to fetch.
        filename: Name of the file to create inside the module-level
            `folder_path` directory.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never written to disk under a `.pdf` name.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    with open(os.path.join(folder_path, filename), 'wb') as f:
        f.write(response.content)
    print(f'Download completed: {filename}')

# Nur bei Bedarf downloaden

In [5]:
# # HTTP-Anfrage senden und Antwort bekommen
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')

# # Alle Links finden, die auf PDFs verweisen
# for link in soup.find_all('a', href=True):
#     href = link['href']
#     if href.endswith('.pdf'):
#         pdf_url = f'https://www.kmk.org{href}' if not href.startswith('http') else href
#         filename = href.split('/')[-1]
#         download_pdf(pdf_url, filename)


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base address of the pages the holiday data is extracted from
url = "https://www.schulferien.org/deutschland/ferien"


relevante_jahre = range(1994,2025)
all_data = []
for jahr in relevante_jahre:
    url = f"https://www.schulferien.org/deutschland/ferien/{jahr}/"
    print(f"Scraping {jahr})")
    # Send the HTTP request; the timeout keeps a stalled connection from
    # blocking the whole loop forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # make sure the request succeeded

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the table that holds the holiday data and fail with a clear
    # message if the page layout is not what this cell expects.
    table = soup.find('table', class_='sf_table sf_table_responsive_block')
    if table is None:
        raise RuntimeError(f"No holiday table found on {url}")
    table_body = table.find('tbody')
    if table_body is None:
        raise RuntimeError(f"Holiday table on {url} has no <tbody>")

    # Collect one list per table row: the stripped cell texts followed by the year
    data = []
    for row in table_body.find_all('tr'):
        cols = [td.text.strip() for td in row.find_all('td')]
        cols.append(jahr)
        data.append(cols)
    all_data.append(data)
        

Scraping 1994)
Scraping 1995)
Scraping 1996)
Scraping 1997)
Scraping 1998)
Scraping 1999)
Scraping 2000)
Scraping 2001)
Scraping 2002)
Scraping 2003)
Scraping 2004)
Scraping 2005)
Scraping 2006)
Scraping 2007)
Scraping 2008)
Scraping 2009)
Scraping 2010)
Scraping 2011)
Scraping 2012)
Scraping 2013)
Scraping 2014)
Scraping 2015)
Scraping 2016)
Scraping 2017)
Scraping 2018)
Scraping 2019)
Scraping 2020)
Scraping 2021)
Scraping 2022)
Scraping 2023)
Scraping 2024)


In [7]:
# all_data holds one list of rows per scraped year. The year is also stored
# inside each row, so the outer level is dropped and all rows are concatenated.
all_data_flat = []
for year_rows in all_data:
    all_data_flat.extend(year_rows)

In [8]:
# Remove surrounding asterisks/newlines/spaces and all inner spaces from every
# string entry in all_data_flat. The cleaned value is written back by index:
# rebinding the loop variable alone would leave the rows unchanged.
for row in all_data_flat:
    for position, entry in enumerate(row):
        if isinstance(entry, str):
            row[position] = entry.strip("*\n ").replace(" ", "")

In [9]:
# Keep only the rows that have more than two entries
all_data_flat = list(filter(lambda row: len(row) > 2, all_data_flat))

In [10]:
def _strip_and_drop_spaces(value):
    """Return string values stripped and without spaces; pass anything else through."""
    if not isinstance(value, str):
        return value
    return value.strip().replace(" ", "")


# Apply the whitespace clean-up to every entry of every row
cleaned_data = [list(map(_strip_and_drop_spaces, row)) for row in all_data_flat]

In [11]:
# Cleaning function that keeps both dashes "-" and dots "." in the data

def clean_string_preserve_dashes_and_dots(s):
    """Return `s` reduced to its alphanumeric characters, dashes and dots.

    Surrounding whitespace is stripped first; afterwards every character that
    is neither alphanumeric nor one of '-' / '.' is dropped, wherever it
    occurs in the string.
    """
    allowed_punctuation = ('-', '.')
    kept_characters = [
        char
        for char in s.strip()
        if char.isalnum() or char in allowed_punctuation
    ]
    return ''.join(kept_characters)

# Run the cleaning function over every string entry; non-string entries
# are carried over unchanged.
cleaned_data_preserve_dashes_and_dots = []
for row in cleaned_data:
    cleaned_data_preserve_dashes_and_dots.append([
        clean_string_preserve_dashes_and_dots(value) if isinstance(value, str) else value
        for value in row
    ])

In [12]:
import re

# Regex patterns: a "dd.mm.-dd.mm." date range and a single "dd.mm." date
date_regex = re.compile(r'(\d{2}\.\d{2}\.-\d{2}\.\d{2}\.)')
date_single = re.compile(r'(\d{2}\.\d{2}\.)')


def filter_dates(data):
    """Filter every row of `data` down to the entries worth keeping.

    Within each row an entry is kept when it is
      * not a string (kept unconditionally),
      * a string containing a date range or a single date,
      * exactly the string '-', or
      * any other string, but only if nothing has been kept for this row yet
        (this is how a leading name entry survives).
    All other strings are dropped, so later entries move up.

    Args:
        data: Iterable of rows, each an iterable of entries.

    Returns:
        A new list with one filtered list per input row.
    """
    def _should_keep(item, kept_so_far):
        if not isinstance(item, str):
            return True
        if date_regex.search(item) or date_single.search(item):
            return True
        if item == '-':
            return True
        # A non-date string is only accepted as the first kept entry of a row
        return not kept_so_far

    cleaned_list = []
    for sublist in data:
        new_sublist = []
        for item in sublist:
            if _should_keep(item, new_sublist):
                new_sublist.append(item)
        cleaned_list.append(new_sublist)
    return cleaned_list

# Run the date filter over the cleaned rows
filtered_data = filter_dates(data=cleaned_data_preserve_dashes_and_dots)

In [13]:
import numpy as np
# Turn the extracted rows into a pandas DataFrame
columns = [
    'Bundesland',
    'Winterferien',
    'Osterferien',
    'Pfingstferien',
    'Sommerferien',
    'Herbstferien',
    'Weihnachtsferien',
    'Jahr',
]
df = pd.DataFrame(data=filtered_data, columns=columns)

In [14]:
# Function to validate and correct date ranges
import re

def correct_date_ranges(date_str):
    """Return `date_str` if it is a valid range or a placeholder, else '-'.

    Missing values (per `pd.isna`) and strings that equal '-' after stripping
    are returned unchanged. Any other value is returned unchanged only if it
    has the form "dd.mm.-dd.mm."; otherwise '-' is returned.
    """
    if pd.isna(date_str):
        return date_str
    if date_str.strip() == '-':
        return date_str
    is_range = re.match(r'^\d{2}\.\d{2}\.\-\d{2}\.\d{2}\.$', date_str) is not None
    return date_str if is_range else '-'

# Validate the holiday columns: everything except the first two columns and
# the trailing year column.
# NOTE(review): starting at index 2 also skips 'Winterferien' - confirm intended.
columns_to_check = list(df.columns[2:-1])
for column in columns_to_check:
    checked_values = df[column].apply(correct_date_ranges)
    df[column] = checked_values


In [15]:
def split_vacation_periods(df, column_name):
    """Split a "start-end" column into '<column_name> Start' and '<column_name> Ende'.

    Each value is split at its first dash. A "dd.mm.-dd.mm." value yields its
    two dates; the placeholder '-' yields two empty strings; a value without
    any dash yields itself as start and a missing end.

    The result always has exactly two parts, so the assignment also works when
    no value in the column contains a dash or when a value contains several.

    Args:
        df: DataFrame containing `column_name`; it is modified in place.
        column_name: Name of the string column to split.

    Returns:
        The same DataFrame with the two new columns added.
    """
    start_col = column_name + ' Start'
    end_col = column_name + ' Ende'
    # n=1 caps the split at two parts; reindex adds the second part as an
    # all-missing column when no value had a dash at all.
    parts = df[column_name].str.split('-', n=1, expand=True).reindex(columns=[0, 1])
    df[start_col] = parts[0]
    df[end_col] = parts[1]
    return df

# Holiday columns that are split into a start and an end column
vacation_periods = [
    'Winterferien',
    'Osterferien',
    'Pfingstferien',
    'Sommerferien',
    'Herbstferien',
    'Weihnachtsferien',
]

# Split each holiday column in turn
for period in vacation_periods:
    df = split_vacation_periods(df=df, column_name=period)


In [16]:
# Turn cells that are empty or contain only whitespace into NaN
df = df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

In [17]:
# The dates need a year before pandas can parse them: append the row's year
# to every start/end value, turning "dd.mm." into "dd.mm.yyyy".
year_suffix = df['Jahr'].astype(str)
for period_name in (
    'Winterferien',
    'Osterferien',
    'Pfingstferien',
    'Sommerferien',
    'Herbstferien',
    'Weihnachtsferien',
):
    for boundary in ('Start', 'Ende'):
        date_column = f'{period_name} {boundary}'
        df[date_column] = df[date_column] + year_suffix


In [18]:
# Turn cells that are empty or contain only whitespace into NaN
df = df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

In [19]:
# Convert every start/end column into datetime objects; values that do not
# parse as "dd.mm.yyyy" become NaT.
for period_name in (
    'Winterferien',
    'Osterferien',
    'Pfingstferien',
    'Sommerferien',
    'Herbstferien',
    'Weihnachtsferien',
):
    start_column = f'{period_name} Start'
    end_column = f'{period_name} Ende'
    df[start_column] = pd.to_datetime(df[start_column], format='%d.%m.%Y', errors='coerce')
    df[end_column] = pd.to_datetime(df[end_column], format='%d.%m.%Y', errors='coerce')

    # Start and end were both given the row's year. A period that runs over
    # New Year (e.g. 23.12.-05.01.) therefore ends before it starts; its end
    # belongs to the following year. Comparisons with NaT are False, so
    # missing values stay untouched, and re-running the cell changes nothing.
    ends_before_start = df[end_column] < df[start_column]
    df[end_column] = df[end_column].mask(
        ends_before_start, df[end_column] + pd.DateOffset(years=1)
    )



In [20]:
### Add df to postresql
from os import getenv
from sqlalchemy import create_engine
%load_ext dotenv
%dotenv

In [21]:
from urllib.parse import quote_plus

# Read the database connection parameters from the environment and fail with a
# clear message if one is missing, instead of an AttributeError on None or a
# connection string that contains the text "None".
required_variables = ('DB_USER', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT', 'DB_NAME')
missing_variables = [name for name in required_variables if getenv(name) is None]
if missing_variables:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(missing_variables)}"
    )

username = getenv('DB_USER').lower()
password = getenv('DB_PASSWORD')
host = getenv('DB_HOST')
port = getenv('DB_PORT')
database = getenv('DB_NAME')

# Build the connection string
# Format: dialect+driver://username:password@host:port/database
# User name and password are URL-escaped so characters such as '@', ':' or '/'
# cannot break the URL. The environment must hold the raw, unescaped values.
connection_string = (
    f'postgresql://{quote_plus(username)}:{quote_plus(password)}'
    f'@{host}:{port}/{database}'
)

# Create the engine
engine = create_engine(connection_string)

In [22]:
# Write the DataFrame to table `ferien` in schema `original_data`;
# an existing table of that name is replaced, the index is not stored.
df.to_sql(
    name='ferien',
    con=engine,
    schema='original_data',
    if_exists='replace',
    index=False,
)

496