# Data collecting

## Importing libraries

In [15]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

## Data collection by Webscraping

'Making a script that Search through the url and then locating the download .xlsx / .xls url and then renaming the file and downloading it in the directory. 

In [17]:
# Function to create a folder if it does not exist
def create_folder(folder_name):
    try:
        os.makedirs(folder_name, exist_ok=True)
    except Exception as e:
        print(f"Error creating folder: {folder_name} - {e}")

# Function to download and save files based on the specified headers and year range
def download_and_save_files(h2_text, h3_text, folder_name, file_prefix, years):
     
    # Find the relevant h2 and h3 tags
    h2_tag = soup.find('h2', text=h2_text)
    h3_tag = h2_tag.find_next('h3', text=h3_text)

    # Find the <p> tag containing the file links
    p_files = h3_tag.find_next('p')

    # Iterate through the file links
    for link in p_files.find_all('a'):
        # Check if the year of the file is in the desired range
        try:
            year = int(link.text.strip())
        except ValueError:
            continue

        if year not in years:
            continue

        # Construct the absolute file URL
        file_url = urljoin(url, link['href'])

        # Extract the file name from the URL
        parsed_url = urlparse(file_url)
        file_name = os.path.basename(parsed_url.path)

        # Extract the file extension from the file name
        file_ext = os.path.splitext(file_name)[1]

        # Construct the new file name with the format "{file_prefix}_year.ext"
        new_file_name = f"{file_prefix}_{year}{file_ext}"

        # Download the file and save it in the folder
        file_response = requests.get(file_url)
        with open(os.path.join(folder_name, new_file_name), 'wb') as file:
            file.write(file_response.content)

# Define the URL of the website you want to scrape
url = 'https://ufm.dk/uddannelse/statistik-og-analyser/sogning-og-optag-pa-videregaende-uddannelser/grundtal-om-sogning-og-optag/ansogere-og-optagne-fordelt-pa-kon-alder-og-adgangsgrundlag/ansogere-og-optagne-fordelt-pa-kon-alder-og-adgangsgrundlag'

# Send an HTTP request to the website
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Define the headers and sub-headers to iterate over
    headers = [
        ('Ansøgere', 'applicants_'),
        ('Optagne', 'admitted_')
    ]

    sub_headers = [
        ('Køn:', 'gender'),
        #('Alder:', 'age'),
        #('Adgangsgrundlag:', 'acceptance')
    ]

    # Define the range of years to download files for
    years = range(2009, 2023)

    # Loop over headers and sub-headers
    for h2_text, h2_prefix in headers:
        for h3_text, folder_name in sub_headers:

            # Construct the file prefix using h2_prefix and folder_name
            file_prefix = f"{h2_prefix}{folder_name}"

            # Create the folder for storing files
           
            create_folder(folder_name)

            # Download and save files
            download_and_save_files(h2_text, h3_text, folder_name, file_prefix, years)
else:
    print(f'Failed to fetch the website: {url}')



  h2_tag = soup.find('h2', text=h2_text)
  h3_tag = h2_tag.find_next('h3', text=h3_text)


## File Convertion

In [None]:
# Creating a function that convers all .xls and .xlsx files to .csv 
def convert_to_csv_and_delete(folder_name):
    for file in os.listdir(folder_name):
        file_path = os.path.join(folder_name, file)

        # Get the file extension
        file_ext = os.path.splitext(file_path)[1]

        # Check if the file has a .xls or .xlsx extension
        if file_ext == '.xls' or file_ext == '.xlsx':
            # Read the file using pandas
            df = pd.read_excel(file_path)

            # Construct the new file name with .csv extension
            new_file_name = os.path.splitext(file)[0] + '.csv'

            # Save the DataFrame as a CSV file
            df.to_csv(os.path.join(folder_name, new_file_name), index=False)

            # Delete the original .xls or .xlsx file
            os.remove(file_path)

# Convert .xls and .xlsx files to .csv format in the respective folders and delete the original files
folders = ['gender', 'age', 'acceptance']
for folder in folders:
    convert_to_csv_and_delete(folder)
