# ***Importing***

In [None]:
# from bs4 import BeautifulSoup # Using lxml instead
from lxml import html
# import requests # Using httpx instead
# import aiohttp # Using httpx instead
import httpx
import pandas as pd
import numpy as py
import os
import time
from urllib.parse import urlparse
import asyncio
import nest_asyncio

# Working directory for this notebook; the relative CSV path below is resolved
# against it after the chdir.
directory = 'G:\\Other computers\\My Laptop\\Script\\Web Scraping Project\\Tutorials\\Asyncio'
os.chdir(directory)

# Input table. The scraping code further down reads its 'esco_link' column by
# row label (df.loc[row_num, 'esco_link']).
df = pd.read_csv("df_nocode.csv")
# Preview of the first 25 rows (last expression of the cell).
df.head(25)

# ***Scrape (httpx, lxml)***

In [None]:
# Allow running in Jupyter: nest_asyncio patches asyncio so the async code
# below can be driven from inside the notebook (which presumably already has
# an event loop running — confirm for the target environment).
nest_asyncio.apply()

# Async Functions
async def fetch(session, url, row_num, attempts=9, delay=5):
    """Request ``url`` until it answers with HTTP 200 or the attempts run out.

    Args:
        session: Object exposing an awaitable ``get(url)`` whose result has a
            ``status_code`` attribute (``main`` passes an ``httpx.AsyncClient``).
        url: Address to request.
        row_num: Row identifier; used only in the progress messages.
        attempts: Maximum number of requests to send.
        delay: Seconds to wait between two consecutive requests.

    Returns:
        The first response whose status code is 200, or ``None`` when every
        attempt returned another status (or ``attempts`` is not positive).

    Note:
        Exceptions raised by ``session.get`` are not caught here and
        propagate to the caller.
    """
    for attempt in range(1, attempts + 1):
        resp = await session.get(url)
        if resp.status_code == 200:
            return resp
        if attempt < attempts:
            # Only wait when another request will actually follow; sleeping
            # after the final failure would just delay the caller.
            await asyncio.sleep(delay)
            # Report the real attempt budget instead of a hard-coded total.
            print(f"Reconnecting at {row_num}... Attempt {attempt + 1}/{attempts}")
    print(f"Couldn't connect after {attempts} attempts in {row_num}")
    return None

async def scrape(session, url, row_num):
    """Fetch ``url`` and extract the code text from the page's description.

    Args:
        session: Client object forwarded unchanged to ``fetch``.
        url: Page address to download.
        row_num: Row identifier; used only in the log messages.

    Returns:
        The non-empty list of text nodes matched by the XPath below, or
        ``None`` when the page could not be fetched or nothing matched.
    """
    page = await fetch(session, url, row_num)
    if page is None:
        # fetch() returns None only after all its attempts failed, so this
        # is a download failure, not a parsing failure.
        print(f'At {row_num}, page could not be fetched')
        return None

    raw_code = html.fromstring(page.text)
    # Text of the second <p> inside the first <div> under the element with
    # id="description".
    code = raw_code.xpath('//*[@id="description"]/div[1]/p[2]/text()')
    if code:
        return code
    print(f'At {row_num}, code was not found')
    return None
        
async def processing(session, row_num):
    """Build the ESCO classification URL for one row of ``df`` and scrape it.

    The row's 'esco_link' value is reduced to the last segment of its URL
    path, which is then embedded in the classification page address: under
    the 'occupation' namespace when the original link contains the word
    'occupation', and under the 'isco' namespace otherwise.

    Args:
        session: Client object forwarded unchanged to ``scrape``.
        row_num: Label of the row in the module-level ``df`` to process.

    Returns:
        Whatever ``scrape`` returns for the rebuilt URL.
    """
    source_url = df.loc[row_num, 'esco_link']
    posted_link = urlparse(source_url).path.rsplit('/', 1)[-1]

    # The stored link is not requested directly; the full classification page
    # address is rebuilt from its identifier instead (the original note says
    # plain requests could not follow the shortened/changed links).
    if 'occupation' not in source_url:
        esco_link = f"https://esco.ec.europa.eu/en/classification/occupation?uri=http%3A%2F%2Fdata.europa.eu%2Fesco%2Fisco%2F{posted_link}"
        return await scrape(session, esco_link, row_num)

    esco_link = f"https://esco.ec.europa.eu/en/classification/occupation?uri=http%3A%2F%2Fdata.europa.eu%2Fesco%2Foccupation%2F{posted_link}"
    return await scrape(session, esco_link, row_num)

async def main(batches=5, delay=2, total_url=21):
    """Scrape rows ``0 .. total_url - 1`` of ``df`` in concurrent batches.

    Args:
        batches: Number of rows processed concurrently in one batch.
        delay: Seconds to pause between two consecutive batches.
        total_url: Number of rows to process, starting at row 0. Defaults to
            21, the value that was previously hard-coded.

    Returns:
        A list with one entry per processed row, in row order; each entry is
        whatever ``processing`` returned for that row.
    """
    code_list = []

    # TODO: load previously saved results here (wrap in try/except).

    async with httpx.AsyncClient(timeout=100) as session:
        for batch_start in range(0, total_url, batches):
            batch_end = min(batch_start + batches, total_url)

            tasks = [processing(session, row_num) for row_num in range(batch_start, batch_end)]
            # gather() returns results in the order the tasks were passed,
            # so code_list stays aligned with the row numbers.
            results = await asyncio.gather(*tasks)

            code_list.extend(results)
            print(f"Batches {batch_start} to {batch_end} done!")

            # TODO: save intermediate results here; for now just print them.
            print(code_list)

            # Pause only when another batch follows; waiting after the last
            # one would only delay the return.
            if batch_end < total_url:
                await asyncio.sleep(delay)

    return code_list

# Measure execution time of a full run of main() with its default arguments.
# NOTE: the bare top-level `await` needs an environment that supports it
# (e.g. a notebook cell); it is not valid in a plain Python script.
# The list returned by main() is not kept here.
start_time = time.perf_counter()
await main()
end_time = time.perf_counter()
print(f"Time elapsed: {end_time - start_time:.6f} seconds")