In [67]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import matplotlib.pyplot as plt
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import multiprocessing as mp
from webdriver_manager.chrome import ChromeDriverManager
from dotenv import load_dotenv
import traceback
import sys

# Get the data from the reports table

In [50]:
# Install Webdriver
service = Service()

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

driver.get("https://mrv.emsa.europa.eu/#public/emission-report")
time.sleep(60)

# html=driver.page_source
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="exportablegrid-1137-body"]')))


# Find and scrape the tables
tables = driver.find_element(By.XPATH, '//*[@id="exportablegrid-1137-body"]')
elements = tables.find_elements(By.TAG_NAME, 'table')

keys = ['Reporting Period', 'Version', 'Generation Date', 'File']
reports = []
for i, table in enumerate(elements):
    result_dict = {keys[i]: value for i, value in enumerate(table.text.split('\n')[1:])}
    reports.append(result_dict)
    
    print()
    
driver.quit()








# Compare versions of new reports and old reports

In [73]:
reports_df_old = pd.read_csv('../data/raw/reports_metadata.csv')

In [82]:
reports_df_old.dtypes

Reporting Period             int64
Version                      int64
Generation Date     datetime64[ns]
File                        object
dtype: object

In [78]:
reports_df_old['Generation Date'] = pd.to_datetime(reports_df_old['Generation Date'], dayfirst=True)

In [52]:
reports_new = reports

In [53]:
reports_old = [{'Reporting Period': '2022',
  'Version': '105',
  'Generation Date': '11/11/2023',
  'File': '2022-v105-09112023-EU MRV Publication of information'},
 {'Reporting Period': '2021',
  'Version': '176',
  'Generation Date': '13/10/2023',
  'File': '2021-v176-13102023-EU MRV Publication of information'},
 {'Reporting Period': '2020',
  'Version': '194',
  'Generation Date': '11/10/2023',
  'File': '2020-v194-11102023-EU MRV Publication of information'},
 {'Reporting Period': '2019',
  'Version': '217',
  'Generation Date': '11/10/2023',
  'File': '2019-v217-11102023-EU MRV Publication of information'},
 {'Reporting Period': '2018',
  'Version': '270',
  'Generation Date': '11/10/2023',
  'File': '2018-v270-11102023-EU MRV Publication of information'}]


In [54]:
reports_new

[{'Reporting Period': '2022',
  'Version': '108',
  'Generation Date': '16/11/2023',
  'File': '2022-v108-16112023-EU MRV Publication of information '},
 {'Reporting Period': '2021',
  'Version': '178',
  'Generation Date': '16/11/2023',
  'File': '2021-v178-16112023-EU MRV Publication of information '},
 {'Reporting Period': '2020',
  'Version': '194',
  'Generation Date': '11/10/2023',
  'File': '2020-v194-11102023-EU MRV Publication of information '},
 {'Reporting Period': '2019',
  'Version': '217',
  'Generation Date': '11/10/2023',
  'File': '2019-v217-11102023-EU MRV Publication of information '},
 {'Reporting Period': '2018',
  'Version': '270',
  'Generation Date': '11/10/2023',
  'File': '2018-v270-11102023-EU MRV Publication of information '}]

In [55]:
df_old = pd.DataFrame(reports_old)
df_new = pd.DataFrame(reports_new)

## Merge current and new dataframes on the 'Reporting Period' column


In [56]:
merged_df = pd.merge(df_old, df_new, on='Reporting Period', how='outer', suffixes=('_current', '_new'))

In [57]:
merged_df

Unnamed: 0,Reporting Period,Version_current,Generation Date_current,File_current,Version_new,Generation Date_new,File_new
0,2022,105,11/11/2023,2022-v105-09112023-EU MRV Publication of infor...,108,16/11/2023,2022-v108-16112023-EU MRV Publication of infor...
1,2021,176,13/10/2023,2021-v176-13102023-EU MRV Publication of infor...,178,16/11/2023,2021-v178-16112023-EU MRV Publication of infor...
2,2020,194,11/10/2023,2020-v194-11102023-EU MRV Publication of infor...,194,11/10/2023,2020-v194-11102023-EU MRV Publication of infor...
3,2019,217,11/10/2023,2019-v217-11102023-EU MRV Publication of infor...,217,11/10/2023,2019-v217-11102023-EU MRV Publication of infor...
4,2018,270,11/10/2023,2018-v270-11102023-EU MRV Publication of infor...,270,11/10/2023,2018-v270-11102023-EU MRV Publication of infor...


## Identify rows where the new version is greater than the current version

In [58]:
new_versions = merged_df[merged_df['Version_new'] > merged_df['Version_current']]


In [59]:
new_versions

Unnamed: 0,Reporting Period,Version_current,Generation Date_current,File_current,Version_new,Generation Date_new,File_new
0,2022,105,11/11/2023,2022-v105-09112023-EU MRV Publication of infor...,108,16/11/2023,2022-v108-16112023-EU MRV Publication of infor...
1,2021,176,13/10/2023,2021-v176-13102023-EU MRV Publication of infor...,178,16/11/2023,2021-v178-16112023-EU MRV Publication of infor...


In [60]:
df_old

Unnamed: 0,Reporting Period,Version,Generation Date,File
0,2022,105,11/11/2023,2022-v105-09112023-EU MRV Publication of infor...
1,2021,176,13/10/2023,2021-v176-13102023-EU MRV Publication of infor...
2,2020,194,11/10/2023,2020-v194-11102023-EU MRV Publication of infor...
3,2019,217,11/10/2023,2019-v217-11102023-EU MRV Publication of infor...
4,2018,270,11/10/2023,2018-v270-11102023-EU MRV Publication of infor...


## Update current data for rows with new versions

In [70]:
for index, row in new_versions.iterrows():
    df_old.loc[df_old['Reporting Period'] == row['Reporting Period'], 'Version'] = row['Version_new']
    df_old.loc[df_old['Reporting Period'] == row['Reporting Period'], 'Generation Date'] = row['Generation Date_new']
    df_old.loc[df_old['Reporting Period'] == row['Reporting Period'], 'File'] = row['File_new']
    print(type(row['File_new']))
     

<class 'str'>
<class 'str'>


In [64]:
df_old

Unnamed: 0,Reporting Period,Version,Generation Date,File
0,2022,108,16/11/2023,2022-v108-16112023-EU MRV Publication of infor...
1,2021,178,16/11/2023,2021-v178-16112023-EU MRV Publication of infor...
2,2020,194,11/10/2023,2020-v194-11102023-EU MRV Publication of infor...
3,2019,217,11/10/2023,2019-v217-11102023-EU MRV Publication of infor...
4,2018,270,11/10/2023,2018-v270-11102023-EU MRV Publication of infor...


In [None]:
df

# Click on links

In [86]:
service = Service()

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

driver.get("https://mrv.emsa.europa.eu/#public/emission-report")

# Adjust sleep time if needed, waiting for page elements to load
time.sleep(60)

reports_test = ['2018-v270-11102023-EU MRV Publication of information', '2019-v217-11102023-EU MRV Publication of information', '2020-v194-11102023-EU MRV Publication of information']

try:
    for report in reports_test:
        print(f"===={report}=====")
        link = WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.LINK_TEXT, report))
        )
        time.sleep(10)
        print('clicking the link')
        link.click()
        time.sleep(10)
    
except Exception as e:
    print(f"An error occurred: {e}")
    print(traceback.format_exc())
finally:
    driver.quit()

====2018-v270-11102023-EU MRV Publication of information=====
clicking the link


KeyboardInterrupt: 

In [None]:
service = Service()

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

driver.get("https://mrv.emsa.europa.eu/#public/emission-report")

# Adjust sleep time if needed, waiting for page elements to load
time.sleep(60)

report1 = '2018-v270-11102023-EU MRV Publication of information'
report2 = '2019-v217-11102023-EU MRV Publication of information'

try:
    link = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.LINK_TEXT, report1))
    )
    time.sleep(10)
    print('clicking the link')
    link.click()
    time.sleep(10)
    
    link2 = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.LINK_TEXT, report2))
    )
    time.sleep(10)
    print('clicking the link')
    link2.click()
    time.sleep(10)
    
except Exception as e:
    print(f"An error occurred: {e}")
    print(traceback.format_exc())
finally:
    driver.quit()