In [1]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import matplotlib.pyplot as plt
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import multiprocessing as mp
from webdriver_manager.chrome import ChromeDriverManager
from dotenv import load_dotenv
import traceback
import sys

# Get the data from the reports table

In [50]:
# Install Webdriver
service = Service()

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

driver.get("https://mrv.emsa.europa.eu/#public/emission-report")
time.sleep(60)

# html=driver.page_source
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="exportablegrid-1137-body"]')))


# Find and scrape the tables
tables = driver.find_element(By.XPATH, '//*[@id="exportablegrid-1137-body"]')
elements = tables.find_elements(By.TAG_NAME, 'table')

keys = ['Reporting Period', 'Version', 'Generation Date', 'File']
reports = []
for i, table in enumerate(elements):
    result_dict = {keys[i]: value for i, value in enumerate(table.text.split('\n')[1:])}
    reports.append(result_dict)
    
    print()
    
driver.quit()








# Compare versions of new reports and old reports

In [73]:
reports_df_old = pd.read_csv('../data/raw/reports_metadata.csv')

In [82]:
reports_df_old.dtypes

Reporting Period             int64
Version                      int64
Generation Date     datetime64[ns]
File                        object
dtype: object

In [78]:
reports_df_old['Generation Date'] = pd.to_datetime(reports_df_old['Generation Date'], dayfirst=True)

In [52]:
reports_new = reports

In [53]:
reports_old = [{'Reporting Period': '2022',
  'Version': '105',
  'Generation Date': '11/11/2023',
  'File': '2022-v105-09112023-EU MRV Publication of information'},
 {'Reporting Period': '2021',
  'Version': '176',
  'Generation Date': '13/10/2023',
  'File': '2021-v176-13102023-EU MRV Publication of information'},
 {'Reporting Period': '2020',
  'Version': '194',
  'Generation Date': '11/10/2023',
  'File': '2020-v194-11102023-EU MRV Publication of information'},
 {'Reporting Period': '2019',
  'Version': '217',
  'Generation Date': '11/10/2023',
  'File': '2019-v217-11102023-EU MRV Publication of information'},
 {'Reporting Period': '2018',
  'Version': '270',
  'Generation Date': '11/10/2023',
  'File': '2018-v270-11102023-EU MRV Publication of information'}]


In [54]:
reports_new

[{'Reporting Period': '2022',
  'Version': '108',
  'Generation Date': '16/11/2023',
  'File': '2022-v108-16112023-EU MRV Publication of information '},
 {'Reporting Period': '2021',
  'Version': '178',
  'Generation Date': '16/11/2023',
  'File': '2021-v178-16112023-EU MRV Publication of information '},
 {'Reporting Period': '2020',
  'Version': '194',
  'Generation Date': '11/10/2023',
  'File': '2020-v194-11102023-EU MRV Publication of information '},
 {'Reporting Period': '2019',
  'Version': '217',
  'Generation Date': '11/10/2023',
  'File': '2019-v217-11102023-EU MRV Publication of information '},
 {'Reporting Period': '2018',
  'Version': '270',
  'Generation Date': '11/10/2023',
  'File': '2018-v270-11102023-EU MRV Publication of information '}]

In [55]:
df_old = pd.DataFrame(reports_old)
df_new = pd.DataFrame(reports_new)

## Merge current and new dataframes on the 'Reporting Period' column


In [56]:
merged_df = pd.merge(df_old, df_new, on='Reporting Period', how='outer', suffixes=('_current', '_new'))

In [57]:
merged_df

Unnamed: 0,Reporting Period,Version_current,Generation Date_current,File_current,Version_new,Generation Date_new,File_new
0,2022,105,11/11/2023,2022-v105-09112023-EU MRV Publication of infor...,108,16/11/2023,2022-v108-16112023-EU MRV Publication of infor...
1,2021,176,13/10/2023,2021-v176-13102023-EU MRV Publication of infor...,178,16/11/2023,2021-v178-16112023-EU MRV Publication of infor...
2,2020,194,11/10/2023,2020-v194-11102023-EU MRV Publication of infor...,194,11/10/2023,2020-v194-11102023-EU MRV Publication of infor...
3,2019,217,11/10/2023,2019-v217-11102023-EU MRV Publication of infor...,217,11/10/2023,2019-v217-11102023-EU MRV Publication of infor...
4,2018,270,11/10/2023,2018-v270-11102023-EU MRV Publication of infor...,270,11/10/2023,2018-v270-11102023-EU MRV Publication of infor...


## Identify rows where the new version is greater than the current version

In [58]:
new_versions = merged_df[merged_df['Version_new'] > merged_df['Version_current']]


In [59]:
new_versions

Unnamed: 0,Reporting Period,Version_current,Generation Date_current,File_current,Version_new,Generation Date_new,File_new
0,2022,105,11/11/2023,2022-v105-09112023-EU MRV Publication of infor...,108,16/11/2023,2022-v108-16112023-EU MRV Publication of infor...
1,2021,176,13/10/2023,2021-v176-13102023-EU MRV Publication of infor...,178,16/11/2023,2021-v178-16112023-EU MRV Publication of infor...


In [60]:
df_old

Unnamed: 0,Reporting Period,Version,Generation Date,File
0,2022,105,11/11/2023,2022-v105-09112023-EU MRV Publication of infor...
1,2021,176,13/10/2023,2021-v176-13102023-EU MRV Publication of infor...
2,2020,194,11/10/2023,2020-v194-11102023-EU MRV Publication of infor...
3,2019,217,11/10/2023,2019-v217-11102023-EU MRV Publication of infor...
4,2018,270,11/10/2023,2018-v270-11102023-EU MRV Publication of infor...


## Update current data for rows with new versions

In [70]:
for index, row in new_versions.iterrows():
    df_old.loc[df_old['Reporting Period'] == row['Reporting Period'], 'Version'] = row['Version_new']
    df_old.loc[df_old['Reporting Period'] == row['Reporting Period'], 'Generation Date'] = row['Generation Date_new']
    df_old.loc[df_old['Reporting Period'] == row['Reporting Period'], 'File'] = row['File_new']
    print(type(row['File_new']))
     

<class 'str'>
<class 'str'>


In [2]:
import re

In [3]:
def extract_table_elements(data):
    # Define regular expressions to extract data
    reporting_period_pattern = re.compile(r"Reporting Period(\d+)")
    version_pattern = re.compile(r"Version(\d+)")
    generation_date_pattern = re.compile(r"Generation Date([\d/]+)")
    file_pattern = re.compile(r"File(.+)$")

    # Initialize variables to store extracted values
    reporting_period = None
    version = None
    generation_date = None
    file_data = None

    # Extract data using regular expressions
    match_reporting_period = reporting_period_pattern.search(data)
    if match_reporting_period:
        reporting_period = int(match_reporting_period.group(1))

    match_version = version_pattern.search(data)
    if match_version:
        version = int(match_version.group(1))

    match_generation_date = generation_date_pattern.search(data)
    if match_generation_date:
        generation_date = match_generation_date.group(1)

    match_file = file_pattern.search(data)
    if match_file:
        file_data = match_file.group(1).strip()

    # Create a dictionary with the extracted data
    data_dict = {
        "Reporting Period": reporting_period,
        "Version": version,
        "Generation Date": generation_date,
        "File": file_data,
    }

    # Print the dictionary
    print(data_dict)
    return data_dict

In [38]:
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_experimental_option("prefs", {"download.default_directory": "/tmp"})
driver = webdriver.Remote("http://localhost:4444", options=options)

try:
    print("Visiting the Thetis MRV website")

    driver.get("https://mrv.emsa.europa.eu/#public/emission-report")
    time.sleep(30)

    table_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="gridview-1152"]/div[2]'))
    )
    table_rows = table_element.find_elements(By.TAG_NAME, "tr")
    table_data = []
    for row in table_rows:
        row_data = []
        cells = row.find_elements(By.TAG_NAME, "td")
        for cell in cells:
            row_data.append(cell.text)
        table_data.append(row_data)
        
    print(table_data)
    result = []

    for row in table_data:
        data = {}
        data["Reporting Period"] = row[1].split("Reporting Period")[1]
        data["Version"] = row[2].split("Version")[1]
        data["Generation Date"] = row[3].split("Generation Date")[1]
        data["File"] = row[4].split("File")[1]
        result.append(data)

    print(result)
    df_new = pd.DataFrame(result)

except Exception as e:
    print(f"An error occurred while getting the data: {e}")
    print(traceback.format_exc())
finally:
    driver.quit()

Visiting the Thetis MRV website
[['Actions', 'Reporting Period2022', 'Version202', 'Generation Date17/05/2024', 'File2022-v202-17052024-EU MRV Publication of information '], ['Actions', 'Reporting Period2021', 'Version203', 'Generation Date29/04/2024', 'File2021-v203-29042024-EU MRV Publication of information '], ['Actions', 'Reporting Period2020', 'Version196', 'Generation Date14/05/2024', 'File2020-v196-14052024-EU MRV Publication of information '], ['Actions', 'Reporting Period2019', 'Version219', 'Generation Date14/05/2024', 'File2019-v219-14052024-EU MRV Publication of information '], ['Actions', 'Reporting Period2018', 'Version271', 'Generation Date14/05/2024', 'File2018-v271-14052024-EU MRV Publication of information ']]
[{'Reporting Period': '2022', 'Version': '202', 'Generation Date': '17/05/2024', 'File': '2022-v202-17052024-EU MRV Publication of information '}, {'Reporting Period': '2021', 'Version': '203', 'Generation Date': '29/04/2024', 'File': '2021-v203-29042024-EU MRV 

In [22]:
df_new.dtypes

Reporting Period    object
Version             object
Generation Date     object
File                object
dtype: object

In [39]:
def fix_column_types(df):
    df["Reporting Period"] = df[["Reporting Period"]].astype(int)
    df["Version"] = df[["Version"]].astype(int)
    df["Generation Date"] = pd.to_datetime(df["Generation Date"], dayfirst=True)

    return df

In [40]:
df_new1 = fix_column_types(df_new)

In [41]:
df_new1.dtypes

Reporting Period             int64
Version                      int64
Generation Date     datetime64[ns]
File                        object
dtype: object

In [42]:
df_new

Unnamed: 0,Reporting Period,Version,Generation Date,File
0,2022,202,2024-05-17,2022-v202-17052024-EU MRV Publication of infor...
1,2021,203,2024-04-29,2021-v203-29042024-EU MRV Publication of infor...
2,2020,196,2024-05-14,2020-v196-14052024-EU MRV Publication of infor...
3,2019,219,2024-05-14,2019-v219-14052024-EU MRV Publication of infor...
4,2018,271,2024-05-14,2018-v271-14052024-EU MRV Publication of infor...


In [43]:
df_new1

Unnamed: 0,Reporting Period,Version,Generation Date,File
0,2022,202,2024-05-17,2022-v202-17052024-EU MRV Publication of infor...
1,2021,203,2024-04-29,2021-v203-29042024-EU MRV Publication of infor...
2,2020,196,2024-05-14,2020-v196-14052024-EU MRV Publication of infor...
3,2019,219,2024-05-14,2019-v219-14052024-EU MRV Publication of infor...
4,2018,271,2024-05-14,2018-v271-14052024-EU MRV Publication of infor...


In [29]:
import os

In [30]:
os.getcwd()

'/Users/vasileiosvyzas/workspace/side-projects/ship-emissions-tracker/notebooks'

In [44]:
reports_df_old = pd.read_csv("../data/raw/reports_metadata.csv", )

In [45]:
reports_df_old.dtypes

Reporting Period     int64
Version              int64
Generation Date     object
File                object
dtype: object

In [46]:
df_new1

Unnamed: 0,Reporting Period,Version,Generation Date,File
0,2022,202,2024-05-17,2022-v202-17052024-EU MRV Publication of infor...
1,2021,203,2024-04-29,2021-v203-29042024-EU MRV Publication of infor...
2,2020,196,2024-05-14,2020-v196-14052024-EU MRV Publication of infor...
3,2019,219,2024-05-14,2019-v219-14052024-EU MRV Publication of infor...
4,2018,271,2024-05-14,2018-v271-14052024-EU MRV Publication of infor...


In [47]:
reports_df_old

Unnamed: 0,Reporting Period,Version,Generation Date,File
0,2022,119,2023-12-09,2022-v119-09122023-EU MRV Publication of infor...
1,2021,179,2023-12-04,2021-v179-04122023-EU MRV Publication of infor...
2,2020,194,2023-10-11,2020-v194-11102023-EU MRV Publication of infor...
3,2019,217,2023-10-11,2019-v217-11102023-EU MRV Publication of infor...
4,2018,270,2023-10-11,2018-v270-11102023-EU MRV Publication of infor...


In [48]:
merged_df = pd.merge(
    reports_df_old, df_new1, on="Reporting Period", how="outer", suffixes=("_current", "_new")
)
new_versions = merged_df[merged_df["Version_new"] > merged_df["Version_current"]]

In [49]:
new_versions

Unnamed: 0,Reporting Period,Version_current,Generation Date_current,File_current,Version_new,Generation Date_new,File_new
