### Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import chromedriver_autoinstaller
pd.set_option('display.max_columns', None)


### Main

In [2]:

# specify the path to chromedriver explicitly
#chromedriver_path = '../../src/chromedriver'  # Adjust the path if necessary
#service = Service(executable_path=chromedriver_path)


chromedriver_autoinstaller.install()

driver = webdriver.Chrome(service=Service())



In [3]:
driver.get('http://cab.inta-csic.es/rems/wp-content/plugins/marsweather-widget/widget.php?lang=en')


In [5]:

def get_a_data_row(html_content):
    
    # parse
    soup = BeautifulSoup(html_content, 'html.parser')

    # get sol
    sol_info = soup.find('div', class_='mw-current')  # Assuming the Sol info is within this class based on your description
    sol_text = sol_info.get_text(strip=True) if sol_info else "Sol information not available"
    sol_number = sol_text.split()[-1] if "Sol" in sol_text else "N/A"

    # get extra context
    earth_date = soup.find('span', id='mw-terrestrial_date').get_text(strip=True) if soup.find('span', id='mw-terrestrial_date') else "N/A"
    mars_season = soup.find('span', id='mw-season').get_text(strip=True) if soup.find('span', id='mw-season') else "N/A"
    ls = soup.find('span', id='mw-ls').get_text(strip=True) if soup.find('span', id='mw-ls') else "N/A"




    # for handling uv
    def get_uv_index_level(element):
        if element.find(class_='high'):
            return 'High'
        elif element.find(class_='moderate'):
            return 'Moderate'
        elif element.find(class_='low'):
            return 'Low'
        elif element.find(class_='very_high'):
            return 'Very High'
        elif element.find(class_='extreme'):
            return 'Extreme'
        return "Not available"



    # extract/organize
    data = []
    measurements = soup.find_all('div', class_='mw-measurement')
    for measurement in measurements:
        row = {
            'Sol': sol_number,
            'Earth Date': earth_date,
            'Mars Season': mars_season,
            'LS (°)': ls,
            'Measurement': None,
            'Max': None,
            'Min': None,
            'Current': None,
            'Unit': None,
        }
        
        title = measurement.find('div', class_='title').get_text(strip=True)
        max_value = measurement.find('div', class_='max')
        min_value = measurement.find('div', class_='min')
        current_value = measurement.find('div', class_='current')
        unit = measurement.find('div', class_='unit-title')

        row['Measurement'] = title
        if max_value:
            row['Max'] = max_value.find(class_='digit').get_text(strip=True)
        if min_value:
            row['Min'] = min_value.find(class_='digit').get_text(strip=True)
        
        # handling uv
        if 'Ultraviolet Radiation' in title and current_value:
            row['Current'] = get_uv_index_level(current_value)
        elif current_value and current_value.find(class_='digit'):
            row['Current'] = current_value.find(class_='digit').get_text(strip=True)
        
        if unit:
            row['Unit'] = unit.get_text(strip=True)

        data.append(row)

    # create df
    df = pd.DataFrame(data)


    # arrange the data
    df_pivot = df.pivot(index=['Sol', 'Earth Date', 'Mars Season', 'LS (°)'], columns='Measurement', values=['Max', 'Min', 'Current', 'Unit'])

    # fix the headers
    df_pivot.columns = ['_'.join(col[::-1]).strip() for col in df_pivot.columns.values]
    df_pivot.reset_index(inplace=True)

    # sort
    first_cols = ['Sol', 'Earth Date', 'Mars Season', 'LS (°)']
    remaining_cols_sorted = sorted(df_pivot.columns.drop(first_cols))
    cols_sorted = first_cols + remaining_cols_sorted

    df_pivot = df_pivot[cols_sorted]

    # format
    df_pivot.columns = [col.lower().replace(' ', '_') for col in df_pivot.columns]

    # rename cols
    df_pivot = df_pivot.rename(columns={'ls_(°)': 'solar_longitude_degrees', 'earth_date': 'earth_date_utc'})
    
    return df_pivot


# test
html_content = driver.page_source
html_content
get_a_data_row(html_content)



Unnamed: 0,sol,earth_date_utc,mars_season,solar_longitude_degrees,air_temperature_current,air_temperature_max,air_temperature_min,air_temperature_unit,atmospheric_opacity_current,atmospheric_opacity_max,atmospheric_opacity_min,atmospheric_opacity_unit,ground_temperature_current,ground_temperature_max,ground_temperature_min,ground_temperature_unit,pressure_current,pressure_max,pressure_min,pressure_unit,relative_humidity_current,relative_humidity_max,relative_humidity_min,relative_humidity_unit,sunrise_and_sunset_current,sunrise_and_sunset_max,sunrise_and_sunset_min,sunrise_and_sunset_unit,ultraviolet_radiation_current,ultraviolet_radiation_max,ultraviolet_radiation_min,ultraviolet_radiation_unit,wind_current,wind_max,wind_min,wind_unit
0,Sol4081,2024-01-29,Month 7,189,,-1,-69,°C,Sunny,,,,,13,-78,°C,730,,,Pa,Value not available,,,%,,05:18,17:21,,High,,,,Value not available,,,Km/h


In [8]:
timeout = 5
d_rows = pd.DataFrame()  # Use a list to collect rows

#for item in range(4081):
for item in range(4081):
    try:
        # wait for loading
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.ID, 'mw-terrestrial_date')))
        
        widget_content = driver.page_source
        data_row = get_a_data_row(widget_content)
        d_rows = pd.concat([d_rows, data_row], ignore_index=True)  # Append the row to the list


        # go back a date
        goback = driver.find_element(By.XPATH, '//a[@href="#previous-sol"]')
        goback.click()

    except TimeoutException:
        print("Timed out waiting for page to load")
    except Exception as e:
        print(f"An error occurred: {e}")
        break

# check
display(d_rows)



Unnamed: 0,sol,earth_date_utc,mars_season,solar_longitude_degrees,air_temperature_current,air_temperature_max,air_temperature_min,air_temperature_unit,atmospheric_opacity_current,atmospheric_opacity_max,atmospheric_opacity_min,atmospheric_opacity_unit,ground_temperature_current,ground_temperature_max,ground_temperature_min,ground_temperature_unit,pressure_current,pressure_max,pressure_min,pressure_unit,relative_humidity_current,relative_humidity_max,relative_humidity_min,relative_humidity_unit,sunrise_and_sunset_current,sunrise_and_sunset_max,sunrise_and_sunset_min,sunrise_and_sunset_unit,ultraviolet_radiation_current,ultraviolet_radiation_max,ultraviolet_radiation_min,ultraviolet_radiation_unit,wind_current,wind_max,wind_min,wind_unit
0,Sol4081,2024-01-29,Month 7,189,,-1,-69,°C,Sunny,,,,,13,-78,°C,730,,,Pa,Value not available,,,%,,05:18,17:21,,High,,,,Value not available,,,Km/h
1,Sol4080,2024-01-28,Month 7,188,,-2,-72,°C,Sunny,,,,,14,-78,°C,729,,,Pa,Value not available,,,%,,05:18,17:21,,High,,,,Value not available,,,Km/h
2,Sol4079,2024-01-27,Month 7,188,,2,-75,°C,Sunny,,,,,14,-78,°C,729,,,Pa,Value not available,,,%,,05:18,17:21,,High,,,,Value not available,,,Km/h
3,Sol4078,2024-01-26,Month 7,187,,3,-73,°C,Sunny,,,,,16,-84,°C,728,,,Pa,Value not available,,,%,,05:18,17:21,,High,,,,Value not available,,,Km/h
4,Sol4077,2024-01-25,Month 7,186,,0,-70,°C,Sunny,,,,,15,-75,°C,728,,,Pa,Value not available,,,%,,05:18,17:20,,High,,,,Value not available,,,Km/h
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,Sol1,2012-08-07,Month 6,150,,Value not available,Value not available,°C,Sunny,,,,,Value not available,Value not available,°C,Value not available,,,Pa,Value not available,,,%,,05:30,17:22,,Not available,,,,Value not available,,,Km/h
4077,Sol1,2012-08-07,Month 6,150,,Value not available,Value not available,°C,Sunny,,,,,Value not available,Value not available,°C,Value not available,,,Pa,Value not available,,,%,,05:30,17:22,,Not available,,,,Value not available,,,Km/h
4078,Sol1,2012-08-07,Month 6,150,,Value not available,Value not available,°C,Sunny,,,,,Value not available,Value not available,°C,Value not available,,,Pa,Value not available,,,%,,05:30,17:22,,Not available,,,,Value not available,,,Km/h
4079,Sol1,2012-08-07,Month 6,150,,Value not available,Value not available,°C,Sunny,,,,,Value not available,Value not available,°C,Value not available,,,Pa,Value not available,,,%,,05:30,17:22,,Not available,,,,Value not available,,,Km/h


In [10]:
# save
d_rows.to_csv(f'../../../data/raw/public_sector/mars_weather_from_widget/mars_weather_from_widget_raw.csv', index=False)