In [2]:
import calendar
from datetime import datetime, timedelta
from uuid import UUID

import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# from src.utils.data_log_config import logger


#### Get news links

In [None]:
# Zero-based index of the listing page to request (sent as the `p` query parameter).
page_no = 0

# FXStreet news listing URL filtered to Category=News and Tags=EURUSD.
url = (
    "https://www.fxstreet.com/news/latest/asset"
    "?q=&hPP=17&idx=FxsIndexPro"
    f"&p={page_no}"
    "&dFR%5BCategory%5D%5B0%5D=News"
    "&dFR%5BTags%5D%5B0%5D=EURUSD"
)

In [None]:
# Launch a Chrome session and navigate it to the listing URL built above.
# NOTE(review): `page` is not read anywhere in the visible cells; Selenium documents
# `WebDriver.get` as returning None, so it presumably holds nothing useful — confirm.
wd = webdriver.Chrome()
page = wd.get(url)

In [None]:
# Collect every element on the current page that carries the "ais-hits--item" class;
# each one is treated as a single news box by the cells below.
news_boxes = wd.find_elements(By.CLASS_NAME, "ais-hits--item")

In [None]:
# Sanity check: how many news boxes were found on the page.
len(news_boxes)

In [None]:
# news_box = news_boxes[0]

def get_news_data(news_box: WebElement) -> dict:
    """Pull the timestamp, headline and article URL out of one news box.

    Args:
        news_box: Element that contains a ``fxs_headline_tiny`` headline
            (wrapping an ``<a>`` link) and a ``<time>`` tag.

    Returns:
        A dict with the keys ``"date_time"`` (the ``datetime`` attribute of the
        ``<time>`` tag), ``"headline"`` (visible headline text) and ``"link"``
        (the ``href`` of the headline's anchor).
    """
    title_el = news_box.find_element(By.CLASS_NAME, "fxs_headline_tiny")
    title = title_el.text
    href = title_el.find_element(By.TAG_NAME, "a").get_attribute("href")
    stamp = news_box.find_element(By.TAG_NAME, "time").get_attribute("datetime")

    return dict(date_time=stamp, headline=title, link=href)


In [None]:
# Parse the first news box on the page as a sample. `news_box` has to be bound here:
# its only earlier module-level assignment is commented out, so a fresh
# top-to-bottom run would otherwise fail with NameError.
news_box = news_boxes[0]
news_data = get_news_data(news_box)

In [None]:
# First 10 characters of the timestamp; the scraping loop below compares this prefix
# against "YYYY-MM-DD" style strings, so it is presumably the calendar date.
news_data["date_time"][:10]

#### Get article text

In [None]:
def get_article_text(article_link: str, timeout: float = 30.0) -> str:
    """Download an article page and return the text of its body container.

    Args:
        article_link: URL of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The text content of the first ``<div class="fxs_article_body">`` on the page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page contains no ``fxs_article_body`` div.
    """
    response = requests.get(article_link, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page.
    response.raise_for_status()

    article_page = bs(response.content, "html.parser")
    body = article_page.find("div", class_="fxs_article_body")
    if body is None:
        # Previously surfaced as an opaque AttributeError on None.
        raise ValueError(f"No 'fxs_article_body' element found at {article_link}")
    return body.text

In [None]:
# Fetch the body text of the sample article parsed above.
article_text = get_article_text(news_data["link"])

#### Page parsing logic

In [None]:
def _scrape_news_page(wd, page_no: int) -> list:
    """Load one FXStreet EURUSD news listing page and parse every news box on it."""
    url = f"https://www.fxstreet.com/news/latest/asset?q=&hPP=17&idx=FxsIndexPro&p={page_no}&dFR%5BCategory%5D%5B0%5D=News&dFR%5BTags%5D%5B0%5D=EURUSD"
    wd.get(url)
    # Block (up to 10 s) until at least one news box is present before reading them.
    WebDriverWait(wd, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "ais-hits--item"))
    )
    news_boxes = wd.find_elements(By.CLASS_NAME, "ais-hits--item")
    return [get_news_data(news_box) for news_box in news_boxes]


def scrape_news_data(
    news_data_list: list = None,
    page_no: int = None,
    stop_date: str = "2013-12-31",
) -> tuple[list, int]:
    """Scrape FXStreet EURUSD news listings page by page until ``stop_date`` is reached.

    A fresh run (``news_data_list`` omitted) starts at page 0. To resume an
    interrupted run, pass the list and page number returned by the previous call;
    new records are appended to the supplied list in place.

    Args:
        news_data_list: Records collected so far, or None to start from scratch.
        page_no: Zero-based index of the next page to scrape. Ignored (reset to 0)
            when ``news_data_list`` is None; required when resuming with a
            non-empty list.
        stop_date: ``YYYY-MM-DD`` string. Scraping stops once the last record of a
            page has a ``date_time`` prefix on or before this date. Assumes the
            first 10 characters of ``date_time`` are an ISO date and that the
            listing is ordered newest first — TODO confirm against the site.

    Returns:
        ``(news_data_list, page_no)`` where ``page_no`` is the index of the next
        page that has not been scraped yet. On failure the partial results are
        returned so the caller can resume.

    Raises:
        ValueError: If a non-empty ``news_data_list`` is given without ``page_no``.
    """
    if news_data_list is None:
        news_data_list = []
        page_no = 0
    elif page_no is None:
        if news_data_list:
            # Guessing a page here would either re-scrape or skip pages.
            raise ValueError("page_no is required when resuming with existing news data")
        page_no = 0

    wd = webdriver.Chrome()
    try:
        while True:
            # A page is added only once it has been parsed completely, so after a
            # failure the list and page_no stay consistent for resuming.
            news_data_list.extend(_scrape_news_page(wd, page_no))
            page_no += 1

            # `<=` instead of `==`: also stop when no article is dated exactly stop_date.
            if news_data_list[-1]["date_time"][:10] <= stop_date:
                break
    except Exception as e:
        # Best effort: report the failure and hand back what was collected so far.
        print(f"scraping failed at page {page_no} || {len(news_data_list)} news articles scraped\n", e)
    else:
        # page_no already points at the next page, so it equals the number of pages done.
        print(f"Successfully scraped {page_no} pages and {len(news_data_list)} news articles")
    finally:
        wd.quit()

    return news_data_list, page_no
        

In [None]:
# Scratch check of the date-prefix comparison that scrape_news_data uses as its
# stop condition, run against the single sample record.
news_data_list = [news_data]
news_data_list[-1]["date_time"][:10] == "2024-11-22"

In [None]:
# Run a full scrape from page 0.
# NOTE(review): scrape_news_data returns a (news_data_list, page_no) pair, so
# `news_data_test` is that pair rather than the list itself.
news_data_test = scrape_news_data()

In [None]:
# Element 0 of the returned pair, i.e. the whole list of article records
# (not the first article) — this can be a very large output.
news_data_test[0]

### Back to Forex Factory

In [2]:
# Forex Factory calendar page, limited by the `range` query parameter to
# may1.2014-jun16.2014. Note: this rebinds `url`, which earlier held the FXStreet URL.
url = "https://www.forexfactory.com/calendar?range=may1.2014-jun16.2014"

In [6]:
# Open the calendar page in a new, maximized Chrome window.
# NOTE(review): page_load_strategy 'none' presumably makes `get` return without
# waiting for the page to finish loading, which is why an explicit wait follows — confirm.
# NOTE(review): `wd` is rebound here; if an earlier session is still open it is not
# quit by this cell.
options = webdriver.ChromeOptions()
options.page_load_strategy = 'none'
wd = webdriver.Chrome(options=options)
wd.maximize_window()
page = wd.get(url)

# Wait up to 10 s until the impact cells (spans with a `title` attribute) are present...
WebDriverWait(wd, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR,
    "td.calendar__cell.calendar__impact span[title]"))
)
# ...then ask the browser to stop loading the rest of the page.
wd.execute_script("window.stop();")


In [4]:
# Grab every <tbody> element on the page.
data_rows = wd.find_elements(By.TAG_NAME, "tbody")

In [5]:
# Use the second <tbody> found on the page as a sample section.
sample = data_rows[1]

# Inside that section: the elements with the "calendar__currency" class, and all rows.
eur_data = sample.find_elements(By.CLASS_NAME, "calendar__currency")
one_data = sample.find_elements(By.TAG_NAME, "tr")

# Display how many <tr> rows the sample section contains.
len(one_data)

18

In [8]:
# Visible text of the first element with class "date" in the sample section.
date = sample.find_element(By.CLASS_NAME, "date").text
date

'Thu\nJan 2'

In [9]:
# Pick the fourth <tr> of the sample section as the row to inspect.
row = one_data[3]

In [24]:
# Read the `title` attribute of the span inside the row's impact cell.
row.find_element(By.CSS_SELECTOR, "td.calendar__cell.calendar__impact span[title]").get_attribute("title")

'Medium Impact Expected'

In [13]:
# Re-select the fourth row of the sample section.
row = one_data[3]

# Click the row's "calendar__detail" element, then wait up to 10 s for elements with
# the "flexposts__storydisplay-info" class to be present before stopping further loading.
row.find_element(By.CLASS_NAME, "calendar__detail").click()
WebDriverWait(wd, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "flexposts__storydisplay-info"))
)
wd.execute_script("window.stop();")

In [8]:
# Reload the current page in the browser.
# NOTE(review): element references obtained before a reload are presumably stale
# afterwards and need to be looked up again — confirm.
wd.refresh()

In [14]:
# Show the URL the browser is currently on.
wd.current_url

'https://www.forexfactory.com/calendar?range=jan1.2014-nov16.2024#detail=50456'

In [15]:
# Collect every element with the "flexposts__storydisplay-info" class on the page.
updated_data_rows = wd.find_elements(By.CLASS_NAME, "flexposts__storydisplay-info")

In [16]:
# Target URL (`href`) of the first link inside the first story-info element.
updated_data_rows[0].find_element(By.TAG_NAME, "a").get_attribute("href")

'https://www.forexfactory.com/news/464650-audusd-technicals-early-bears-overextending-on-china/hit'

In [6]:
def generate_date_ranges(start_year, end_year):
    """Build consecutive two-month date ranges covering whole calendar years.

    Every year from ``start_year`` to ``end_year`` (inclusive) is split into six
    windows: Jan-Feb, Mar-Apr, ..., Nov-Dec. Each window runs from the first day
    of its first month to the last day of its second month.

    Args:
        start_year: First year to cover.
        end_year: Last year to cover (inclusive).

    Returns:
        A list of lowercase strings such as ``"jan01.2014-feb28.2014"``, in
        chronological order. Empty when ``start_year > end_year``.
    """
    date_ranges = []

    for year in range(start_year, end_year + 1):
        # Windows open on odd months and close on the following even month, so a
        # window never crosses a year boundary.
        for month in range(1, 13, 2):
            closing_month = month + 1
            # monthrange gives the month length for this year, leap years included.
            last_day = calendar.monthrange(year, closing_month)[1]

            start_date = f"{datetime(year, month, 1):%b%d.%Y}".lower()
            end_date = f"{datetime(year, closing_month, last_day):%b%d.%Y}".lower()
            date_ranges.append(f"{start_date}-{end_date}")

    return date_ranges

In [7]:
# Generate the two-month date ranges for every year from 2014 to 2024 (inclusive)
ranges = generate_date_ranges(2014, 2024)

In [9]:
# Last four characters of the first range string, i.e. the year of its end date.
ranges[0][-4:]

'2014'

In [8]:
# Print every generated range on its own line for a visual check.
for r in ranges:
    print(r)

jan01.2014-feb28.2014
mar01.2014-apr30.2014
may01.2014-jun30.2014
jul01.2014-aug31.2014
sep01.2014-oct31.2014
nov01.2014-dec31.2014
jan01.2015-feb28.2015
mar01.2015-apr30.2015
may01.2015-jun30.2015
jul01.2015-aug31.2015
sep01.2015-oct31.2015
nov01.2015-dec31.2015
jan01.2016-feb29.2016
mar01.2016-apr30.2016
may01.2016-jun30.2016
jul01.2016-aug31.2016
sep01.2016-oct31.2016
nov01.2016-dec31.2016
jan01.2017-feb28.2017
mar01.2017-apr30.2017
may01.2017-jun30.2017
jul01.2017-aug31.2017
sep01.2017-oct31.2017
nov01.2017-dec31.2017
jan01.2018-feb28.2018
mar01.2018-apr30.2018
may01.2018-jun30.2018
jul01.2018-aug31.2018
sep01.2018-oct31.2018
nov01.2018-dec31.2018
jan01.2019-feb28.2019
mar01.2019-apr30.2019
may01.2019-jun30.2019
jul01.2019-aug31.2019
sep01.2019-oct31.2019
nov01.2019-dec31.2019
jan01.2020-feb29.2020
mar01.2020-apr30.2020
may01.2020-jun30.2020
jul01.2020-aug31.2020
sep01.2020-oct31.2020
nov01.2020-dec31.2020
jan01.2021-feb28.2021
mar01.2021-apr30.2021
may01.2021-jun30.2021
jul01.2021

In [25]:
# Scratch check of list membership on a list of lowercase currency codes.
symbols = ['eur', 'usd']

if 'eur' in symbols:
    print('yes')

yes


In [14]:
# def __generate_date_ranges(start_year: int, end_year: int) -> list[str]:
#     date_ranges = []

#     # Start from the beginning of the start year
#     current_date = datetime(start_year, 1, 1)

#     # End at the end of the end year
#     end_date = datetime(end_year, 12, 31)

#     # Generate weekly ranges
#     while current_date <= end_date:
#         # Start of the week
#         start_date_str = current_date.strftime("%b%d.%Y").lower()

#         # End of the week (7 days later, capped at end_date)
#         end_of_week = current_date + timedelta(days=6)
#         if end_of_week > end_date:
#             end_of_week = end_date  # Ensure the range doesn't exceed the final date
#         end_date_str = end_of_week.strftime("%b%d.%Y").lower()

#         # Add the range to the list
#         date_ranges.append(f"{start_date_str}-{end_date_str}")

#         # Move to the next week
#         current_date += timedelta(days=7)

#     return date_ranges

In [11]:
# Generate the date ranges from 2014 to 2024
# NOTE(review): the only active definition of `__generate_date_ranges` visible in this
# notebook sits in a later cell (the version in the cell above is commented out), so on a
# fresh top-to-bottom run this call would presumably raise NameError — confirm and reorder.
ranges = __generate_date_ranges(2014, 2024)

In [10]:
# Number of generated ranges, stored and displayed.
len_ranges = len(ranges)
len_ranges

574

In [12]:
# Preview the first 100 range strings.
ranges[:100]

['jan01.2014-jan07.2014',
 'jan08.2014-jan14.2014',
 'jan15.2014-jan21.2014',
 'jan22.2014-jan28.2014',
 'jan29.2014-feb04.2014',
 'feb05.2014-feb11.2014',
 'feb12.2014-feb18.2014',
 'feb19.2014-feb25.2014',
 'feb26.2014-mar04.2014',
 'mar05.2014-mar11.2014',
 'mar12.2014-mar18.2014',
 'mar19.2014-mar25.2014',
 'mar26.2014-apr01.2014',
 'apr02.2014-apr08.2014',
 'apr09.2014-apr15.2014',
 'apr16.2014-apr22.2014',
 'apr23.2014-apr29.2014',
 'apr30.2014-may06.2014',
 'may07.2014-may13.2014',
 'may14.2014-may20.2014',
 'may21.2014-may27.2014',
 'may28.2014-jun03.2014',
 'jun04.2014-jun10.2014',
 'jun11.2014-jun17.2014',
 'jun18.2014-jun24.2014',
 'jun25.2014-jul01.2014',
 'jul02.2014-jul08.2014',
 'jul09.2014-jul15.2014',
 'jul16.2014-jul22.2014',
 'jul23.2014-jul29.2014',
 'jul30.2014-aug05.2014',
 'aug06.2014-aug12.2014',
 'aug13.2014-aug19.2014',
 'aug20.2014-aug26.2014',
 'aug27.2014-sep02.2014',
 'sep03.2014-sep09.2014',
 'sep10.2014-sep16.2014',
 'sep17.2014-sep23.2014',
 'sep24.2014

In [15]:
# clean date ranges
def __generate_date_ranges(start_year: int, end_year: int) -> list[str]:
    """Build weekly date ranges that restart on 1 January of every year.

    For each year from ``start_year`` to ``end_year`` (inclusive) the year is cut
    into 7-day windows beginning on 1 January. The final window of a year is
    shortened so it ends on 31 December and never spills into the next year.

    Returns:
        Lowercase strings such as ``"jan01.2014-jan07.2014"``, in chronological
        order. Empty when ``start_year > end_year``.
    """
    stamp_format = "%b%d.%Y"
    one_week = timedelta(days=7)
    spans = []

    for yr in range(start_year, end_year + 1):
        last_day = datetime(yr, 12, 31)
        week_start = datetime(yr, 1, 1)

        while week_start <= last_day:
            # A window covers 7 days inclusive, clipped to the end of the year.
            week_end = min(week_start + timedelta(days=6), last_day)

            opening = week_start.strftime(stamp_format).lower()
            closing = week_end.strftime(stamp_format).lower()
            spans.append(f"{opening}-{closing}")

            week_start += one_week

    return spans

In [16]:
# Generate the weekly date ranges (restarting each 1 January) from 2014 to 2024
ranges = __generate_date_ranges(2014, 2024)

In [18]:
# Position of the week starting 23 Jul 2017 in the list (raises ValueError if absent).
ranges.index('jul23.2017-jul29.2017')

188

In [20]:
# Show all ranges from the week of 23 Jul 2017 onwards; 188 is the hard-coded index
# returned by the lookup in the previous cell.
ranges[188:]

['jul23.2017-jul29.2017',
 'jul30.2017-aug05.2017',
 'aug06.2017-aug12.2017',
 'aug13.2017-aug19.2017',
 'aug20.2017-aug26.2017',
 'aug27.2017-sep02.2017',
 'sep03.2017-sep09.2017',
 'sep10.2017-sep16.2017',
 'sep17.2017-sep23.2017',
 'sep24.2017-sep30.2017',
 'oct01.2017-oct07.2017',
 'oct08.2017-oct14.2017',
 'oct15.2017-oct21.2017',
 'oct22.2017-oct28.2017',
 'oct29.2017-nov04.2017',
 'nov05.2017-nov11.2017',
 'nov12.2017-nov18.2017',
 'nov19.2017-nov25.2017',
 'nov26.2017-dec02.2017',
 'dec03.2017-dec09.2017',
 'dec10.2017-dec16.2017',
 'dec17.2017-dec23.2017',
 'dec24.2017-dec30.2017',
 'dec31.2017-dec31.2017',
 'jan01.2018-jan07.2018',
 'jan08.2018-jan14.2018',
 'jan15.2018-jan21.2018',
 'jan22.2018-jan28.2018',
 'jan29.2018-feb04.2018',
 'feb05.2018-feb11.2018',
 'feb12.2018-feb18.2018',
 'feb19.2018-feb25.2018',
 'feb26.2018-mar04.2018',
 'mar05.2018-mar11.2018',
 'mar12.2018-mar18.2018',
 'mar19.2018-mar25.2018',
 'mar26.2018-apr01.2018',
 'apr02.2018-apr08.2018',
 'apr09.2018