## Economic news

This code scrapes the `Forexfactory.com` economic news calendar, week by week, between a given date and today.

The resulting dataframe is then saved in a CSV file.

In [2]:
from bs4 import BeautifulSoup
import requests
import datetime
import logging
import csv
import pandas as pd
import math

from datetime import datetime, timedelta
import calendar

In [3]:
def getWeeklyNews(Sdate):
    '''
    Fetch one week of the Forexfactory economic calendar and return it as a dataframe.

    Any datetime can be given; the request is made for the week identified by the
    Monday on or before ``Sdate``.

    Parameters:
        Sdate: a ``datetime`` (it must provide ``.date()`` and ``.weekday()``).

    Returns:
        A ``pandas.DataFrame`` with the columns
        'Datetime', 'Currency', 'Event', 'Actual', 'Forecast', 'Previous', 'Impact'.
        'Datetime' holds ``str(datetime)`` values. The dataframe is empty (same columns)
        when the HTTP status is not 200 or when no calendar table is found in the page.

    Rows whose cells are missing or whose date/time cannot be parsed are skipped.

    Code modified from https://gist.github.com/pohzipohzi/ad7942fc5545675022c1f31123e64c0c#file-forexfactory_econcal-py
     (Recursivity was removed, returning dataframe, dealing with URL directly instead of following a link)
    '''
    columns = ['Datetime','Currency','Event','Actual','Forecast','Previous','Impact']
    baseURL = "https://www.forexfactory.com/calendar.php?week="

    PreviousMonday = Sdate.date() - timedelta(days=Sdate.weekday())
    # The week is addressed as e.g. "nov4.2019": lower-case month abbreviation, day, '.', year
    URLDatePart = calendar.month_abbr[PreviousMonday.month].lower() + str(PreviousMonday.day) + '.' + str(PreviousMonday.year)

    r = requests.get(baseURL + URLDatePart)

    if r.status_code != 200:
        print("There was an error ",r.status_code," while retrieving the following URL: ",baseURL + URLDatePart)
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(r.text, "lxml")

    # get and parse table data, ignoring details and graph
    table = soup.find("table", class_="calendar__table")
    if table is None:
        # soup.find() returns None when the page has no such table; return an empty result instead of crashing
        logging.warning("No calendar table found at %s", baseURL + URLDatePart)
        return pd.DataFrame(columns=columns)

    # do not use the ".calendar__row--grey" css selector (reserved for historical data)
    trs = table.select("tr.calendar__row.calendar_row")
    fields = ["date","time","currency","impact","event","actual","forecast","previous"]

    # Rows are collected in a plain list and turned into a dataframe once at the end
    # (DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call).
    rows = []

    # Date and time cells are only filled on the first row of a day / time slot,
    # so the last seen values are carried over to the following rows.
    curr_date = ""
    curr_time = ""
    for tr in trs:

        # fields may mess up sometimes, see Tue Sep 25 2:45AM French Consumer Spending
        # It seems that errors are caused by duplicates that have some fields missing:
        # such rows are skipped.
        try:
            for field in fields:
                data = tr.select("td.calendar__cell.calendar__{}.{}".format(field,field))[0]
                text = data.text.strip()

                if field=="date" and text!="":
                    curr_date = text
                elif field=="time" and text!="":
                    # time is sometimes "All Day" or "Day X" (eg. WEF Annual Meetings)
                    if text.find("Day")!=-1:
                        curr_time = "12:00am"
                    else:
                        curr_time = text
                elif field=="currency":
                    currency = text
                elif field=="impact":
                    # when impact says "Non-Economic" on mouseover, the relevant
                    # class name is "Holiday", thus we do not use the classname
                    impact = data.find("span")["title"]
                elif field=="event":
                    event = text
                elif field=="actual":
                    actual = text
                elif field=="forecast":
                    forecast = text
                elif field=="previous":
                    previous = text

            eventDatetime = _resolveEventDatetime(curr_date, curr_time, PreviousMonday)
        except (IndexError, KeyError, TypeError, AttributeError, ValueError) as error:
            # IndexError: a cell is missing; TypeError/KeyError/AttributeError: no impact <span>/title;
            # ValueError: the date/time text could not be parsed.
            logging.debug("Skipping calendar row (%s %s): %r", curr_date, curr_time, error)
            continue

        rows.append({'Datetime':str(eventDatetime), 'Currency':currency, 'Impact':impact,'Event':event, 'Actual':actual,'Forecast':forecast,'Previous':previous})

    return pd.DataFrame(rows, columns=columns)


def _resolveEventDatetime(dateText, timeText, weekMonday):
    '''
    Combine a row's date text (e.g. "WedJan 1") and time text (e.g. "2:45am") into a datetime.

    The page does not give the year, so the year of ``weekMonday`` and its two neighbours
    are tried and the result closest to ``weekMonday`` is kept. This gives the right year
    for weeks that straddle New Year (e.g. "Jan 1" in the week of Monday Dec 30).

    Raises:
        ValueError: when the text cannot be parsed with any of the candidate years.
    '''
    candidates = []
    for year in (weekMonday.year - 1, weekMonday.year, weekMonday.year + 1):
        try:
            candidates.append(datetime.strptime(",".join([str(year),dateText,timeText]),"%Y,%a%b %d,%I:%M%p"))
        except ValueError:
            # e.g. "Feb 29" is invalid for a non-leap candidate year; try the next one
            continue
    if not candidates:
        raise ValueError("Cannot parse calendar date/time: {!r} {!r}".format(dateText, timeText))
    return min(candidates, key=lambda candidate: abs(candidate.date() - weekMonday))

In [4]:
def getAllNews(startDate,endDate):
    '''
    Get all weekly news between two dates.

    One call to getWeeklyNews() is made for every week from the week containing
    ``startDate`` to the week containing ``endDate`` (both included). An "X" is
    printed for each week as a progress indicator, followed by the total row count.

    Parameters:
        startDate: ``datetime`` inside the first week to retrieve.
        endDate: ``datetime`` inside the last week to retrieve.

    Returns:
        A ``pandas.DataFrame`` with all the weekly news concatenated and re-indexed.
        When no week returned any row, an empty dataframe with the columns
        'Datetime', 'Currency', 'Event', 'Actual', 'Forecast', 'Previous', 'Impact'.
    '''
    # Count weeks Monday-to-Monday: counting from the raw dates misses the last week
    # whenever endDate falls in a later week but less than 7 days after the last step
    # (e.g. Friday -> next Monday is 3 days apart but spans two weeks).
    startMonday = startDate.date() - timedelta(days=startDate.weekday())
    endMonday = endDate.date() - timedelta(days=endDate.weekday())
    NWeeks = (endMonday - startMonday).days // 7 + 1

    # Collect the weekly frames and concatenate once
    # (DataFrame.append was removed in pandas 2.0 and copied the frame on every call).
    weeklyFrames = []
    for week in range(NWeeks):
        print("X", end="", flush=True)
        weekly = getWeeklyNews(startDate + timedelta(days=(week*7)))
        if not weekly.empty:
            weeklyFrames.append(weekly)

    if weeklyFrames:
        Ndf = pd.concat(weeklyFrames, ignore_index=True)
    else:
        # pd.concat() raises ValueError on an empty list
        Ndf = pd.DataFrame(columns = ['Datetime','Currency','Event','Actual','Forecast','Previous','Impact'])

    print("\n", Ndf.shape[0], "news are in the dataframe")
    return Ndf

In [5]:
# Scrape window: from 1 November 2019 at noon up to the moment this cell is run.
start_date = datetime(year=2019, month=11, day=1, hour=12)
today = datetime.now()

In [6]:
# Download the calendar week by week from start_date to today;
# getAllNews() prints one "X" per week and then the number of rows collected.
N = getAllNews(start_date, today)

XXXXXXXXXXXXXXXXXXXXXX
 1999 news are in the dataframe


In [8]:
# Save the collected news to a CSV file in the current working directory.
# No `index` argument is passed, so pandas' default applies (the row index is written as the first column).
N.to_csv("./EconomicalNews.csv")