In [5]:
import nltk
import json
from nltk.corpus import reuters


# Collection settings, named here instead of being buried in the loop below.
CATEGORY = 'money-fx'      # Reuters category an article must carry to be kept
MAX_ARTICLES = 200         # stop once this many articles have been accepted
MIN_CONTENT_CHARS = 16     # bodies of 15 characters or fewer are discarded

# Make sure the Reuters corpus is available locally before reading it.
nltk.download('reuters')

fileids = reuters.fileids()

articles = []

for fileid in fileids:
    if CATEGORY not in reuters.categories(fileid):
        continue

    # Everything up to the first newline is treated as the headline and the
    # remainder of the document as the body.
    first_line, _newline, remainder = reuters.raw(fileid).partition('\n')

    # Only documents whose first line is entirely upper-case are accepted.
    # NOTE(review): a headline that wraps onto a second line keeps only its
    # first line as the title; the continuation ends up at the start of content.
    if not first_line.isupper():
        continue

    title = first_line.strip()
    content = remainder.strip()

    if len(content) < MIN_CONTENT_CHARS:  # if article is too short, ignore
        continue

    articles.append({
        'title': title,
        'content': content,
        'date': None,    # not set at collection time
        'author': None,  # not set at collection time
    })

    if len(articles) >= MAX_ARTICLES:
        break

# Same value the in-loop counter used to hold; kept for any cell that reads it.
articles_collected = len(articles)

print("Total business articles collected:", len(articles))

# Where the collected articles are stored (relative to the notebook's working directory).
output_file = '../data/json/reuters.json'

# Pretty-printed, with non-ASCII characters written as-is rather than \u-escaped.
with open(output_file, mode='w', encoding='utf-8') as json_out:
    json.dump(articles, json_out, indent=4, ensure_ascii=False)

print(f"Articles saved to {output_file}")


Total business articles collected: 200
Articles saved to ../data/json/reuters.json


[nltk_data] Downloading package reuters to
[nltk_data]     /Users/winlaeyee/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [6]:
import json

# Read the article list back from the JSON file at output_file.
with open(output_file, encoding='utf-8') as json_file:
    articles = json.loads(json_file.read())

In [7]:
import pandas as pd

# Accumulates headline -> date across every CSV passed to add_date.
title_with_date = {}

def add_date(csv_file):
    """Record dates for collected articles whose headline appears in ``csv_file``.

    Reads the CSV, which must provide ``title`` and ``date`` columns. For every
    entry of the module-level ``articles`` list whose title has no date yet,
    the date of the first CSV row with an identical title is stored in the
    module-level ``title_with_date`` dict. Titles already present in
    ``title_with_date`` are left untouched, so the first file that contains a
    headline wins.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of the CSV to scan.

    Returns:
        None. ``title_with_date`` is updated in place.
    """
    df = pd.read_csv(csv_file)

    # A file without data rows cannot contribute any date.
    if df.empty:
        return

    # Index the file once (the first row wins for duplicate titles) instead of
    # re-scanning every row with iterrows() for each article.
    first_date_by_title = {}
    for csv_title, csv_date in zip(df['title'], df['date']):
        first_date_by_title.setdefault(csv_title, csv_date)

    for article in articles:
        title = article['title']
        if title not in title_with_date and title in first_date_by_title:
            title_with_date[title] = first_date_by_title[title]
    

# CSV files searched for headline/date pairs, in lookup order.
csv_files = [
    '../data/reuters/ModApte_test.csv',
    '../data/reuters/ModHayes_test.csv',
    '../data/reuters/ModApte_train.csv',
    '../data/reuters/ModHayes_train.csv',
    '../data/reuters/ModApte_unused.csv',
    '../data/reuters/ModLewis_test.csv',
    '../data/reuters/ModLewis_train.csv',
    '../data/reuters/ModLewis_unused.csv',
]

# Order matters: add_date never overwrites a title that already has a date,
# so the earliest file containing a headline supplies its date.
for csv_file in csv_files:
    add_date(csv_file)


In [8]:
# Dump the whole headline -> date mapping that add_date filled in.
print(title_with_date)

{'BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER': ' 8-APR-1987 04:03:27.14', 'U.K. MONEY MARKET DEFICIT FORECAST AT 250 MLN STG': ' 8-APR-1987 05:06:23.11', 'ECONOMIC SPOTLIGHT - AUSTRALIAN MARKETS BOOMING': ' 8-APR-1987 06:44:11.87', 'BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING': ' 8-APR-1987 07:18:02.86', 'U.K. MONEY MARKET GIVEN 53 MLN STG ASSISTANCE': ' 8-APR-1987 07:27:55.76', 'POEHL WARNS AGAINST FURTHER DOLLAR FALL': ' 8-APR-1987 08:14:52.55', 'U.K. MONEY MARKET GIVEN FURTHER 166 MLN STG HELP': ' 8-APR-1987 09:19:05.97', 'STOLTENBERG SEES MOVES TO STRENGTHEN PARIS ACCORD': ' 8-APR-1987 09:52:12.98', 'TOP OFFICIALS ARRIVE AT TREASURY FOR G-5 TALKS': ' 8-APR-1987 11:12:13.98', 'GROUP OF FIVE MEETING ENDS': ' 8-APR-1987 14:33:37.00', 'INDUSTRIAL NATIONS RECONVENE FOR TALKS': ' 8-APR-1987 15:21:56.07', 'BUNDESBANK CALLS FOR CENTRAL BANK COOPERATION': ' 8-APR-1987 18:00:34.46', 'G-7 ISSUES STATEMENT AFTER MEETING': ' 9-APR-1987 02:18:23.02', 'BANK OF FRANCE TO HOLD MONEY MAR

In [9]:
# Fill in 'date' on each article whose headline was matched in the CSV files.
# The dicts are updated in place, so `articles` sees the same dates.
for article in articles:
    headline = article['title']
    if headline in title_with_date:
        article['date'] = title_with_date[headline]

# New list, same dict objects, same order as `articles`.
articles_with_dates = list(articles)


In [10]:
# Display the full article list (bare last expression -> notebook output).
articles_with_dates

[{'title': 'BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER',
  'content': "The Bundesbank accepted bids for 6.1\n  billion marks at today's tender for a 28-day securities\n  repurchase pact at a fixed rate of 3.80 pct, a central bank\n  spokesman said.\n      Banks, which bid for a total 12.2 billion marks liquidity,\n  will be credited with the funds allocated today and must buy\n  back securities pledged on May 6.\n      Some 14.9 billion marks will drain from the market today as\n  an earlier pact expires, so the Bundesbank is effectively\n  withdrawing a net 8.1 billion marks from the market with\n  today's allocation.\n      A Bundesbank spokesman said in answer to enquiries that the\n  withdrawal of funds did not reflect a tightening of credit\n  policy, but was to be seen in the context of plentiful\n  liquidity in the banking system.\n      Banks held an average 59.3 billion marks at the Bundesbank\n  over the first six days of the month, well clear of the likely\n  April min

In [11]:
# Number of entries in articles_with_dates, shown as the cell output.
len(articles_with_dates)

200

In [12]:
# Show every article that still has an empty/None 'date'.
undated_articles = [entry for entry in articles_with_dates if not entry['date']]
for article in undated_articles:
    print(article)

{'title': 'BUNDESBANK SEES NO REASON TO CHANGE MONETARY COURSE -', 'content': 'VICE-PRESIDENT SCHLESINGER\n\n   BUNDESBANK SEES NO REASON TO CHANGE MONETARY COURSE -\n  VICE-PRESIDENT SCHLESINGER', 'date': None, 'author': None}
{'title': "FED'S SEGER SAYS FOREIGN EXCHANGE MARKETS SHOWING SIGNS", 'content': "OF STABILITY\n\n  FED'S SEGER SAYS FOREIGN EXCHANGE MARKETS SHOWING SIGNS\n  OF STABILITY", 'date': None, 'author': None}
{'title': 'FED SAYS IT SETS TWO BILLION DLRS OF CUSTOMER REPURCHASE', 'content': 'AGREEMENTS\n\n  FED SAYS IT SETS TWO BILLION DLRS OF CUSTOMER REPURCHASE\n  AGREEMENTS', 'date': None, 'author': None}
{'title': 'NORWAY CENTRAL BANK SELLS CROWNS TO EASE UPWARD PRESSURE', 'content': 'ON CURRENCY - DEALERS\n\n   NORWAY CENTRAL BANK SELLS CROWNS TO EASE UPWARD PRESSURE\n  ON CURRENCY - DEALERS', 'date': None, 'author': None}


In [13]:
# Overwrite the JSON file with the current contents of `articles`.
# `articles` (not articles_with_dates) is written; the article dicts were
# updated in place, so the dates filled in earlier are included.
with open(output_file, mode='w', encoding='utf-8') as json_out:
    json.dump(articles, json_out, indent=4, ensure_ascii=False)