In [None]:
import eikon as ek  # the Eikon Python wrapper package
import numpy as np  # NumPy
import pandas as pd  # pandas
import cufflinks as cf  # Cufflinks
import configparser as cp
import pickle
import re

import nltk, bs4  # NLP toolkit & BeautifulSoup
from bs4 import BeautifulSoup  # HTML parsing
from nltk import word_tokenize  # tokenizing
from nltk.corpus import stopwords

from nltk.sentiment.vader import SentimentIntensityAnalyzer  # sentiment analysis

# NLTK data sets requested by this notebook, fetched one after another.
nltk_resources = (
    'stopwords',      # stopword lists
    'punkt',          # downloads package if required, for tokenizing
    'vader_lexicon',  # for sentiment
)
download_results = [nltk.download(resource) for resource in nltk_resources]
download_results[-1]  # cell output: result of the last download call

In [None]:
# Set up Eikon
#
# The application key is read from the [eikon] section (option `app_id`)
# of a local config file, so the key itself never appears in the notebook.

cfg = cp.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed; without this check a missing file only
# surfaces later as an unhelpful KeyError on cfg['eikon'].
if not cfg.read('eikon.cfg'):  # adjust for different file location
    raise FileNotFoundError(
        "Could not read 'eikon.cfg'; it must provide an [eikon] section "
        "with an 'app_id' entry."
    )
ek.set_app_key(cfg['eikon']['app_id'])

In [None]:
# Download headlines
#
# The query is written as adjacent string literals (one clause per line);
# they concatenate into a single query string that is passed to Eikon as is.

query = (
    '( Topic:NEWS1 OR Topic:TOPNWS ) '
    'AND (Topic:FRX OR Topic:CEN OR Topic:ECI OR Topic:INT) '
    'AND (Topic:EZC OR Topic:EZ OR Topic:GB) '
    'AND LEN NOT ( Topic:SPO)'
)

# Time window and maximum number of headlines requested.
headline_request = {
    'date_from': '2019-08-19T00:00:00',
    'date_to': '2019-08-20T20:00:00',
    'count': 100,
}
news = ek.get_news_headlines(query, **headline_request)
news.head()

In [None]:
%%time

# Download and store news content
#
# If a cache file 'eikon_news.pkl' exists it replaces `news` entirely;
# otherwise the full text of every headline in `news` is downloaded, added as
# a 'story' column, and the frame is written to the cache.
#
# NOTE: the cache is looked up by file name only. After changing the query or
# the date range above, delete 'eikon_news.pkl' or stale data will be loaded.
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load a cache file that you created yourself.
try:
    with open('eikon_news.pkl', 'rb') as cache_file:
        news = pickle.load(cache_file)
except (OSError, pickle.UnpicklingError, AttributeError, EOFError,
        ImportError, IndexError):
    # Cache missing or unreadable -> rebuild it. Only the errors that opening
    # and unpickling are documented to raise are caught, so KeyboardInterrupt
    # and unrelated bugs are no longer swallowed.
    stories = []
    for storyId in news['storyId']:
        try:
            html = ek.get_news_story(storyId)
            # Strip the HTML markup and keep only the text.
            story = BeautifulSoup(html, 'html5lib').get_text(strip=True)
        except Exception as err:
            # Best effort: one failing story must not abort the whole
            # download, but report it instead of hiding it. The empty string
            # keeps `stories` aligned with the rows of `news`.
            print('Could not fetch story {}: {!r}'.format(storyId, err))
            story = ''
        stories.append(story)
    news['story'] = stories
    with open('eikon_news.pkl', 'wb') as cache_file:
        pickle.dump(news, cache_file)

In [None]:
# Remove top placeholder from the news
#
# The pattern only matches at the very start of a story: a literal '.',
# then as few characters as possible up to the first '{', then as few as
# possible up to the next '}'. '.' does not match newlines here, so the
# placeholder must sit on the first line. Stories without it are unchanged.
# A raw string is used so that '\.' reaches the regex engine without Python
# first treating it as an (invalid) string escape sequence.
news['story'] = news['story'].apply(lambda x: re.sub(r"^\..*?{.*?}", "", x))

In [None]:
# Preview the first 100 characters of the first story.
# .iloc selects by position, so this works whatever the index of `news` is
# (a plain [0] is a label lookup and only falls back to position through a
# deprecated pandas behaviour).
news['story'].iloc[0][:100]

In [None]:
# measure sentiment
#
# Scores only the first story; the result is shown as the cell output.
sid = SentimentIntensityAnalyzer()
# .iloc[0] picks the first row by position, independent of the index labels
# of `news` (a plain [0] is a label lookup with a deprecated positional
# fallback).
scores = sid.polarity_scores(news['story'].iloc[0])
scores

In [None]:
# Glue all stories into one string, separated by ';', and tokenize it in a
# single pass.
story_texts = news['story'].values
all_news = ";".join(story_texts)
all_tokens = word_tokenize(all_news)

In [None]:
# Remove non-words
#
# Keep only purely alphabetic tokens (drops numbers, punctuation and mixed
# tokens) and lower-case them. The comprehension already builds a new list,
# so no additional copy is needed.
clean_tokens = [token.lower() for token in all_tokens if token.isalpha()]

In [None]:
# Remove stopwords
#
# English stopwords plus 'reuters' are filtered out of the token list.
stop = stopwords.words('english') + ['reuters']
# Membership tests against a list scan it for every token; a set gives the
# same result with constant-time lookups. `stop` itself stays a list.
stop_lookup = set(stop)
clean_tokens = [token for token in clean_tokens if token not in stop_lookup]

In [None]:
# Count how often each cleaned token occurs, then plot the first 20 samples
# of the distribution as plain (non-cumulative) counts.
freq = nltk.FreqDist(
    clean_tokens
)
freq.plot(
    20,
    cumulative=False,
)

In [None]:
# Wrap the raw (uncleaned) tokens in an nltk Text object and print the
# concordance for the word "full".
text = nltk.Text(
    all_tokens
)
text.concordance(
    "full"
)