In [None]:
import eikon as ek  # the Eikon Python wrapper package
import numpy as np  # NumPy
import pandas as pd  # pandas
import cufflinks as cf  # Cufflinks
import configparser as cp
import pickle
import re
from datetime import date, timedelta, datetime

import nltk, bs4  # NLP toolkit & BeautifulSoup
from bs4 import BeautifulSoup  # HTML parsing
from nltk import word_tokenize  # tokenizing
from nltk.corpus import stopwords

from nltk.sentiment.vader import SentimentIntensityAnalyzer  # sentiment analysis

# Fetch the NLTK resources used by the cells below
nltk.download('stopwords')  # stop-word lists used when cleaning the stories
nltk.download('punkt')  # downloads package if required, for tokenizing
nltk.download('vader_lexicon')  # lexicon for the VADER sentiment analyzer

In [None]:
import os
# Show the working directory: the relative paths used in this notebook
# ('eikon.cfg' and 'news_archive/') are resolved against it.
os.getcwd()

In [None]:
# Set up Eikon: the application key is read from a local config file so that
# the credential never appears in the notebook itself.
#
# Layout expected by the lookup below:
#   [eikon]
#   app_id = <your Eikon app key>

cfg = cp.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail with a clear message here instead of an
# opaque KeyError on the lookup below.
if not cfg.read('eikon.cfg'):  # adjust for different file location
    raise FileNotFoundError("Could not read 'eikon.cfg' from the current working directory")
ek.set_app_key(cfg['eikon']['app_id'])

In [None]:
def download_news(start_date, end_date, category = "macro"):
    """Download news for every day from start_date to end_date (inclusive).

    Each day is cut into six consecutive 4-hour windows starting at midnight,
    and every window is fetched through _download_news_with_datetime with
    count=100.

    Parameters
    ----------
    start_date, end_date : datetime.date
        First and last day to download.
    category : str
        Passed through to _download_news_with_datetime.

    Returns
    -------
    pandas.DataFrame
        The per-window frames concatenated in window order.
    """
    window = timedelta(hours = 4)
    first_midnight = datetime.combine(start_date, datetime.min.time())
    day_count = (end_date - start_date).days + 1

    window_starts = [
        first_midnight + timedelta(days = day, hours = hour)
        for day in range(day_count)
        for hour in range(0, 24, 4)
    ]
    frames = [
        _download_news_with_datetime(begin, begin + window, 100, category)
        for begin in window_starts
    ]

    return pd.concat(frames)

def _download_news_with_datetime(start_datetime, end_datetime, count = 100, category = "macro"):
    """Return headlines plus story text for one time window, using a pickle cache.

    The result is looked up in ``news_archive/`` first. On a cache miss the
    headlines are requested from Eikon, the story behind every ``storyId`` is
    downloaded and reduced to plain text, and the frame is cached if the window
    lies entirely in the past.

    Parameters
    ----------
    start_datetime, end_datetime : datetime.datetime
        Bounds of the window, forwarded as ``date_from`` / ``date_to``.
    count : int
        Forwarded as ``count`` to ``ek.get_news_headlines``.
    category : str
        Only ``"macro"`` is supported.

    Returns
    -------
    pandas.DataFrame
        The headline frame with an added ``story`` column; a story that could
        not be downloaded or parsed is stored as ``''``.

    Raises
    ------
    ValueError
        If ``category`` is not a supported category.
    """
    if category == "macro":
        query = '( Topic:NEWS1 OR Topic:TOPNWS ) AND (Topic:FRX OR Topic:CEN OR Topic:ECI OR Topic:INT) AND (Topic:EZC OR Topic:EZ OR Topic:GB) AND LEN NOT ( Topic:SPO)'
    else:
        raise ValueError("Invalid category: {!r}".format(category))

    news_pickle = "news_archive/eikon_{}_{:%Y%m%d%H%M}_{:%Y%m%d%H%M}.pkl".format(category, start_datetime, end_datetime)
    try:
        # NOTE: unpickling can execute arbitrary code; news_archive/ must only
        # contain files written by this notebook.
        with open(news_pickle, 'rb') as cache_file:
            return pickle.load(cache_file)
    except Exception:
        # Missing or unreadable cache entry: fall through and download.
        # (Exception, not a bare except, so Ctrl-C still interrupts.)
        pass

    news = ek.get_news_headlines(query,
                                 date_from="{:%Y-%m-%dT%H:%M}:00".format(start_datetime),
                                 date_to ="{:%Y-%m-%dT%H:%M}:00".format(end_datetime),
                                 count = count)
    print("Downloading stories for {} news articles from {:%Y-%m-%dT%H:%M} to {:%Y-%m-%dT%H:%M}".format(len(news), start_datetime, end_datetime))

    stories = []
    for storyId in news['storyId']:
        try:
            html = ek.get_news_story(storyId)
            story = BeautifulSoup(html, 'html5lib').get_text(strip = True)
        except Exception:
            # Best effort: keep the headline row, with an empty story
            story = ''
        stories.append(story)

    # Clean up extra text: strip a leading ".<...>{<...>}" fragment
    # (presumably inline CSS left over from the HTML - TODO confirm)
    news['story'] = [re.sub(r"^\..*?{.*?}", "", x) for x in stories]

    # Only cache windows that are already over; an open window can still
    # receive more news.
    if end_datetime < datetime.now():
        os.makedirs(os.path.dirname(news_pickle), exist_ok = True)
        with open(news_pickle, 'wb') as cache_file:
            pickle.dump(news, cache_file)

    return news

In [None]:
%%time

# Look-back horizon in days; the download covers [today - n_days, today]
n_days = 90
end_date = date.today()
start_date = end_date - timedelta(days = n_days)

# Windows already present in the pickle cache are loaded rather than re-downloaded
news = download_news(start_date, end_date)

In [None]:
# Remove news with same storyId (only the first occurrence is kept).
# reset_index() is called without drop=True, so the previous index is kept as
# a column - the later today/history split reads it as news['index'] - and
# the frame gets a fresh 0..n-1 index.
news = news.drop_duplicates('storyId').reset_index()

In [None]:
def clean_text(text, stop = None):
    """Normalise a story: lower-case alphabetic tokens with stop words removed.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop : iterable of str, optional
        Words to drop; compared against the lower-cased tokens. Defaults to
        the NLTK English stop words plus 'reuters', 'click', 'full', 'story'.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    if stop is None:
        stop = stopwords.words('english') + ['reuters', 'click', 'full', 'story']
    # Set membership is O(1); scanning the list for every token is not
    stop_words = frozenset(stop)

    # Tokens that are not purely alphabetic (numbers, punctuation) are dropped
    tokens = (token.lower() for token in word_tokenize(text) if token.isalpha())

    return " ".join(token for token in tokens if token not in stop_words)

In [None]:
# Clean news: apply clean_text (default stop words) to every story
news['storyClean'] = news['story'].map(clean_text)

# Remove very similar articles (repeats of a story in a slightly different form)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Stop words must be defined here: this is the first cell that uses `stop`,
# so relying on a later cell to define it fails on Restart & Run All.
stop = stopwords.words('english') + ['reuters', 'click', 'full', 'story']

# TF-IDF over 1- to 3-grams; terms appearing in fewer than 5 documents are ignored
vec = TfidfVectorizer(stop_words=stop, ngram_range = (1,3), min_df = 5)

In [None]:
# Fit the vectorizer on every article and build the document-term TF-IDF matrix.
# NOTE(review): this uses the raw 'story' text, whereas the later refit uses
# 'storyClean' - confirm that is intended.
X = vec.fit_transform(news['story'].values)

In [None]:
# Document-by-document similarity: product of the TF-IDF matrix with its transpose
# (a cosine similarity, assuming the vectorizer's default row normalisation - TODO confirm)
pairwise_similarity = X * X.T
# Convert to a dense array so NumPy indexing can be used in the next cell
arr = pairwise_similarity.toarray()

In [None]:
# Two articles are treated as repeats when their similarity exceeds this value
SIMILARITY_THRESHOLD = 0.98

# Eliminate diagonals (self-similarity) and low similarity
np.fill_diagonal(arr, 0)
arr[arr <= SIMILARITY_THRESHOLD] = 0

# Article pairs where the match is more than SIMILARITY_THRESHOLD
r, c = np.where(arr > SIMILARITY_THRESHOLD)

# Identify positions to delete: from every matching pair keep the article
# with the smaller position and delete the one with the larger position
to_delete = np.unique(np.maximum(r, c))

# Rows/columns of arr are positions in `news`, so translate them to index
# labels before dropping (drop() works on labels, not positions)
news = news.drop(news.index[to_delete])

In [None]:
# Split the articles at midnight three days ago: "today" is the most recent
# three days (plus the current day so far), "history" is everything before.
cutoff = datetime.combine(date.today(), datetime.min.time()) - timedelta(days = 3)

# >= on one side and < on the other, so an article stamped exactly at the
# cutoff is not lost from both frames
news_today = news[news['index'] >= cutoff]
news_history = news[news['index'] < cutoff]

In [None]:
# Refit model on the cleaned historical articles only, so that the vocabulary
# and document frequencies come from the history and not from the recent news
vec.fit(news_history['storyClean'].values)

In [None]:
# Score the recent articles with the history-fitted vectorizer.
# Note: this rebinds X, which previously held the matrix for all articles.
X = vec.transform(news_today['storyClean'].values)

In [None]:
# Rank vocabulary terms by their mean TF-IDF weight over the recent articles.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
feature_names = vec.get_feature_names_out() if hasattr(vec, "get_feature_names_out") else vec.get_feature_names()
pd.DataFrame({"word": feature_names, "score": np.asarray(X.mean(axis = 0)).ravel()}).sort_values(by = "score", ascending = False)

In [None]:
# For concordance: one nltk.Text built from all recent stories
stop = stopwords.words('english') + ['reuters', 'click', 'full', 'story']

curr_news = ";".join(news_today['story'].values)
# Lower-cased alphabetic tokens, with the stop words removed
curr_tokens = [
    token.lower()
    for token in word_tokenize(curr_news)
    if token.isalpha() and token.lower() not in stop
]

text = nltk.Text(curr_tokens)

In [None]:
# Show the word "government" in context within the recent, cleaned stories
text.concordance("government")

# Misc analysis

In [None]:
# measure sentiment of a single example story with VADER
sid = SentimentIntensityAnalyzer()
# news['story'][0] selects the row whose index label is 0 (not the first position)
scores = sid.polarity_scores(news['story'][0])
scores

In [None]:
# Join every remaining story into one string (';' as separator) and tokenize it
all_news = ";".join(news['story'].values)
all_tokens = word_tokenize(all_news)

In [None]:
# Remove non-words: keep purely alphabetic tokens, lower-cased
clean_tokens = [token.lower() for token in all_tokens if token.isalpha()]

In [None]:
# Remove stopwords (NLTK English list plus newswire boilerplate words)
stop = stopwords.words('english') + ['reuters', 'click', 'full', 'story']
clean_tokens = list(filter(lambda token: token not in stop, clean_tokens))

In [None]:
# Count how often each cleaned token occurs across all stories
freq = nltk.FreqDist(clean_tokens)
# Plot the 20 most frequent tokens (raw counts, not cumulative)
freq.plot(20, cumulative = False)