In [None]:
import eikon as ek  # the Eikon Python wrapper package
import numpy as np  # NumPy
import pandas as pd  # pandas
import cufflinks as cf  # Cufflinks
import configparser as cp
import pickle
import re
from datetime import date, timedelta, datetime

import nltk, bs4  # NLP toolkit & BeautifulSoup
from bs4 import BeautifulSoup  # HTML parsing
from nltk import word_tokenize  # tokenizing
from nltk.corpus import stopwords

from nltk.sentiment.vader import SentimentIntensityAnalyzer  # sentiment analysis

# Fetch the NLTK data packages that the stop-word, tokenizing and sentiment
# steps of this notebook rely on.
nltk.download('stopwords')  # word lists read through nltk.corpus.stopwords
nltk.download('punkt')  # downloads package if required, for tokenizing
nltk.download('vader_lexicon')  # lexicon for the VADER sentiment analyzer

In [None]:
import os
# Work from the project folder so that the relative paths used further down
# ('eikon.cfg', 'eikon_news.pkl') resolve against it.
# The guard keeps the cell re-runnable: an unconditional chdir would look for
# news_nlp/news_nlp on a second execution and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != "news_nlp":
    os.chdir("news_nlp")
os.getcwd()  # last expression, so the notebook displays the resulting directory

In [None]:
# Set up Eikon
#
# The application key is read from the [eikon] section of a local config file,
# so the key itself never has to appear in the notebook.

cfg = cp.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list therefore means the config file
# was not found. Fail here with a clear message instead of a KeyError below.
if not cfg.read('eikon.cfg'):  # adjust for different file location
    raise FileNotFoundError("Could not read 'eikon.cfg'; check the working directory")
ek.set_app_key(cfg['eikon']['app_id'])

In [None]:
%%time

# Download last 3 months
n_days = 90
# Midnight at the start of the day n_days before today.
start_date = datetime.combine(date.today() - timedelta(days = n_days), datetime.min.time())

query = '( Topic:NEWS1 OR Topic:TOPNWS ) AND (Topic:FRX OR Topic:CEN OR Topic:ECI OR Topic:INT) AND (Topic:EZC OR Topic:EZ OR Topic:GB) AND LEN NOT ( Topic:SPO)'

# Each day is queried in three consecutive 8-hour windows (00-08, 08-16,
# 16-24), one request per window, in chronological order.
# NOTE(review): every request passes count = 100, so a window with more
# matching headlines than that is presumably truncated - confirm.
window = timedelta(hours = 8)
windows_per_day = 3
as_timestamp = "{:%Y-%m-%dT%H:%M}:00".format  # timestamp format passed as date_from / date_to

headlines = []
for k in range(windows_per_day * n_days):
    window_start = start_date + k * window
    headlines.append(ek.get_news_headlines(query,
                                           date_from = as_timestamp(window_start),
                                           date_to = as_timestamp(window_start + window),
                                           count = 100))


In [None]:
# Stack the per-window results into a single DataFrame.
# The name `headlines` is rebound from a list to a DataFrame here, so guard
# against re-running the cell: pd.concat() on a DataFrame raises TypeError.
if not isinstance(headlines, pd.DataFrame):
    headlines = pd.concat(headlines)

In [None]:
%%time

# Download and store news content
#
# The result is cached in 'eikon_news.pkl'. When the cache can be loaded it is
# used as-is; otherwise every story is downloaded and the cache is (re)written.
news = None
try:
    # NOTE: pickle.load can execute arbitrary code - only load a cache file
    # that this notebook wrote itself.
    with open('eikon_news.pkl', 'rb') as cache_file:
        news = pickle.load(cache_file)
except Exception:
    # Missing or unreadable cache: fall through and rebuild it. Catching
    # Exception (not a bare except) lets KeyboardInterrupt still propagate.
    news = None

if news is None:
    stories = []
    for storyId in headlines['storyId']:
        try:
            html = ek.get_news_story(storyId)
            # Strip the HTML markup and keep only the text of the story.
            stories.append(BeautifulSoup(html, 'html5lib').get_text(strip = True))
        except Exception:
            # Best effort: an empty story keeps the list aligned with the
            # headline rows when a single download or parse fails.
            stories.append('')
    # `news` was never created on this path before (NameError on a cold cache).
    # Build it from the headlines; copy so `headlines` itself is not modified.
    news = headlines.copy()
    news['story'] = stories
    with open('eikon_news.pkl', 'wb') as cache_file:
        pickle.dump(news, cache_file)

In [None]:
# Remove top placeholder from the news
# The pattern matches a story that starts with "." followed (on the same line,
# since "." does not match newlines here) by the shortest text up to a "{...}"
# group, and deletes that prefix.
# Written as a raw string: "\." in a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions. The regular
# expression itself is unchanged.
news['story'] = news['story'].apply(lambda x: re.sub(r"^\..*?{.*?}", "", x))

In [None]:
# Display the news table to inspect the stories after the placeholder removal.
news

In [None]:
# measure sentiment
# Score the first story with the VADER analyzer as a quick check and show the
# resulting scores as the cell output.
sid = SentimentIntensityAnalyzer()
# .iloc[0] selects the first row by position. news['story'][0] is a label
# lookup: it only falls back to position (deprecated in pandas) when the index
# is not integer-based, and returns several rows if label 0 is duplicated.
scores = sid.polarity_scores(news['story'].iloc[0])
scores

In [None]:
# Glue every story into one long string (";" between stories) and split that
# string into word/punctuation tokens.
story_texts = news['story'].tolist()
all_news = ";".join(story_texts)
all_tokens = word_tokenize(all_news)

In [None]:
# Remove non-words
# Keep only purely alphabetic tokens (this drops punctuation and numbers) and
# lower-case the ones that remain.
clean_tokens = list(map(str.lower, filter(str.isalpha, all_tokens)))

In [None]:
# Remove stopwords
# English stop words plus two corpus-specific words that are filtered as well.
# `stop` stays a list because it is later passed to TfidfVectorizer(stop_words=...).
stop = stopwords.words('english') + ['reuters', 'story']
# Membership tests against a list are linear in its length; a set makes the
# filter below a constant-time lookup per token with the same result.
stop_set = frozenset(stop)
clean_tokens = [x for x in clean_tokens if x not in stop_set]

In [None]:
# Count how often each cleaned token occurs and plot the 20 most frequent
# ones as plain (non-cumulative) counts.
freq = nltk.FreqDist(clean_tokens)
freq.plot(20, cumulative = False)

In [None]:
# Wrap the unfiltered token list (all_tokens, not clean_tokens) in an
# nltk.Text and print the contexts in which the word "full" appears.
text = nltk.Text(all_tokens)
text.concordance("full")

In [None]:
from collections import Counter

In [None]:
# Build every pair of adjacent tokens as a "first second" string.
gram2 = [" ".join(pair) for pair in nltk.ngrams(text, 2)]
# (bigram, count) tuples, most frequent first.
gram2_counts = Counter(gram2).most_common()
# Keep bigrams that occur more than once and consist of letters only
# (the separating space is ignored for that check).
gram2_clean = [
    (bigram, n_seen)
    for bigram, n_seen in gram2_counts
    if n_seen > 1 and bigram.replace(" ", "").isalpha()
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer over 1- to 3-word n-grams, ignoring the custom stop-word
# list `stop` built above.
vec = TfidfVectorizer(stop_words=stop, ngram_range = (1,3))

In [None]:
# Learn the vocabulary from all stories and turn them into a document-term
# matrix (one row per story).
X = vec.fit_transform(news['story'].values)

In [None]:
# Document-by-document similarity: entry (i, j) is the dot product of the
# TF-IDF rows of story i and story j.
# NOTE(review): with the vectorizer's default row normalisation this should be
# the cosine similarity - confirm against the scikit-learn documentation.
# "@" is always a matrix product, whereas "*" is element-wise for SciPy's
# sparse *array* types, so "@" is the unambiguous spelling.
pairwise_similarity = X @ X.T

In [None]:
# Convert the similarity matrix to a dense NumPy array so it can be edited in
# place by the following cells.
arr = pairwise_similarity.toarray()

In [None]:
# Eliminate diagonals and low similarity
# The diagonal holds each story's similarity with itself, which is not of
# interest; every score below 0.9 is zeroed as well. Both edits modify `arr`
# in place.
np.fill_diagonal(arr, 0)
arr[arr < 0.9] = 0

In [None]:
# Row/column indices of every remaining pair of highly similar stories.
# ">=" matches the cut-off applied above (only values < 0.9 were zeroed), so a
# pair scoring exactly 0.9 is no longer dropped here.
r, c = np.where(arr >= 0.9)

In [None]:
# Show the raw row and column index arrays of the similar pairs.
[r, c]

In [None]:
# Tabulate the similar pairs: positions of both stories and their score.
# The similarity matrix is symmetric, so each pair shows up twice (i, j and j, i).
pd.DataFrame({"Doc1": r, "Doc2": c, "Similarity":arr[r,c]})

In [None]:
# Dense TF-IDF table: one row per story, one column per n-gram feature.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, "get_feature_names_out")
    else vec.get_feature_names()
)
# toarray() yields a plain ndarray instead of the legacy np.matrix that
# todense() returns; the resulting DataFrame is the same.
results = pd.DataFrame(X.toarray(), columns = feature_names)

In [None]:
# For every story, list its five highest-weighted n-gram features.
[
    results.iloc[doc_idx].sort_values(ascending = False).head(5)
    for doc_idx in range(results.shape[0])
]

In [None]:
# Number of rows in the TF-IDF table, i.e. the number of stories vectorized.
len(results)