In [None]:
import eikon as ek  # the Eikon Python wrapper package
import numpy as np  # NumPy
import pandas as pd  # pandas
import cufflinks as cf  # Cufflinks
import configparser as cp
import pickle
import re
from datetime import date, timedelta, datetime

import nltk, bs4  # NLP toolkit & BeautifulSoup
from bs4 import BeautifulSoup  # HTML parsing
from nltk import word_tokenize  # tokenizing
from nltk.corpus import stopwords

from nltk.sentiment.vader import SentimentIntensityAnalyzer  # sentiment analysis

# NLTK resources used by this notebook (downloads package if required)
for nltk_resource in (
    'stopwords',      # stop-word lists
    'punkt',          # for tokenizing
    'vader_lexicon',  # for sentiment
):
    nltk.download(nltk_resource)

In [None]:
import os
# Show the working directory: the relative paths used below ('eikon.cfg', 'news_archive/...') resolve against it
os.getcwd()

In [None]:
# Set up Eikon: the application key is read from the [eikon] section of a local config file

cfg = cp.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of files
# it did parse, so an empty result means the key lookup below would fail with a bare KeyError.
if not cfg.read('eikon.cfg'):  # adjust for different file location
    raise FileNotFoundError("Could not read 'eikon.cfg' from the current working directory")
ek.set_app_key(cfg['eikon']['app_id'])

In [None]:
def download_news(start_date, end_date, category = "macro"):
    """Download news for every day from `start_date` to `end_date` inclusive.

    Each day is split into six consecutive 4-hour windows; every window is
    fetched through `_download_news_with_datetime` with a count of 100 and the
    per-window frames are concatenated in chronological order.

    Parameters
    ----------
    start_date, end_date : datetime.date
        First and last calendar day to cover.
    category : str
        Passed through to `_download_news_with_datetime`.

    Returns
    -------
    pandas.DataFrame
        The result of `pd.concat` over all window frames.
    """
    first_midnight = datetime.combine(start_date, datetime.min.time())
    day_count = (end_date - start_date).days + 1

    frames = []
    for day_offset in range(day_count):
        for hour_offset in range(0, 24, 4):
            window_start = first_midnight + timedelta(days = day_offset, hours = hour_offset)
            window_end = window_start + timedelta(hours = 4)
            frames.append(_download_news_with_datetime(window_start, window_end, 100, category))

    return pd.concat(frames)

def _download_news_with_datetime(start_datetime, end_datetime, count = 100, category = "macro"):
    if category == "macro":
        query = '( Topic:NEWS1 OR Topic:TOPNWS ) AND (Topic:FRX OR Topic:CEN OR Topic:ECI OR Topic:INT) AND (Topic:EZC OR Topic:EZ OR Topic:GB) AND LEN NOT ( Topic:SPO)'
    else:
        stop("Invalid category")
        
    news_pickle = "news_archive/eikon_{}_{:%Y%m%d%H%M}_{:%Y%m%d%H%M}.pkl".format(category, start_datetime, end_datetime)
    try: 
        news = pickle.load(open(news_pickle, 'rb'))
    except:
        news = ek.get_news_headlines(query,
                                           date_from="{:%Y-%m-%dT%H:%M}:00".format(start_datetime), 
                                           date_to ="{:%Y-%m-%dT%H:%M}:00".format(end_datetime),
                                           count = count)
        news = news.drop_duplicates('storyId')
        stories = []
        print("Downloading stories for {} news articles from {:%Y-%m-%dT%H:%M} to {:%Y-%m-%dT%H:%M}".format(len(news), start_datetime, end_datetime))
        for i, storyId in enumerate(news['storyId']):
            try:
                html = ek.get_news_story(storyId)
                story = BeautifulSoup(html, 'html5lib').get_text(strip = True)
                stories.append(story)
            except:
                stories.append('')
                
        # Clean up extra text    
        news['story'] = [re.sub("^\..*?{.*?}","", x) for x in stories]
        if end_datetime < datetime.now():
            pickle.dump(news, open(news_pickle, 'wb'))
    
    return news

In [None]:
%%time

# Download news from last 3 months (90 days back from today, today included)
n_days = 90
end_date = date.today()
start_date = end_date - timedelta(days = n_days)

news = download_news(start_date, end_date)
raw_news = news.copy()  # untouched copy of the download, kept before any cleaning

In [None]:
# Remove news with same storyId (first occurrence wins); reset_index() then turns the
# previous index into a regular column and gives the frame a fresh 0..n-1 index
news = news.drop_duplicates(subset = 'storyId', keep = 'first').reset_index(drop = False)

In [None]:
# Use title if no content
# A single .loc write updates `news` itself; the chained form news[mask]['story'] = ...
# assigns into a temporary copy and leaves the frame unchanged.
empty_story = news['story'] == ""
news.loc[empty_story, 'story'] = news.loc[empty_story, 'text']

In [None]:
# Clean up opening para: drop everything up to and including the "<month> <day> (<source>) -" dateline
# regex=True is passed explicitly: pandas >= 2.0 defaults to regex=False and rejects a compiled pattern
news['storyClean'] = news['story'].str.replace(re.compile(r"^.*?(Jan(uary)*|Feb(ruary)*|Mar(ch)*|Apr(il)*|May|Jun(e)*|Jul(y)*|Aug(ust)*|Sep(tember)*|Oct(ober)*|Nov(ember)*|Dec(ember)*)\.* [1-3]{0,1}[0-9] \(.+?\) *-* *", flags=re.IGNORECASE), " ", regex=True)

In [None]:
# Clean footer: cut from the first credit line, contact e-mail or copyright notice to the end of the story
# regex=True is passed explicitly: pandas >= 2.0 defaults to regex=False and rejects a compiled pattern
news['storyClean'] = news['storyClean'].str.replace(re.compile(r"\((Compiled|Polling|Editing|Writing|Graphic|Additional reporting|Reporting) by(.|\n)*$", flags=re.IGNORECASE), " ", regex=True)
news['storyClean'] = news['storyClean'].str.replace(re.compile(r"(Email:|\()[a-z0-9|\.]+@(thomsonreuters|tr|news\.reuters)\.com(.|\n)*$", flags=re.IGNORECASE), " ", regex=True)
news['storyClean'] = news['storyClean'].str.replace(re.compile(r"(\(.{5,30}\).{5,10}){0,1}Copyright Thomson Reuters(.|\n)*$", flags=re.IGNORECASE), " ", regex=True)

In [None]:
# Clean footer
#news['storyClean'] = news['storyClean'].apply(lambda x: re.sub("\((Compiled|Polling|Editing|Writing|Graphic|Additional reporting|Reporting) by(.|\n)*$", "", x, re.IGNORECASE))
#news['storyClean'] = news['storyClean'].apply(lambda x: re.sub("(Email:|\()[a-z0-9|\.]+@(thomsonreuters|tr|news\.reuters)\.com(.|\n)*$", "", x, re.IGNORECASE))
#news['storyClean'] = news['storyClean'].apply(lambda x: re.sub("(\(.{5,30}\).{5,10}){0,1}Copyright Thomson Reuters(.|\n)*$", "", x, re.IGNORECASE))

In [None]:
# Clean bracket text: remove short parenthesised fragments (5-30 characters) and any remaining "(Reporting by ...)" credit
# regex=True is passed explicitly: pandas >= 2.0 defaults to regex=False and rejects a compiled pattern
news['storyClean'] = news['storyClean'].str.replace(re.compile(r"\(.{5,30}?\)", flags=re.IGNORECASE), " ", regex=True)
news['storyClean'] = news['storyClean'].str.replace(re.compile(r"\(Reporting by.*?\)", flags=re.IGNORECASE), " ", regex=True)

In [None]:
def clean_text(text, stop = None):
    """Normalise a story to a space-separated string of lower-case content words.

    The text is tokenized with `word_tokenize`; only purely alphabetic tokens
    are kept, they are lower-cased, and tokens found in `stop` are dropped.

    Parameters
    ----------
    text : str
        Raw story text.
    stop : iterable of str, optional
        Words to remove. Defaults to the NLTK English stop words plus
        'reuters', 'click', 'full', 'story', 'says' and 'said'.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, in their original order.
    """
    if stop is None:
        stop = stopwords.words('english') + ['reuters', 'click', 'full', 'story', "says", "said"]
    # A set gives constant-time membership tests; the list form is scanned once per token
    stop_words = set(stop)

    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    return " ".join(token for token in lowered if token not in stop_words)

In [None]:
# Clean words in news: tokenize, lower-case and strip stop words story by story
news['storyClean'] = news['storyClean'].map(clean_text)

# Remove very similar articles (repeat but in slightly different form)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `stop` must exist before the vectorizer is built; without this line the cell only works when
# the "Remove stopwords" cell near the end of the notebook has already been run (NameError on a
# fresh kernel). The list is the same one that cell assigns.
stop = stopwords.words('english') + ['reuters', 'click', 'full', 'story']

# TF-IDF over 1- to 3-grams, ignoring terms that appear in fewer than 5 documents
vec = TfidfVectorizer(stop_words=stop, ngram_range = (1,3), min_df = 5)

In [None]:
# Learn the vocabulary and build the document-term TF-IDF matrix (one row per article)
X = vec.fit_transform(news['storyClean'].to_numpy())

In [None]:
# Document-by-document similarity: matrix product of the TF-IDF matrix with its transpose,
# then converted to a dense array for element-wise filtering
pairwise_similarity = X @ X.T
arr = pairwise_similarity.toarray()

In [None]:
# Pairs of articles scoring above this similarity are treated as repeats of each other
SIMILARITY_THRESHOLD = 0.98

# Eliminate diagonals (self-similarity) and low similarity
np.fill_diagonal(arr, 0)
arr[arr <= SIMILARITY_THRESHOLD] = 0

# Articles where match is more than the threshold
r, c = np.where(arr > SIMILARITY_THRESHOLD)

# Identify indices to delete: of every matching pair the lower position is kept and the
# higher one is dropped. The diagonal was zeroed above, so r and c never coincide.
to_delete = np.unique(np.maximum(r, c)).tolist()

In [None]:
# Delete said indices (row labels collected in `to_delete`)
news = news.drop(index = to_delete)

# Fit and predict

In [None]:
# Split at midnight five days ago. The cutoff is computed once, and a row stamped exactly
# at the cutoff is counted as recent, so every row lands in exactly one of the two frames.
recent_cutoff = datetime.combine(date.today(), datetime.min.time()) - timedelta(days = 5)
news_today = news[news['index'] >= recent_cutoff]
news_history = news[news['index'] < recent_cutoff]

In [None]:
# Articles strictly between ten and five days before today's midnight
midnight_today = datetime.combine(date.today(), datetime.min.time())
after_start = news['index'] > midnight_today - timedelta(days = 10)
before_end = news['index'] < midnight_today - timedelta(days = 5)
news_lastweek = news[after_start & before_end]

In [None]:
# Refit model on the articles that remain after de-duplication; dense matrix, one row per article
X_alltime = vec.fit_transform(news['storyClean'].to_numpy()).todense()

In [None]:
# Score all recent articles as one document: join them with ';' and transform the single string
today_document = ";".join(news_today['storyClean'])
X_today_combined = vec.transform([today_document]).todense()

In [None]:
# Score each recent article separately (sparse matrix, one row per article)
X_today = vec.transform(news_today['storyClean'].to_numpy())

In [None]:
# Score last week's articles as one document: join them with ';' and transform the single string
lastweek_document = ";".join(news_lastweek['storyClean'])
X_lastweek = vec.transform([lastweek_document]).todense()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when the installed version has it
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# One row per vocabulary term. X_alltime holds one row per article, so it is averaged over
# articles (as is done for X_today); indexing [0] would report the first article's scores only.
# NOTE(review): the mean is assumed to be the intended all-time aggregate - confirm.
results = pd.DataFrame({"word": feature_names,
                        "score_alltime": np.asarray(X_alltime.mean(axis = 0)).ravel(),
                        "score_combined": np.asarray(X_today_combined).ravel(),
                        "score": np.asarray(X_today.mean(axis = 0)).ravel(),
                        "score_lastweek": np.asarray(X_lastweek).ravel()}).sort_values(by = "score_combined", ascending = False)

In [None]:
# Change of the current scores against last week, and of last week against the all-time score
results = results.assign(
    curr_change_combined = results['score_combined'] - results['score_lastweek'],
    curr_change = results['score'] - results['score_lastweek'],
    prev_change = results['score_lastweek'] - results['score_alltime'],
)

In [None]:
def filter_results(word_df, score_column, n = 10):
    """Discount each term's score by the scores of the longer n-grams built on it.

    A longer n-gram counts against a term when it starts with "<term> " or ends
    with " <term>" (for example "rate cut" and "interest rate" both count
    against "rate"). All discounts are computed from the scores as passed in,
    so the result does not depend on row order. Missing scores are skipped.

    Parameters
    ----------
    word_df : pandas.DataFrame
        Needs a 'word' column of space-separated n-grams and `score_column`.
        The caller's frame is not modified.
    score_column : str
        Name of the numeric column to adjust and sort by.
    n : int
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        A copy sorted by the adjusted score, highest first, with a 'rank'
        column holding each row's position before the adjustment.
    """
    word_df = word_df.sort_values(score_column, ascending = False).reset_index(drop = True)
    word_df['rank'] = list(range(len(word_df)))

    # Total score of the longer n-grams that begin or end with each shorter term
    discounts = {}
    for gram, gram_score in zip(word_df['word'], word_df[score_column]):
        tokens = gram.split(' ')
        if len(tokens) < 2 or pd.isna(gram_score):
            continue
        # Every proper prefix and suffix on a word boundary; a set so that an n-gram which
        # both starts and ends with the same term is only counted once
        parents = set()
        for cut in range(1, len(tokens)):
            parents.add(' '.join(tokens[:cut]))
            parents.add(' '.join(tokens[cut:]))
        for parent in parents:
            discounts[parent] = discounts.get(parent, 0) + gram_score

    word_df[score_column] = [score - discounts.get(word, 0)
                             for word, score in zip(word_df['word'], word_df[score_column])]
    return word_df.sort_values(score_column, ascending = False)

In [None]:
# Terms ordered by the change of their mean per-article score against last week, largest increase first
results.sort_values("curr_change", ascending = False)

In [None]:
# Same ordering by "curr_change" shown again, largest increase first
results.sort_values("curr_change", ascending = False)

In [None]:
# "curr_change" ranking after filter_results has adjusted each term for its longer n-grams
filter_results(results, "curr_change")

In [None]:
# Top 20 terms by increase in score against last week
results.sort_values(by = "curr_change", ascending = False).head(20)

In [None]:
# For concordance: join the recent cleaned stories into one string and keep its lower-cased alphabetic tokens
curr_news = ";".join(news_today['storyClean'].values)
curr_tokens = [token.lower() for token in word_tokenize(curr_news) if token.isalpha()]
#stop = stopwords.words('english') + ['reuters', 'click', 'full', 'story']
#curr_tokens = [x for x in curr_tokens if x not in stop]

text = nltk.Text(curr_tokens)

In [None]:
# Print the occurrences of "policymakers" in the recent stories together with their surrounding words
text.concordance("policymakers")

# Misc analysis

In [None]:
# Measure sentiment of a single article (the row labelled 0) with the VADER analyzer
sid = SentimentIntensityAnalyzer()
first_story = news['story'][0]
scores = sid.polarity_scores(first_story)
scores

In [None]:
# Join every raw story into one ';'-separated string and tokenize it
all_news = ";".join(news['story'])
all_tokens = word_tokenize(all_news)

In [None]:
# Remove non-words: keep purely alphabetic tokens, lower-cased
clean_tokens = [token.lower() for token in all_tokens if token.isalpha()]

In [None]:
# Remove stopwords: NLTK's English list extended with a few newswire boilerplate words
extra_stop_words = ['reuters', 'click', 'full', 'story']
stop = stopwords.words('english') + extra_stop_words
clean_tokens = [token for token in clean_tokens if token not in stop]

In [None]:
# Count how often each remaining token occurs and plot the 20 most frequent ones (plain counts, not cumulative)
freq = nltk.FreqDist(clean_tokens)
freq.plot(20, cumulative = False)