Reuters Headlines 2015-2017

In [27]:
# packages for analysis
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import datetime
import nltk
import nltk.data
from nltk.util import bigrams 
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
import collections
from collections import Counter
import re
import string

In [28]:
# Path to Excel files for each year of Reuters Newswire headlines
path_to_data = "/Users/varsha/GWU_SPRING2019/Topics in Big Data/Project/Code/"
newsyear = ['Reuters Newswire 2017.xlsx']

In [29]:
def processfile(datapath, filename):
    # Read in Excel file
    yearfile = pd.ExcelFile(join(datapath, filename))
    
    #Parse sheet 1, drop time from datetime, group by day
    sheet1 = yearfile.parse('Sheet1')
    sheet1['publish_time'] = sheet1['publish_time'].floordiv(10000)
    daygroups1 = sheet1.groupby(['publish_time'])['headline_text'].apply(lambda x:' | '.join(x.astype(str))).reset_index()
    
    # If sheet 2 exists
    try:
        #Parse sheet 2, drop time from datetime, group by day
        sheet2 = yearfile.parse('Sheet2')
        sheet2['publish_time'] = sheet2['publish_time'].floordiv(10000)
        daygroups2 = sheet2.groupby(['publish_time'])['headline_text'].apply(lambda x:' | '.join(x.astype(str))).reset_index()
    except:
        pass
    
    #Append data from sheet 2 to sheet 1, if sheet 2 exists
    try:
        headlines_year = daygroups1.append(daygroups2, ignore_index=True)
    except:
        headlines_year = daygroups1
    
    #Store dataframe for year in list
    appendedheadlines.append(headlines_year)
    return appendedheadlines

In [30]:
# Loop through each year and process its Excel file, then concatenate into one dataframe
appendedheadlines = []
for year in newsyear:
    processfile(path_to_data, year)

df_headlines = pd.concat(appendedheadlines)

In [31]:
# Reset index of complete dataframe
headlines = df_headlines.reset_index(drop=True)

In [32]:
#headlines.iloc[-1,1]

In [33]:
# Format publish_time as a datetime
headlines['publish_time'] = pd.to_datetime(headlines['publish_time'], format='%Y%m%d')

In [34]:
headlines.head(10)

Unnamed: 0,publish_time,headline_text
0,2017-01-01,China's brokerages told to manage reputation r...
1,2017-01-02,Kia Motors says plans to sell 3.17 million veh...
2,2017-01-03,Around 60 killed in Brazil prison riot - state...
3,2017-01-04,BRIEF-Hunter Hall International updates on off...
4,2017-01-05,Russia's Rosneftegaz closes Rosneft privatisat...
5,2017-01-06,BRIEF-Richter must be more aggressive in produ...
6,2017-01-07,DIARY-Emerging Markets Economic Events to Jan....
7,2017-01-08,Mutinying soldiers in Ivory Coast agree to ret...
8,2017-01-09,DIARY-Emerging Markets Economic Events to Jan....
9,2017-01-10,DIARY-Emerging Markets Economic Events to Jan....


In [35]:
# Create year and week number columns
headlines['week_number'] = headlines['publish_time'].dt.strftime('%W')
headlines['year'] = headlines['publish_time'].dt.year

In [36]:
# Group headlines by year and week
weekly_headlines = headlines.groupby(['year','week_number'])['headline_text'].apply(lambda x:' | '.join(x.astype(str))).reset_index()

In [37]:
weekly_headlines.tail()

Unnamed: 0,year,week_number,headline_text
48,2017,48,Balderton Capital completes new $375 mln Europ...
49,2017,49,DIARY-Top Economic Events to Jan 30 | Facebook...
50,2017,50,DIARY-Top Economic Events to Jan 30 | Japanese...
51,2017,51,DIARY-Emerging Markets Economic Events to Feb....
52,2017,52,Avalanche top Coyotes in game marred by fights...


In [38]:
#weekly_headlines.iloc[1,2]

In [39]:
# 'ADVISORY', 'SHAREHOLDER ALERT', 'COLUMN', 'DIARY', 'BRIEF', 'GRAPHIC', 'UPDATE 1', 'UPDATE 2', 'UPDATE 3', 'EMBARGOED'

In [40]:
# Create instances of classes for natural language processing
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
treebank_tokenizer = TreebankWordTokenizer()
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

In [41]:
# Expanding list of stopwords to be removed by including words specific to Reuters data - 
# observed words Reuters seems to use to indicate types of news but that do not carry news content
#user_defined_stop_words = ['ADVISORY', 'ALERT', 'ANALYSIS', 'BRIEF', 'COLUMN', 'CORRECTED', 'DIARY', 'EMBARGOED', 
#                           'EXCLUSIVE', 'FEATURE', 'FRAUD ALERT', 'GRAPHIC',
#                           'INSIGHT', 'INVESTIGATION ALERT', 'INVESTOR ALERT', 'PREVIEW', 'SHAREHOLDER ALERT', 
#                           'UPDATE', 'UPDATE 1', 'UPDATE 2', 'UPDATE 3', 'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 
#                           'Aug', 'Sep', 'Oct', 'Nov', 'Dec', ' R ', ' TM ', ' plc ', ' LLC ', ' PLC ', ' CES '
#                           'PRESS DIGEST', 'GLOBAL' 'ETF Net Asset Value'] 
                    
user_defined_stop_words = ['UPDATE', 'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 
                           ' R ', ' TM ', ' plc ', ' LLC ', ' PLC ', ' CES '] 

# Could combine:
#a = nltk.corpus.stopwords.words('english')
#b = list(string.punctuation) + user_defined_stop_words
#stopwords = set(a).union(b)

In [44]:
def preprocess(x):
    punkt_sentences = sentence_tokenizer.tokenize(x)
    sentences_words = [treebank_tokenizer.tokenize(sentence) for sentence in punkt_sentences] #segment sentences
    all_tokens = [word for sentence in sentences_words for word in sentence] #tokenize words
    clean_tokens = [w for w in all_tokens if w not in user_defined_stop_words] #drop Reuters stopwords
    #tokens = [word.lower() for word in clean_tokens]# make lowercase
    content = [w for w in clean_tokens if w.lower() not in stopwords] #drop regular stopwords
    content2 = [words for words in content if words.isalpha()] #drop punctuation
    lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for word in content2] #lemmatize
    return ' '.join(lemmatized_words)

In [48]:
# Preprocess headlines for each date
headlines['clean_text'] = headlines['headline_text'].apply(preprocess)

In [49]:
# Create list of each date
week = list(headlines['week_number'])

In [50]:
# Create list from each row of cleaned text from dataframe
cleantextlist = list(headlines['clean_text'])

In [52]:
# Zip lists together and save each date's news headlines as a text file.
for i, t in zip(week ,cleantextlist):
    file = open('/Users/varsha/GWU_SPRING2019/Topics in Big Data/Project/Code/WeeklyNews/' + str(i) + '.txt', 'w',encoding='utf-8')
    file.write(t)
    file.close()

In [53]:
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords




In [54]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [55]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

[nltk_data] Downloading package wordnet to /Users/varsha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [56]:
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [57]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varsha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [59]:
from sklearn import metrics
from nltk.stem.porter import PorterStemmer
article_hash = {}
directory = "/Users/varsha/GWU_SPRING2019/Topics in Big Data/Project/Code/WeeklyNews"


files = [f for f in listdir(directory) if isfile(join(directory, f))]

for file in files:
    #file = pathlib.Path( "/Users/varsha/Python Workspace/homework_2/data")
    #print(file)
    fileop=open(directory+"/"+file,"r")
    text = fileop.read()
    article_hash[file]  = text
    print("-----------------------------------")
    print("file;" , file)
    print(article_hash[file])

-----------------------------------
file; 29.txt
Germany say Crimean turbine scandal souring relation Russia Egypt court sentence death prosecutor killing Sons remember Princess Diana film year death Czech judge denounce Poland legal overhaul attack freedom Polish court overhaul meet growing wave criticism protest Thousands dance Berlin promote gay lesbian right Germany say Crimean turbine scandal souring relation Russia paper championship result standing Hundreds ISIS militant corps await repatriation Libya Maglione win third term FINA president bitter campaign Violence flare Caracas march violinist hurt best enough defending champion Peaty say Man Utd Liverpool fan call monkey Zaha say Man Utd Liverpool fan call monkey Ham sign Arnautovic club record fee leaf Man City join Roma Things new season explore impact Upside world Kolarov leaf Man City join Roma Open score shoot take lead Alabama PGA Tour Barbasol Championship score EU sound alarm urge coordinate Russia sanction Kuwaiti oil 

In [60]:
newspath = os.path.join(os.getcwd(), 'WeeklyNews')

In [61]:
nltk.download('wordnet')


newspath = os.path.join(os.getcwd(),'WeeklyNews')



def get_topics(text):
    def lemmatize_stemming(text):
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

    def preprocess(text):
        result = []
        for token in gensim.utils.simple_preprocess(text):
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                #result.append(token)
                result.append(lemmatize_stemming(token))
        return result


    words = []
    for word in text.split(' '):
        words.append(word)

    processed_data = preprocess(text)

    dictionary = gensim.corpora.Dictionary([processed_data])

    bow_corpus = [dictionary.doc2bow(processed_data)]

    bow_doc_0 = bow_corpus[0]

    tfidf = models.TfidfModel(bow_corpus)

    corpus_tfidf = tfidf[bow_corpus]

    # LDA Model using Bag of Words
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=15, id2word=dictionary, passes=2, workers=4)

    topic_score_list = lda_model.show_topics(num_topics=1, num_words=15, log=False, formatted=False)[0][1]
    topics_list = [topic[0] for topic in topic_score_list]
    return(topics_list)
    




def compute_afinn_score(text):
    afinn = Afinn(emoticons=True)

    #compute sentiment scores (polarity) and labels
    sentiment_score     = [afinn.score(text)]
    sentiment_category  = ['positive' if score > 0
                           else 'negative' if score < 0
                           else 'neutral'
                           for score in sentiment_score]

    #print(sentiment_score)
    #print(sentiment_category)
    return(sentiment_score, sentiment_category)


def process_text(source,text):
    article_df = pd.DataFrame(columns=['source','topics'])      # initialize dataframe for each article
    #person_names=person_list

    #article_df['a'] = None

    #person_names = get_names(text)
    article_df['source'] = pd.Series(dtype='str')
    article_df['source'] = source
    
    topics = get_topics(text)
    article_df['topics'] = [topics]
    article_df['source'] = source
    (sentiment_score, sentiment_category) = compute_afinn_score(text)
    article_df['sentiment_score'] = sentiment_score
    article_df['sentiment_category'] = sentiment_category
    return article_df




results_df = pd.DataFrame()

[nltk_data] Downloading package wordnet to /Users/varsha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [63]:
import gensim

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import numpy as np
from afinn import Afinn

newsfiles = [f for f in listdir(newspath) if isfile(join(newspath, f))]

for file in newsfiles:
   filepath = newspath + "/" + file
   data = open(filepath,'r',encoding='utf-8')
   text = data.read()
   results_df = results_df.append(process_text(file,text))

In [64]:

results_df['source'] = results_df['source'].astype(str)
print(results_df)




   source                                             topics  sentiment_score  \
0  29.txt  [result, championship, profit, open, rise, sta...             12.0   
0  15.txt  [result, profit, championship, stand, bank, re...            126.0   
0  01.txt  [preview, state, florida, shoot, attack, presi...             56.0   
0  00.txt  [year, attack, china, kill, growth, state, syr...           -196.0   
0  14.txt  [result, stand, syria, preview, attack, champi...            -27.0   
0  28.txt  [profit, feder, lead, wimbledon, bank, trump, ...            -24.0   
0  02.txt  [trump, preview, lead, flight, profit, rise, b...            -31.0   
0  16.txt  [vote, elect, french, stand, result, champions...             93.0   
0  17.txt  [profit, titl, stand, rise, trump, result, cha...            175.0   
0  03.txt  [trump, preview, state, protest, result, minis...             24.0   
0  07.txt  [rise, state, profit, trump, result, stand, ch...            179.0   
0  13.txt  [bank, kill, rise

In [65]:

writer = pd.ExcelWriter('resultsweekly.xlsx')
results_df.to_excel(writer,'results')
writer.save()

exit()

