# Topic Extraction and Sentiment Analysis

In [1]:



# packages for preprocessing news headlines
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import datetime
import nltk
import nltk.data
from nltk.util import bigrams 
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
import collections
from collections import Counter
import re
import string

In [2]:
# packages for analysis - do not run for preprocessing - just keeping handy
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.filterwarnings(action='ignore', category=FutureWarning, module='pyLDAvis')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet
from pprint import pprint
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim

In [3]:
# Path to Excel files for each year of Reuters Newswire headlines
path_to_data = "/Users/varsha/GWU_SPRING2019/Topics in Big Data/Project/Code/"
#newsyear = ['Reuters Newswire 2015.xlsx','Reuters Newswire 2016.xlsx','Reuters Newswire 2017.xlsx']
newsyear = ['Reuters Newswire 2017.xlsx']

In [4]:
def processfile(datapath, filename):
    # Read in Excel file
    yearfile = pd.ExcelFile(join(datapath, filename))
    
    #Parse sheet 1, drop times from datetime, group by day
    sheet1 = yearfile.parse('Sheet1')
    sheet1['publish_time'] = sheet1['publish_time'].floordiv(10000)
    daygroups1 = sheet1.groupby(['publish_time'])['headline_text'].apply(lambda x:' | '.join(x.astype(str))).reset_index()
    
    # If sheet 2 exists
    try:
        #Parse sheet 2, drop time from datetime, group by day
        sheet2 = yearfile.parse('Sheet2')
        sheet2['publish_time'] = sheet2['publish_time'].floordiv(10000)
        daygroups2 = sheet2.groupby(['publish_time'])['headline_text'].apply(lambda x:' | '.join(x.astype(str))).reset_index()
    except:
        pass
    
    #Append data from sheet 2 to sheet 1, if sheet 2 exists
    try:
        headlines_year = daygroups1.append(daygroups2, ignore_index=True)
    except:
        headlines_year = daygroups1
    
    #Store dataframe for year in list
    appendedheadlines.append(headlines_year)
    return appendedheadlines

In [5]:
# Loop through each year and process its Excel file, then concatenate into one dataframe
appendedheadlines = []
for year in newsyear:
    processfile(path_to_data, year)

df_headlines = pd.concat(appendedheadlines)

In [6]:
# Reset index of complete dataframe
headlines = df_headlines.reset_index(drop=True)

In [7]:
# Format publish_time as a datetime
headlines['publish_time'] = pd.to_datetime(headlines['publish_time'], format='%Y%m%d')
# Note that time still included, just not shown due to format.

In [8]:
# Create date column - no time included
headlines['date'] = headlines['publish_time'].dt.date

In [9]:
# Drop original date-time column
headlines = headlines.drop(['publish_time'], axis=1)

In [10]:
headlines.head()

Unnamed: 0,headline_text,date
0,China's brokerages told to manage reputation r...,2017-01-01
1,Kia Motors says plans to sell 3.17 million veh...,2017-01-02
2,Around 60 killed in Brazil prison riot - state...,2017-01-03
3,BRIEF-Hunter Hall International updates on off...,2017-01-04
4,Russia's Rosneftegaz closes Rosneft privatisat...,2017-01-05


In [11]:
# Create instances of classes for natural language processing
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
treebank_tokenizer = TreebankWordTokenizer()
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

In [12]:
# Expanding list of stopwords to be removed by including words specific to Reuters data - 
# observed words Reuters seems to use to indicate types of news but that do not carry news content
#user_defined_stop_words = ['ADVISORY', 'ALERT', 'ANALYSIS', 'BRIEF', 'COLUMN', 'CORRECTED', 'DIARY', 'EMBARGOED', 
#                           'EXCLUSIVE', 'FEATURE', 'FRAUD ALERT', 'GRAPHIC',
#                           'INSIGHT', 'INVESTIGATION ALERT', 'INVESTOR ALERT', 'PREVIEW', 'SHAREHOLDER ALERT', 
#                           'UPDATE', 'UPDATE 1', 'UPDATE 2', 'UPDATE 3', 'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 
#                           'Aug', 'Sep', 'Oct', 'Nov', 'Dec', ' R ', ' TM ', ' plc ', ' LLC ', ' PLC ', ' CES '
#                           'PRESS DIGEST', 'GLOBAL' 'ETF Net Asset Value'] 
                    
user_defined_stop_words = ['UPDATE', 'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 
                           ' R ', ' TM ', ' plc ', ' LLC ', ' PLC ', ' CES '] 

# Could combine:
#a = nltk.corpus.stopwords.words('english')
#b = list(string.punctuation) + user_defined_stop_words
#stopwords = set(a).union(b)

In [13]:
def preprocess(x):
    punkt_sentences = sentence_tokenizer.tokenize(x)
    sentences_words = [treebank_tokenizer.tokenize(sentence) for sentence in punkt_sentences] #segment sentences
    all_tokens = [word for sentence in sentences_words for word in sentence] #tokenize words
    clean_tokens = [w for w in all_tokens if w not in user_defined_stop_words] #drop Reuters stopwords
    #tokens = [word.lower() for word in clean_tokens]# make lowercase
    content = [w for w in clean_tokens if w.lower() not in stopwords] #drop regular stopwords
    content2 = [words for words in content if words.isalpha()] #drop punctuation
    lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for word in content2] #lemmatize
    return ' '.join(lemmatized_words)

In [14]:
# Preprocess headlines for each date
headlines['clean_text'] = headlines['headline_text'].apply(preprocess)

In [15]:
# Create list of each date
dates = list(headlines['date'])

In [16]:
# Create list from each row of cleaned text from dataframe
cleantextlist = list(headlines['clean_text'])

In [17]:
# Zip lists together and save each date's news headlines as a text file.
for i, t in zip(dates, cleantextlist):
    file = open('/Users/varsha/GWU_SPRING2019/Topics in Big Data/Project/Code/Daily News/' + str(i) + '.txt', 'w',encoding='utf-8')
    file.write(t)
    file.close()

In [18]:
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords





In [19]:
newspath = os.path.join(os.getcwd(), 'Code','Daily News')
newspath

'/Users/varsha/GWU_SPRING2019/Topics in Big Data/Project/Code/Daily News'

In [20]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens



import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)


nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

[nltk_data] Downloading package wordnet to /Users/varsha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varsha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:

from sklearn import metrics
from nltk.stem.porter import PorterStemmer
article_hash = {}
directory = "/Users/varsha/GWU_SPRING2019/Topics in Big Data/Project/Code/Daily News/"

files = [f for f in listdir(directory) if isfile(join(directory, f))]

for file in files:
    #file = pathlib.Path( "/Users/varsha/Python Workspace/homework_2/data")
    #print(file)
    fileop=open(directory+"/"+file,"r")
    text = fileop.read()
    article_hash[file]  = text
    print("-----------------------------------")
    print("file;" , file)
    print(article_hash[file])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [22]:
def get_topics(text):
    def lemmatize_stemming(text):
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

    def preprocess(text):
        result = []
        for token in gensim.utils.simple_preprocess(text):
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                #result.append(token)
                result.append(lemmatize_stemming(token))
        return result


    words = []
    for word in text.split(' '):
        words.append(word)

    processed_data = preprocess(text)

    dictionary = gensim.corpora.Dictionary([processed_data])

    bow_corpus = [dictionary.doc2bow(processed_data)]

    bow_doc_0 = bow_corpus[0]

    tfidf = models.TfidfModel(bow_corpus)

    corpus_tfidf = tfidf[bow_corpus]

    # LDA Model using Bag of Words
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=15, id2word=dictionary, passes=2, workers=4)

    topic_score_list = lda_model.show_topics(num_topics=1, num_words=15, log=False, formatted=False)[0][1]
    topics_list = [topic[0] for topic in topic_score_list]
    return(topics_list)
    #return(lda_model.show_topics(num_topics=1, num_words=15, log=False, formatted=False)[0][1])




In [23]:
def compute_afinn_score(text):
    afinn = Afinn(emoticons=True)

    #compute sentiment scores (polarity) and labels
    sentiment_score     = [afinn.score(text)]
    sentiment_category  = ['positive' if score > 0
                           else 'negative' if score < 0
                           else 'neutral'
                           for score in sentiment_score]

    #print(sentiment_score)
    #print(sentiment_category)
    return(sentiment_score, sentiment_category)




In [24]:
def process_text(source,text):
    article_df = pd.DataFrame(columns=['source','topics'])      # initialize dataframe for each article
    #person_names=person_list

    #article_df['a'] = None

   
    #article_df['source'] = pd.Series(dtype='str')
    #article_df['source'] = source
    
    topics = get_topics(text)
    article_df['topics'] = [topics]
    article_df['source'] = source
    (sentiment_score, sentiment_category) = compute_afinn_score(text)
    article_df['sentiment_score'] = sentiment_score
    article_df['sentiment_category'] = sentiment_category
    return article_df




In [25]:
results_df = pd.DataFrame()

In [26]:
results_df

In [27]:
from os import listdir
from os.path import isfile, join
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import numpy as np
from afinn import Afinn
afinn = Afinn()

In [28]:
newsfiles = [f for f in listdir(newspath) if isfile(join(newspath, f))]

for file in newsfiles:
   filepath = newspath + "/" + file
   data = open(filepath,'r',encoding='utf-8')
   text = data.read()
   results_df = results_df.append(process_text(file,text))

In [29]:
results_df

Unnamed: 0,source,topics,sentiment_score,sentiment_category
0,2017-05-24.txt,"[share, profit, group, offer, plan, bank, mill...",645.0,positive
0,2017-05-30.txt,"[profit, share, loss, rise, india, bank, fall,...",607.0,positive
0,2017-05-18.txt,"[share, profit, trump, rise, group, announc, m...",760.0,positive
0,2017-02-11.txt,"[trump, report, stake, profit, share, hold, ri...",45.0,positive
0,2017-02-05.txt,"[trump, preview, stand, travel, result, judg, ...",13.0,positive
0,2017-12-28.txt,"[share, unit, china, say, group, yuan, hold, b...",521.0,positive
0,2017-12-14.txt,"[bank, say, share, profit, group, unit, deal, ...",936.0,positive
0,2017-10-05.txt,"[share, announc, stock, bank, price, sale, mar...",197.0,positive
0,2017-10-11.txt,"[share, bank, sale, deal, group, announc, repo...",244.0,positive
0,2017-07-21.txt,"[share, profit, million, announc, bank, report...",1000.0,positive


In [30]:
results_df.to_csv("output.csv")
