# Reuters Archives - Amazon Comprehend (NLP and Text Analytics)

**Objectives:** 
1. Use Amazon Comprehend for Topic Modeling and Sentiment Analysis
https://docs.aws.amazon.com/comprehend/latest/dg/getting-started.html
- Sentiment Analysis - https://docs.aws.amazon.com/comprehend/latest/dg/how-sentiment.html
- Topic Modeling - https://docs.aws.amazon.com/comprehend/latest/dg/topic-modeling.html

- The Reuters dataset used here, "reuters_data.csv", was scraped from https://uk.reuters.com/news/archive/GCA-ForeignExchange on Dec 2, 2018. It contains:
- articles from 2010-05-17 to 2018-11-30
- 10,200 total articles
- columns: Index([u'Date', u'Timestamp', u'excerpt', u'link', u'page', u'post', u'title'], dtype='object')

In [13]:
import boto3
import botocore

In [179]:
# Download the location-enriched Reuters CSV from S3 into the working directory.
Bucket = "capstoneproject-770851433061"
Key = "reuters_data_with_location.csv"  # name of the object in S3 to download
outPutName = "reuters_data_with_location.csv"  # local file name for the downloaded copy
s3 = boto3.resource('s3')
try:
    s3.Bucket(Bucket).download_file(Key, outPutName)
except botocore.exceptions.ClientError as e:
    # Only a missing object is reported; every other client error propagates.
    if e.response['Error']['Code'] != "404":
        raise
    print("The object does not exist.")

In [None]:
# Install nltk into the per-user site-packages.
# The original cell held a bare shell command, which is a SyntaxError in a Python
# code cell; running pip through sys.executable targets the interpreter that backs
# this kernel.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "--user", "nltk"])

# Objective 1. Topic Modeling and Sentiment Analysis on Article Excerpts

## Load Dataset

In [1]:
import pandas as pd

# Read the scraped Reuters articles into the main working DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/reuters_data.csv')

In [2]:
# Display the full text of the article stored under index label 1.
df['post'][1]

'LONDON (Reuters) - Sterling slumped against the dollar and the euro on Tuesday as doubts grew about whether British Prime Minister Theresa May can get a Brexit agreement through a divided parliament.  May began a tour of the United Kingdom to drum up support for her Brexit divorce deal with the European Union but her deputy said parliament might reject it if asked to vote on it now. May\xa1\xa6s attempts to win over critics in her Conservative and opposition parties in order to get her deal approved are seen by investors as increasingly fraught.   \xa1\xa7The failure of the pound to rally on recent positive developments suggest the market is pricing in that the deal won\xa1\xa6t pass the first time in parliament,\xa1\xa8 said Lee Hardman, a currency analyst at MUFG. \xa1\xa7The pound will likely trade with increased volatility during the next two weeks.\xa1\xa8 Against the dollar, the British currency fell more than half a percent to $1.2730, its lowest level in nearly two weeks. It a

In [3]:
# Take the 'post' column (full article text, not the 'excerpt' column) and preview it.
reuters_post = df['post'][:]
reuters_post.head(n=5)

0    NEW YORK (Reuters) - The U.S. dollar gained on...
1    LONDON (Reuters) - Sterling slumped against th...
2    LONDON (Reuters) - Sterling gave up most of it...
3    NEW YORK (Reuters) - The dollar tumbled from t...
4    LONDON (Reuters) - The pound fell towards a tw...
Name: post, dtype: object

In [26]:
#reuters_excerpt.to_csv('./data/reuters_excerpt.csv')

In [4]:
# Write one article per line to a plain-text file.
# A plain loop replaces the list comprehension that was used only for its side
# effects, and the `with` block already closes the file, so the explicit close()
# call is gone.
with open("reuters_post.txt", "w") as my_output_file:
    for row in reuters_post:
        my_output_file.write("".join(row) + '\n')
# Manual follow-up: copy and paste the contents into a UTF-encoded file
# (presumably the *_utf.txt file named in the topic-modeling output -- confirm).

In [None]:
#import io
#with open("reuters_excerpt.txt",'r') as f:
#    text = f.read()
# process Unicode text
#with io.open("reuters_excerpt_utf.txt",'w',encoding='utf8') as f:
#    f.write(text)

## Submit to Amazon Comprehend for Topic Modeling and Sentiment
### Grab the output files from Topic Modeling

In [5]:
# Load the two topic-modeling result files: the weighted terms of every topic
# and the topic proportion recorded for every document.
topic_terms = pd.read_csv(filepath_or_buffer='./data/post-topic-terms.csv')
doc_topics = pd.read_csv(filepath_or_buffer='./data/post-doc-topics.csv')

In [6]:
# Preview the first rows of the topic/term/weight table.
topic_terms.head(n=5)

Unnamed: 0,topic,term,weight
0,0,year,0.023462
1,0,show,0.017193
2,0,poll,0.015251
3,0,reuters,0.014388
4,0,european,0.014177


In [7]:
# Preview the first rows of the document/topic/proportion table.
doc_topics.head(n=5)

Unnamed: 0,docname,topic,proportion
0,reuters_postt_utf.txt:6,16,1.0
1,reuters_postt_utf.txt:41,14,1.0
2,reuters_postt_utf.txt:76,1,1.0
3,reuters_postt_utf.txt:111,0,1.0
4,reuters_postt_utf.txt:146,4,1.0


In [8]:
import re

# docname values look like "reuters_postt_utf.txt:6"; keep only the numeric suffix
# (presumably the document's line number in the submitted file -- confirm).
# The file name is escaped because its dots are regex metacharacters, and the
# pattern is anchored so only a leading prefix is removed.
_docname_prefix = re.compile('^' + re.escape('reuters_postt_utf.txt:'))
# astype(str) lets this cell be re-run after docname has already become numeric.
doc_topics['docname'] = doc_topics['docname'].astype(str).apply(lambda x: _docname_prefix.sub('', x))
doc_topics.docname = pd.to_numeric(doc_topics.docname)
# Order the rows by that number.
doc_topics = doc_topics.sort_values('docname')

In [9]:
# Renumber the sorted rows from 0; the previous index is kept as an 'index' column.
# NOTE(review): re-running this cell adds another index column each time.
doc_topics = doc_topics.reset_index(drop=False)
doc_topics.head(n=5)

Unnamed: 0,index,docname,topic,proportion
0,3204,0,27,1.0
1,8159,1,19,1.0
2,9325,2,4,1.0
3,583,3,27,1.0
4,3496,4,19,1.0


In [11]:
# Number of distinct topic ids that have terms.
len(pd.unique(topic_terms['topic']))

30

In [10]:
# Number of distinct topic ids that appear in the document table.
# The terms table has 30 topics, but only 22 of them show up here.
len(pd.unique(doc_topics['topic']))

22

In [12]:
# Count the document rows (non-null 'proportion' values) under each topic id.
doc_topics.groupby('topic')['proportion'].agg('count')

topic
0     1231
1     1163
2       53
3       35
4     1402
5     2048
6       46
7       24
8        8
9        8
10       1
11       6
12     849
13     595
14     456
15      11
16     528
17     425
18      12
19     650
26     241
27     408
Name: proportion, dtype: int64

In [13]:
from collections import defaultdict

In [None]:
#cluster_groups = kmeans.predict(ret2.T)
#set(cluster_groups)
#print(cluster_groups)
#print(list(zip(cluster_groups, ret2.columns)))

In [14]:
# Pull out the topic ids and their terms as parallel Series,
# then print every (topic, term) pair.
topic, term = topic_terms['topic'], topic_terms['term']
print(list(zip(topic, term)))

[(0, 'year'), (0, 'show'), (0, 'poll'), (0, 'reuters'), (0, 'european'), (0, 'month'), (0, 'currency'), (0, 'union'), (0, 'sterling'), (0, 'leave'), (1, 'yen'), (1, 'dollar'), (1, 'japanese'), (1, 'safe'), (1, 'currency'), (1, 'haven'), (1, 'japan'), (1, 'low'), (1, 'minister'), (1, 'monday'), (2, 'market'), (2, 'primar'), (2, 'referendum'), (2, 'york'), (2, 'theresa'), (2, 'centrar'), (2, 'border'), (2, 'consecutive'), (2, 'campaign'), (2, 'sideline'), (3, 'friday'), (3, 'thursday'), (3, 'wednesday'), (3, 'tuesday'), (3, 'monday'), (3, 'saturday'), (3, 'sunday'), (3, 'prove'), (3, 'road'), (3, 'story'), (4, 'euro'), (4, 'zone'), (4, 'debt'), (4, 'crisis'), (4, 'european'), (4, 'investor'), (4, 'greece'), (4, 'government'), (4, 'monday'), (4, 'bond'), (5, 'u.s'), (5, 'stock'), (5, 'global'), (5, 'bond'), (5, 'market'), (5, 'high'), (5, 'rise'), (5, 'investor'), (5, 'equity'), (5, 'european'), (6, 'bank'), (6, 'union'), (6, 'share'), (6, "bank's"), (6, 'company'), (6, 'group'), (6, 'ban

In [15]:
# First rows of the first two columns ('topic' and 'term').
topic_terms.head().iloc[:, [0, 1]]

Unnamed: 0,topic,term
0,0,year
1,0,show
2,0,poll
3,0,reuters
4,0,european


In [None]:
#similar_by_cluster = defaultdict(list)
#for a, b in zip(cluster_groups, ret2.columns):
#       similar_by_cluster[a].append(b)

In [16]:
# Collect the terms under their topic id: {topic id: [term, term, ...]}.
similar_by_cluster = defaultdict(list)
for topic_id, topic_word in zip(topic, term):
    similar_by_cluster[topic_id].append(topic_word)

In [17]:
# Display the topic id -> list of terms mapping.
similar_by_cluster

defaultdict(list,
            {0: ['year',
              'show',
              'poll',
              'reuters',
              'european',
              'month',
              'currency',
              'union',
              'sterling',
              'leave'],
             1: ['yen',
              'dollar',
              'japanese',
              'safe',
              'currency',
              'haven',
              'japan',
              'low',
              'minister',
              'monday'],
             2: ['market',
              'primar',
              'referendum',
              'york',
              'theresa',
              'centrar',
              'border',
              'consecutive',
              'campaign',
              'sideline'],
             3: ['friday',
              'thursday',
              'wednesday',
              'tuesday',
              'monday',
              'saturday',
              'sunday',
              'prove',
              'road',
              'stor

In [26]:
# Attach each article's topic id as a new column.
# NOTE(review): pandas matches the values by index label, so this relies on df and
# doc_topics sharing the same row index -- confirm the two tables line up one-to-one.
df['post.topic'] = doc_topics.topic

In [27]:
# Preview the articles with the new 'post.topic' column.
df.head(n=5)

Unnamed: 0,Date,Timestamp,excerpt,link,page,post,title,post.topic
0,2018-11-27,02:19:00,The U.S. dollar gained on Tuesday after Federa...,https://uk.reuters.com/article/uk-global-forex...,1,NEW YORK (Reuters) - The U.S. dollar gained on...,Dollar gains as Fed's Clarida backs further ra...,27
1,2018-11-27,10:30:00,Sterling slumped against the dollar and the eu...,https://uk.reuters.com/article/uk-britain-ster...,1,LONDON (Reuters) - Sterling slumped against th...,Sterling slides with UK Brexit vote in doubt,19
2,2018-11-28,09:25:00,Sterling gave up most of its earlier gains and...,https://uk.reuters.com/article/uk-britain-ster...,1,LONDON (Reuters) - Sterling gave up most of it...,Sterling erases earlier gains after central ba...,4
3,2018-11-28,01:50:00,The dollar tumbled from two-week highs on Wedn...,https://uk.reuters.com/article/uk-global-forex...,1,NEW YORK (Reuters) - The dollar tumbled from t...,Dollar drops as Fed's Powell says rates near n...,27
4,2018-11-29,09:52:00,The pound fell towards a two-week low on Thurs...,https://uk.reuters.com/article/uk-britain-ster...,1,LONDON (Reuters) - The pound fell towards a tw...,Sterling heads towards two-week lows as Brexit...,19


In [24]:
#df.topic
#df = df.drop('term', 1)

In [61]:
# Scratch check of list append and indexing.
# Uses its own variable name: binding the name `list` here replaced the builtin,
# which breaks cells that call list(...) or defaultdict(list) when they are re-run.
scratch = []
scratch.append("d")
print(scratch)
scratch[0]

['d']


'd'

In [20]:
# For every article, look up the terms of its topic and keep them as one string.
# The topic ids live in the 'post.topic' column; df has no 'topic' column, so the
# previous `df.topic` raised AttributeError when the notebook was run top to bottom.
# NOTE(review): the name `list` shadows the builtin. It is kept only because the
# following cell assigns df["post.term"] from it; rename both together.
list = [str(similar_by_cluster[i]) for i in df['post.topic']]

In [28]:
# Attach the per-article term strings as a new column.
# `list` is the notebook variable filled in the previous cell, not the builtin type.
df["post.term"] = list

In [29]:
# Preview the articles with both 'post.topic' and 'post.term' attached.
df.head(n=5)

Unnamed: 0,Date,Timestamp,excerpt,link,page,post,title,post.topic,post.term
0,2018-11-27,02:19:00,The U.S. dollar gained on Tuesday after Federa...,https://uk.reuters.com/article/uk-global-forex...,1,NEW YORK (Reuters) - The U.S. dollar gained on...,Dollar gains as Fed's Clarida backs further ra...,27,"['dollar', 'u.s', 'rate', 'federal', 'reservar..."
1,2018-11-27,10:30:00,Sterling slumped against the dollar and the eu...,https://uk.reuters.com/article/uk-britain-ster...,1,LONDON (Reuters) - Sterling slumped against th...,Sterling slides with UK Brexit vote in doubt,19,"['bank', 'rate', 'sterling', 'england', 'inter..."
2,2018-11-28,09:25:00,Sterling gave up most of its earlier gains and...,https://uk.reuters.com/article/uk-britain-ster...,1,LONDON (Reuters) - Sterling gave up most of it...,Sterling erases earlier gains after central ba...,4,"['euro', 'zone', 'debt', 'crisis', 'european',..."
3,2018-11-28,01:50:00,The dollar tumbled from two-week highs on Wedn...,https://uk.reuters.com/article/uk-global-forex...,1,NEW YORK (Reuters) - The dollar tumbled from t...,Dollar drops as Fed's Powell says rates near n...,27,"['dollar', 'u.s', 'rate', 'federal', 'reservar..."
4,2018-11-29,09:52:00,The pound fell towards a two-week low on Thurs...,https://uk.reuters.com/article/uk-britain-ster...,1,LONDON (Reuters) - The pound fell towards a tw...,Sterling heads towards two-week lows as Brexit...,19,"['bank', 'rate', 'sterling', 'england', 'inter..."


In [30]:
# Save the enriched articles to CSV without writing the row index.
df.to_csv(path_or_buf='./data/reuters_post_topic_modeling.csv', index=False)

## Grab the output files from Sentiment Analysis

In [31]:
# Read the sentiment output as comma-separated text with no header row.
# NOTE(review): this rebinds `topic_terms`, which held the topic-modeling terms until now.
topic_terms = pd.read_csv(filepath_or_buffer='./data/post_sentiment.txt', header=None)

In [33]:
# Name the seven positional columns of the sentiment table.
topic_terms.columns = 'output line sentiment mixed negative neutral positive'.split()

In [35]:
# Drop the 'output' and 'line' columns in one call.
# `axis` is passed by keyword: pandas 2.0 removed the positional form drop(label, 1).
topic_terms = topic_terms.drop(['output', 'line'], axis=1)

In [164]:
#topic_terms = topic_terms.drop(range(10200,10203),0)

In [39]:
# Remove the key label that precedes the value in every column.
# The cells stay strings after this step (re.sub returns text).
_label_by_column = (
    ('sentiment', '"Sentiment":'),
    ('mixed', '"SentimentScore": {"Mixed":'),
    ('negative', '"Negative":'),
    ('neutral', '"Neutral":'),
    ('positive', '"Positive":'),
)
for column, label in _label_by_column:
    topic_terms[column] = topic_terms[column].apply(lambda x: re.sub(label, '', x))

In [41]:
# Remove the leftover punctuation: the quotes around the sentiment label
# and the closing braces after the last score.
for column, leftover in (('sentiment', '"'), ('positive', '}}')):
    topic_terms[column] = topic_terms[column].apply(lambda x: re.sub(leftover, '', x))

In [43]:
# Count the rows under each sentiment label.
topic_terms.groupby(by='sentiment').count()

Unnamed: 0_level_0,mixed,negative,neutral,positive
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MIXED,8,8,8,8
NEGATIVE,704,704,704,704
NEUTRAL,9436,9436,9436,9436
POSITIVE,52,52,52,52


In [44]:
# Preview the first 20 rows of the cleaned sentiment table.
topic_terms.head(n=20)

Unnamed: 0,sentiment,mixed,negative,neutral,positive
0,NEUTRAL,0.00354626448825,0.0067107952199876,0.9880415201187134,0.0017014214536175
1,NEUTRAL,0.0081893661990761,0.3202889561653137,0.666081964969635,0.00543964933604
2,NEUTRAL,0.0080856867134571,0.0642052516341209,0.921276032924652,0.0064329295419156
3,NEUTRAL,0.0006543741328641,0.0048144487664103,0.9928760528564452,0.0016551942098885
4,NEUTRAL,0.0038918505888432,0.0506839826703071,0.942244291305542,0.0031799226999282
5,NEUTRAL,0.0042532058432698,0.1269941627979278,0.8649519681930542,0.0038006799295544
6,NEUTRAL,0.0039678472094237,0.030410561710596,0.9502358436584472,0.0153857255354523
7,NEUTRAL,0.0002372775634285,0.0004061247454956,0.9981526732444764,0.0012039742432534
8,NEUTRAL,0.0005289752734825,0.0032367452513426,0.9956300258636476,0.0006042591412551
9,NEUTRAL,0.0053769932128489,0.0210483744740486,0.9595285058021544,0.01404610555619


## Note: the sentiment results are the same for excerpt and post!