# Using AWS Comprehend
Amazon Comprehend is AWS's natural-language text analysis service. In this notebook, you'll learn how to call Comprehend from within SageMaker Studio. We will also look at how to apply Comprehend to data that we already have saved in S3.

In [None]:
import pandas as pd
from collections import OrderedDict
import requests

### Get the Comprehend client

In [None]:
# boto3 is not imported by the first cell of this notebook, so import it here;
# without it the line below raises NameError when cells are run top to bottom.
import boto3

# Comprehend client bound to us-east-1; the next code cell calls its detect_* methods.
comprehend = boto3.client('comprehend', region_name='us-east-1')

In [None]:
### Call the Comprehend methods

In [None]:
# Text to analyse with each of the three Comprehend calls below.
sample_tweet = "Be of good cheer. Take life one step at a time and do the best you can each day. Life passes so swiftly. --President Nelson"

# Run key-phrase, entity and sentiment detection on the same English text.
phrases = comprehend.detect_key_phrases(Text=sample_tweet, LanguageCode='en')
entities = comprehend.detect_entities(Text=sample_tweet, LanguageCode='en')
sentiments = comprehend.detect_sentiment(Text=sample_tweet, LanguageCode='en')

# One key phrase per line.
print('------- phrases ---------')
for key_phrase in phrases['KeyPhrases']:
    print(key_phrase['Text'])

# One entity per line, followed by its entity type.
print('------- entity : entity type ---------')
for entity in entities['Entities']:
    print(f"{entity['Text']} : {entity['Type']}")

# Overall sentiment label for the whole text.
print('------- sentiment ---------')
print(sentiments['Sentiment'])

### Import CSV from web + Comprehend

In [None]:
import pandas as pd
import boto3 

# Comprehend client used for every sentiment request in this cell.
comprehend = boto3.client('comprehend', region_name='us-east-1')

# Download the sample tweets and keep only the first 10 rows, so the loop
# below makes at most 10 Comprehend requests.
df = pd.read_csv("https://www.ishelp.info/data/tweets_aws.csv")
df = df.head(10)

# Column holding the text to analyse, and the prefix for the result columns.
InputFieldName = "text"
OutputFieldName_prefix = "Sentiment"

# Call Comprehend once per row and write the result back into the DataFrame.
for index, text in df[InputFieldName].items():
    # A blank CSV cell is read as NaN (a float) and Comprehend requires
    # non-empty text, so skip such rows instead of failing mid-loop; their
    # sentiment columns are simply left unset.
    if not isinstance(text, str) or not text:
        continue

    res = comprehend.detect_sentiment(Text=text, LanguageCode='en')
    scores = res["SentimentScore"]

    # Overall label, then one column per individual score.
    df.loc[index, f'{OutputFieldName_prefix}_Overall'] = res['Sentiment']
    for label in ("Positive", "Negative", "Neutral", "Mixed"):
        df.loc[index, f'{OutputFieldName_prefix}_{label}'] = scores[label]

In [None]:
# Display the DataFrame, which now includes the Sentiment_* columns written by the previous cell.
df

### Use the Twitter API + Comprehend

In [None]:
import pandas as pd
from collections import OrderedDict
import requests
import boto3 

# Comprehend client in us-east-1; the per-tweet sentiment loop further down calls it.
comprehend = boto3.client('comprehend', region_name='us-east-1')

In [None]:
import os

# Twitter API credentials.
# Each value is read from an environment variable so that real keys never have
# to be saved in (or committed with) the notebook. When a variable is not set
# the value falls back to '', so you can still paste a key in place of the ''
# default if you prefer.
api_key = os.environ.get('TWITTER_API_KEY', '')
api_secret = os.environ.get('TWITTER_API_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '')

In [None]:
# Install tweepy with the %pip line magic, which runs pip for the Python
# environment of the running kernel (a bash subprocess may pick up a different pip).
%pip install tweepy

In [None]:
import tweepy
# Build the tweepy API client from the four credentials defined in the earlier cell:
# api_key/api_secret go to the handler, access_token/access_secret are set on it afterwards.
auth = tweepy.OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)

In [None]:
# Hashtag to search for.
tag = '#cciv'

# tweepy 4.0 renamed API.search to API.search_tweets. Use whichever method the
# installed version provides so this cell runs on both old and new releases.
search_tweets = getattr(api, 'search_tweets', None) or api.search

# Ask for up to 100 tweets matching the hashtag.
tweets = search_tweets(q=tag, count=100)

In [None]:
# Parallel lists with one entry per tweet; the lists always stay the same length.
posts = []
timestamp = []
locations = []
sentiments = []
positive = []
negative = []
neutral = []

for tweet in tweets:
    text = tweet.text

    if text:
        # Score the tweet text with Comprehend.
        res = comprehend.detect_sentiment(Text=text, LanguageCode='en')
        scores = res['SentimentScore']
        label = res['Sentiment']
        pos_score = scores['Positive']
        neg_score = scores['Negative']
        neu_score = scores['Neutral']
    else:
        # No text to analyse: record explicit missing values. Without this
        # branch the appends below would reuse the previous tweet's scores
        # (or raise NameError if the very first tweet is empty).
        label = pos_score = neg_score = neu_score = None

    timestamp.append(tweet.created_at)
    posts.append(text)
    locations.append(tweet.user.location)
    sentiments.append(label)
    positive.append(pos_score)
    negative.append(neg_score)
    neutral.append(neu_score)

In [None]:
import pandas as pd
from collections import OrderedDict

# Assemble the per-tweet lists into one table with one row per tweet.
result = pd.DataFrame(OrderedDict({
    'tweets': posts,
    # Wrap long location strings at 15 characters. dtype="object" keeps the
    # .str accessor usable when `locations` is empty: older pandas versions
    # infer float64 for an empty Series, which has no .str accessor.
    'location': pd.Series(locations, dtype="object").str.wrap(15),
    'timestamp': timestamp,
    'sentiment': sentiments,
    'positiveScore': positive,
    'negativeScore': negative,
    'neutralScore': neutral,
}))

In [None]:
# Display the assembled table of tweets, locations, timestamps and sentiment scores.
result

In [None]:
# Number of tweets per user location, most frequent location first.
(
    result
    .groupby(by='location', sort=True)['tweets']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Number of tweets per sentiment label, most frequent label first.
(
    result
    .groupby(by='sentiment', sort=True)['tweets']
    .count()
    .sort_values(ascending=False)
)