<a href="https://colab.research.google.com/github/sumukshashidhar/toreda/blob/master/src/sentiment%20analysis/Sentiment%20Analysis%20Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis Beta 

First, install the dependencies

In [1]:
!pip install vaderSentiment
!pip install newspaper3k

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |██▋                             | 10kB 16.0MB/s eta 0:00:01[K     |█████▏                          | 20kB 6.6MB/s eta 0:00:01[K     |███████▉                        | 30kB 6.7MB/s eta 0:00:01[K     |██████████▍                     | 40kB 8.5MB/s eta 0:00:01[K     |█████████████                   | 51kB 6.9MB/s eta 0:00:01[K     |███████████████▋                | 61kB 7.3MB/s eta 0:00:01[K     |██████████████████▏             | 71kB 7.7MB/s eta 0:00:01[K     |████████████████████▉           | 81kB 8.6MB/s eta 0:00:01[K     |███████████████████████▍        | 92kB 9.0MB/s eta 0:00:01[K     |██████████████████████████      | 102kB 9.5MB/s eta 0:00:01[K     |████████████████████████████▋   | 112kB 9.5MB/s eta 0:00:01[K     |███████████████████████████████▏| 12

Imports

In [0]:
import sys
import json
import time
import re
import requests
import nltk
import argparse
import logging
import string
import plotly.express as px
try:
    import urllib.parse as urlparse
except ImportError:
    import urlparse
from tweepy.streaming import StreamListener
from tweepy import API, Stream, OAuthHandler, TweepError
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup

from random import randint, randrange
from datetime import datetime
from newspaper import Article, ArticleException

Defining some constants here, such as the sentiment url and list of tweet_ids that we wish to use for analysis

In [0]:
# Endpoint of the web sentiment API that get_sentiment_from_url() posts text to.
sentimentURL = 'http://text-processing.com/api/sentiment/'

# Logger for this notebook. The helper functions report request failures and
# debug output through it, so it must exist before any of them is called.
logger = logging.getLogger(__name__)

# tweet id list
tweet_ids = []

# Wall-clock timestamp (seconds since the epoch) taken when this cell runs.
prev_time = time.time()
# Three float accumulators, initialised to zero.
# NOTE(review): presumably running averages of three sentiment scores -- confirm against the code that updates them.
sentiment_avg = [0.0, 0.0, 0.0]

This code lets us receive sentiment analysis data from the API specified above

In [0]:
def get_sentiment_from_url(text, sentimentURL, timeout=30):
    """Ask the web sentiment API at ``sentimentURL`` to classify ``text``.

    Args:
        text: The text to analyse; sent as the ``text`` form field.
        sentimentURL: URL of the sentiment endpoint to POST to.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        ``(sentiment, neg, pos, neu)`` where ``sentiment`` is one of
        ``"negative"``, ``"neutral"`` or ``"positive"`` and the other three
        values are the probabilities reported by the API.
        ``None`` when the server answers with a non-200 status (for example
        when requests are being throttled).

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...). The error is logged first.
    """
    log = logging.getLogger(__name__)
    payload = {'text': text}

    try:
        post = requests.post(sentimentURL, data=payload, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        log.error("Exception: requests exception getting sentiment from url caused by %s", exc)
        raise

    # return None if we are getting throttled or other connection problem
    if post.status_code != 200:
        log.warning("Can't get sentiment from url caused by %s %s", post.status_code, post.text)
        return None

    response = post.json()

    probability = response['probability']
    neg = probability['neg']
    pos = probability['pos']
    neu = probability['neutral']

    # Map the API label to a readable sentiment; any label other than
    # "neg" or "neutral" is treated as positive.
    readable = {"neg": "negative", "neutral": "neutral"}
    sentiment = readable.get(response['label'], "positive")

    return sentiment, neg, pos, neu

Test case

In [5]:
# Smoke test: posts the sample sentence to the sentiment API and displays the
# returned (sentiment, neg, pos, neu) tuple.
get_sentiment_from_url("I hate you", sentimentURL)

('negative', 0.7400333317066653, 0.2599666682933347, 0.043876766913714414)

Removing URLs, HTML entities and tags, retweet markers and other non-essential parts of the text

In [0]:
def clean_text(text):
    """Strip markup and noise from a piece of text.

    Newlines become spaces; URLs, HTML entities, HTML tags, the retweet
    marker ``RT`` and the ellipsis character are removed; leading and
    trailing whitespace is trimmed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. Whitespace left behind by removed tokens in the
        middle of the string is not collapsed.
    """
    text = text.replace("\n", " ")
    text = re.sub(r"https?\S+", "", text)
    # Only remove real entities such as "&amp;" or "&#39;". A looser pattern
    # would eat ordinary text between any "&" and the next ";".
    text = re.sub(r"&#?\w+;", "", text)
    text = re.sub(r"<.*?>", "", text)
    # Remove "RT" only as a standalone token so that words containing it
    # (e.g. "ALERT", "START") are left intact.
    text = re.sub(r"\bRT\b", "", text)
    text = text.replace(u"…", "")
    return text.strip()

Some more cleaning: strip hashtags and @mentions before running sentiment analysis

In [0]:
def clean_text_sentiment(text):
    """Remove hashtags and @mentions so they do not skew sentiment scoring.

    Args:
        text: Text to prepare for sentiment analysis.

    Returns:
        ``text`` without any whitespace-delimited run that starts at ``#``
        or ``@``, trimmed of leading and trailing whitespace.
    """
    # "|" is a literal inside a character class, so it must not be listed
    # here; otherwise tokens starting with a pipe would be deleted too.
    text = re.sub(r"[#@]\S+", "", text)
    return text.strip()


Main Analysis Code

In [0]:
def _combine_sentiment(tb_polarity, vader_compound, sentiment_url):
    """Merge the individual verdicts into one label.

    A text is "negative" (or "positive") only when TextBlob and VADER agree
    and, if a web verdict is available, the web verdict agrees too.
    Everything else is "neutral".
    """
    # With no web verdict (None) the decision rests on TextBlob and VADER alone.
    url_allows_negative = sentiment_url in (None, "negative")
    url_allows_positive = sentiment_url in (None, "positive")

    # threshold values
    if tb_polarity < 0 and vader_compound <= -0.05 and url_allows_negative:
        return "negative"
    if tb_polarity > 0 and vader_compound >= 0.05 and url_allows_positive:
        return "positive"
    return "neutral"


def sentiment_analysis(text, use_sentiment_url=True):
    """Determine if sentiment is positive, negative, or neutral.

    Combines the sentiment polarity from TextBlob, the VADER compound score
    and (optionally) the label returned by the text-processing URL. Prints a
    summary of all scores and shows a bar chart of the VADER breakdown.

    Args:
        text: The text to analyse.
        use_sentiment_url: When true (the default), also query the web
            sentiment API through ``get_sentiment_from_url``. When false,
            only TextBlob and VADER are used.

    Returns:
        ``(polarity, subjectivity, sentiment)`` where ``polarity`` is the
        mean of the TextBlob polarity and the VADER compound score,
        ``subjectivity`` comes from TextBlob and ``sentiment`` is
        ``"negative"``, ``"neutral"`` or ``"positive"``.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``get_sentiment_from_url`` when the web request fails.
    """
    # pass text into sentiment url; None means "no web verdict available"
    sentiment_url = None
    if use_sentiment_url:
        ret = get_sentiment_from_url(text, sentimentURL)
        if ret is not None:
            sentiment_url = ret[0]

    # pass text into TextBlob
    text_tb = TextBlob(text)
    tb_sentiment = text_tb.sentiment

    # pass text into VADER Sentiment
    analyzer = SentimentIntensityAnalyzer()
    text_vs = analyzer.polarity_scores(text)

    # determine sentiment from our sources
    sentiment = _combine_sentiment(tb_sentiment.polarity, text_vs['compound'], sentiment_url)

    polarity = (tb_sentiment.polarity + text_vs['compound']) / 2

    # output sentiment polarity
    print("************")
    print("Sentiment Polarity: " + str(round(polarity, 3)))

    # output sentiment subjectivity (TextBlob)
    print("Sentiment Subjectivity: " + str(round(tb_sentiment.subjectivity, 3)))
    # output sentiment
    print("Sentiment (url): " + str(sentiment_url))
    print("Sentiment (algorithm): " + str(sentiment))
    print("Overall sentiment (textblob): ", tb_sentiment)
    print("Overall sentiment (vader): ", text_vs)
    print("sentence was rated as ", round(text_vs['neg']*100, 3), "% Negative")
    print("sentence was rated as ", round(text_vs['neu']*100, 3), "% Neutral")
    print("sentence was rated as ", round(text_vs['pos']*100, 3), "% Positive")
    print("************")

    # Bar chart of the VADER breakdown, labelled so it can be read on its own.
    fig = px.bar(
        x=['Negative', 'Positive', 'Neutral'],
        y=[text_vs['neg']*100, text_vs['pos']*100, text_vs['neu']*100],
        labels={'x': 'Sentiment', 'y': 'Share of text (%)'},
        title='VADER sentiment breakdown',
        width=500,
    )
    fig.show()
    return polarity, tb_sentiment.subjectivity, sentiment

In [32]:
# Example run: prints the score summary, shows the bar chart and returns
# (polarity, subjectivity, sentiment) for the sample sentence.
sentiment_analysis("Sumuk Shashidhar is the best in tHIS world")

************
Sentiment Polarity: 0.818
Sentiment Subjectivity: 0.3
Sentiment (url): neutral
Sentiment (algorithm): neutral
Overall sentiment (textblob):  Sentiment(polarity=1.0, subjectivity=0.3)
Overall sentiment (vader):  {'neg': 0.0, 'neu': 0.625, 'pos': 0.375, 'compound': 0.6369}
sentence was rated as  0.0 % Negative
sentence was rated as  62.5 % Neutral
sentence was rated as  37.5 % Positive
************


(0.81845, 0.3, 'neutral')

Under Development

In [0]:
def get_page_text(url, max_paragraphs=10, timeout=30):
    """Yield the text of the leading ``<p>`` elements of a web page.

    This is a generator: the page is only fetched once iteration starts.

    Args:
        url: Address of the page to download.
        max_paragraphs: How many ``<p>`` elements, counted from the top of
            the page, are inspected. Elements without a single text string
            count towards the limit but are not yielded.
        timeout: Seconds to wait for the server before giving up.

    Yields:
        The ``.string`` of each inspected paragraph that has one.

    Note:
        A failed request is logged as a warning and ends the generator
        without yielding anything; it does not raise.
    """
    log = logging.getLogger(__name__)

    try:
        log.debug(url)
        req = requests.get(url, timeout=timeout)
        html = req.text
    except requests.exceptions.RequestException as exc:
        log.warning("Exception: can't crawl web site (%s)", exc)
        return

    soup = BeautifulSoup(html, 'html.parser')
    html_p = soup.find_all('p')

    log.debug(html_p)

    # Only the first `max_paragraphs` elements are considered; slicing stops
    # the walk there instead of iterating over the rest of the page.
    for paragraph in html_p[:max_paragraphs]:
        # Skip paragraphs for which BeautifulSoup reports no single string.
        if paragraph.string is not None:
            log.debug(paragraph.string)
            yield paragraph.string