# Import necessary dependencies

In [6]:
import pandas as pd
import numpy as np
import model_evaluation_utils as meu
import pickle
import random
import os
import text_normalizer as tn
import nltk
!python -m spacy download en

# Download each NLTK resource in turn
for nltk_resource in ('sentiwordnet', 'wordnet', 'averaged_perceptron_tagger',
                      'punkt', 'stopwords', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Compact numpy printing: two decimals, 80-character lines
np.set_printoptions(linewidth=80, precision=2)

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |################################| 37.4MB 12.3MB/s ta 0:00:01

[93m    Linking successful[0m
    /home/tie-server/anaconda3/lib/python3.6/site-packages/en_core_web_sm
    --> /home/tie-server/anaconda3/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')

[nltk_data] Downloading package sentiwordnet to /home/tie-
[nltk_data]     server/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tie-
[nltk_data]     server/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tie-server/nltk_data...
[nltk_data]   Package aver

# Change path to dataset here

In [12]:
# Root folder of the Crunchbase dataset. The default is the original absolute
# location; set the CRUNCHBASE_DATA_DIR environment variable to point the
# notebook at a different copy without editing this cell.
basedir = os.environ.get(
    "CRUNCHBASE_DATA_DIR",
    os.path.join(os.sep, "media", "tie-server", "DATA", "Jens", "Crunchbase"))
# Folder that contains the 'temp_data' directory
# (assign os.getcwd() instead to work from the notebook's own directory).
tempdir = basedir

# Load and normalize data

In [14]:
# NOTE: pickle.load can execute arbitrary code stored in the file -- only load
# pickles that were produced by a trusted step of this project.
with open(os.path.join(tempdir, 'temp_data', 'sample_labelled_df'), 'rb') as fp:
    sample_labelled_df = pickle.load(fp)

test_articles = np.array(sample_labelled_df['paragraphs'])
test_sentiments = np.array(sample_labelled_df['label'])

# extract data for model evaluation
# range(len(...)) lets every row, including the last one, be drawn;
# min() keeps datasets with fewer than three rows from raising ValueError.
random.seed(13)
sample_article_ids = random.sample(range(len(sample_labelled_df)),
                                   min(3, len(sample_labelled_df)))

# normalize dataset
norm_test_articles = tn.normalize_corpus(test_articles)

# Sentiment Analysis with AFINN

In [15]:
from afinn import Afinn

# Shared AFINN scorer used by the cells below; emoticons=True turns on the
# library's emoticon option.
afn = Afinn(emoticons=True) 

## Predict sentiment for sample articles

In [16]:
# Print text, gold label and AFINN polarity score for each sampled article
for sample_id in sample_article_ids:
    article, sentiment = test_articles[sample_id], test_sentiments[sample_id]
    print('Article:', article)
    print('Actual Sentiment:', sentiment)
    print('Predicted Sentiment polarity:', afn.score(article))
    print('-' * 60)

Article:  alexandria, va: united states patent and trademark office has received an application no. 20170281686 for us patent, published on october 5, 2017, by stembiosys, inc. (texas), titled as "bone marrow stromal cell derived extracellular matrix protein extract and uses thereof"  for the registration of patent.

 stembiosys, inc. (texas) applies for us patent titled as "bone marrow stromal cell derived extracellular matrix protein extract and uses thereof"
Actual Sentiment: neutral
Predicted Sentiment polarity: 0.0
------------------------------------------------------------
Article: fashion and apparel, a $110-130 billion market, offers 30-40% margins but it is crowded with the likes of the flipkart-jabong-myntra combine, amazon and more than 800 other players such as limeroad and voonik jostling for space. snapdeal will spend $100 million to build this category, says bahl.\x93to build any of these we could even look at acquiring capabilities,\x93 says bahl. the acquisition in ma

## Predict sentiment for test dataset

In [17]:
# AFINN polarity score for every article, in dataset order
sentiment_polarity = list(map(afn.score, test_articles))
# Binarise: a score of 1.0 or more is 'positive', anything else 'negative'
predicted_sentiments = list(map(
    lambda score: 'positive' if score >= 1.0 else 'negative',
    sentiment_polarity))

## Evaluate model performance

In [18]:
# Report metrics of the AFINN predictions against the gold labels.
# NOTE(review): the gold labels also contain 'neutral' (see the sample output
# above), which this positive/negative prediction can never match -- confirm
# that counting those rows as errors is intended before reading the accuracy.
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predicted_sentiments, 
                                  classes=['positive', 'negative'])

Model Performance metrics:
------------------------------
Accuracy: 0.0612
Precision: 0.0039
Recall: 0.0612
F1 Score: 0.0073

Model Classification report:
------------------------------
             precision    recall  f1-score   support

   positive       0.07      1.00      0.13         5
   negative       0.04      1.00      0.08         1

avg / total       0.06      1.00      0.12         6


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive          5        0
        negative          0        1


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Sentiment Analysis with SentiWordNet

In [19]:
from nltk.corpus import sentiwordnet as swn

# First adjective senti-synset of "awesome" and its three scores
awesome = list(swn.senti_synsets('awesome', 'a'))[0]
for score_label, score_value in (('Positive Polarity Score:', awesome.pos_score()),
                                 ('Negative Polarity Score:', awesome.neg_score()),
                                 ('Objective Score:', awesome.obj_score())):
    print(score_label, score_value)

Positive Polarity Score: 0.875
Negative Polarity Score: 0.125
Objective Score: 0.0


## Build model

In [20]:
def analyze_sentiment_sentiwordnet_lexicon(article,
                                           verbose=False):
    """Classify an article as 'positive' or 'negative' with SentiWordNet.

    The text is run through ``tn.nlp`` and each token's ``text`` / ``tag_``
    pair is inspected. Tokens whose tag contains 'NN', 'VB', 'JJ' or 'RB' are
    looked up in SentiWordNet as noun, verb, adjective or adverb respectively,
    and the scores of the first senti-synset found are accumulated. The
    overall score is (sum of positive - sum of negative) divided by the number
    of tokens that had a senti-synset, rounded to two decimals.

    Parameters
    ----------
    article : str
        Text to analyse.
    verbose : bool, optional
        When True, print a one-row table with the predicted sentiment and the
        normalised objectivity, positive, negative and overall scores.

    Returns
    -------
    str
        'positive' when the normalised overall score is >= 0, else 'negative'.
        A text in which no token has a senti-synset (e.g. an empty string)
        scores 0.0 and is therefore reported as 'positive' instead of raising
        ZeroDivisionError.
    """
    # (tag fragment, WordNet POS) pairs, tried in this order for every token
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    # tokenize and POS tag text tokens
    tagged_text = [(token.text, token.tag_) for token in tn.nlp(article)]
    pos_score = neg_score = obj_score = 0.0
    token_count = 0
    # get wordnet synsets based on POS tags
    # get sentiment scores if synsets are found
    for word, tag in tagged_text:
        ss_set = None
        for tag_fragment, wordnet_pos in tag_to_wordnet_pos:
            if tag_fragment not in tag:
                continue
            # a single lookup per candidate POS; keep only the first synset
            synsets = list(swn.senti_synsets(word, wordnet_pos))
            if synsets:
                ss_set = synsets[0]
                break
        # if senti-synset is found
        if ss_set is not None:
            # add scores for all found synsets
            pos_score += ss_set.pos_score()
            neg_score += ss_set.neg_score()
            obj_score += ss_set.obj_score()
            token_count += 1

    def normalise(total):
        # average over the scored tokens; 0.0 when nothing could be scored
        return round(float(total) / token_count, 2) if token_count else 0.0

    # aggregate final scores
    norm_final_score = normalise(pos_score - neg_score)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'
    if verbose:
        norm_obj_score = normalise(obj_score)
        norm_pos_score = normalise(pos_score)
        norm_neg_score = normalise(neg_score)
        # to display results in a nice table; from_product builds the two-level
        # header without the `labels=` keyword that newer pandas no longer accepts
        sentiment_frame = pd.DataFrame(
            [[final_sentiment, norm_obj_score, norm_pos_score,
              norm_neg_score, norm_final_score]],
            columns=pd.MultiIndex.from_product(
                [['SENTIMENT STATS:'],
                 ['Predicted Sentiment', 'Objectivity',
                  'Positive', 'Negative', 'Overall']]))
        print(sentiment_frame)

    return final_sentiment

## Predict sentiment for sample articles

In [21]:
# Print text, gold label and the detailed SentiWordNet table for each sampled article
for sample_id in sample_article_ids:
    article, sentiment = test_articles[sample_id], test_sentiments[sample_id]
    print('article:', article)
    print('Actual Sentiment:', sentiment)
    pred = analyze_sentiment_sentiwordnet_lexicon(article, verbose=True)
    print('-' * 60)

article:  alexandria, va: united states patent and trademark office has received an application no. 20170281686 for us patent, published on october 5, 2017, by stembiosys, inc. (texas), titled as "bone marrow stromal cell derived extracellular matrix protein extract and uses thereof"  for the registration of patent.

 stembiosys, inc. (texas) applies for us patent titled as "bone marrow stromal cell derived extracellular matrix protein extract and uses thereof"
Actual Sentiment: neutral
     SENTIMENT STATS:                                      
  Predicted Sentiment Objectivity Positive Negative Overall
0            negative        0.97     0.01     0.02   -0.01
------------------------------------------------------------
article: fashion and apparel, a $110-130 billion market, offers 30-40% margins but it is crowded with the likes of the flipkart-jabong-myntra combine, amazon and more than 800 other players such as limeroad and voonik jostling for space. snapdeal will spend $100 mill

## Predict sentiment for test dataset

In [22]:
# Label every normalized article with the SentiWordNet model (no per-article table)
predicted_sentiments = list(map(
    lambda norm_article: analyze_sentiment_sentiwordnet_lexicon(norm_article, verbose=False),
    norm_test_articles))

## Evaluate model performance

In [23]:
# Report metrics of the SentiWordNet predictions against the gold labels.
# NOTE(review): the gold labels also contain 'neutral' (see the sample output
# above), which this positive/negative prediction can never match -- confirm
# that counting those rows as errors is intended before reading the accuracy.
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predicted_sentiments, 
                                  classes=['positive', 'negative'])

Model Performance metrics:
------------------------------
Accuracy: 0.0612
Precision: 0.0037
Recall: 0.0612
F1 Score: 0.0071

Model Classification report:
------------------------------
             precision    recall  f1-score   support

   positive       0.06      1.00      0.12         5
   negative       0.06      1.00      0.11         1

avg / total       0.06      1.00      0.12         6


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive          5        0
        negative          0        1


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Sentiment Analysis with VADER

In [24]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer



## Build model

In [25]:
_VADER_ANALYZER = None  # shared SentimentIntensityAnalyzer, created on first use


def _get_vader_analyzer():
    """Return the shared VADER analyzer, building it only once."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def _format_percent(fraction):
    """Render a 0-1 fraction as a one-decimal percentage string, e.g. 0.07 -> '7.0%'."""
    # format after scaling so float artefacts (7.000000000000001) never show up
    return '{:.1f}%'.format(round(fraction, 2) * 100)


def analyze_sentiment_vader_lexicon(article, 
                                    threshold=0.1,
                                    verbose=False):
    """Classify an article as 'positive' or 'negative' with the VADER lexicon.

    The text is passed through ``tn.strip_html_tags``,
    ``tn.remove_accented_chars`` and ``tn.expand_contractions`` and then scored
    with a ``SentimentIntensityAnalyzer`` that is shared between calls.

    Parameters
    ----------
    article : str
        Text to analyse.
    threshold : float, optional
        Minimum 'compound' score for the article to count as positive.
    verbose : bool, optional
        When True, print a one-row table with the predicted sentiment, the
        rounded compound score and the 'pos' / 'neg' / 'neu' scores shown as
        percentages.

    Returns
    -------
    str
        'positive' when the compound score is >= ``threshold``, else 'negative'.
    """
    # pre-process text
    article = tn.strip_html_tags(article)
    article = tn.remove_accented_chars(article)
    article = tn.expand_contractions(article)

    # analyze the sentiment for article (analyzer is reused across calls)
    scores = _get_vader_analyzer().polarity_scores(article)
    # get aggregate scores and final sentiment
    agg_score = scores['compound']
    final_sentiment = 'positive' if agg_score >= threshold else 'negative'
    if verbose:
        # display detailed sentiment statistics
        final = round(agg_score, 2)
        positive = _format_percent(scores['pos'])
        negative = _format_percent(scores['neg'])
        neutral = _format_percent(scores['neu'])
        # from_product builds the two-level header without the `labels=`
        # keyword that newer pandas no longer accepts
        sentiment_frame = pd.DataFrame(
            [[final_sentiment, final, positive, negative, neutral]],
            columns=pd.MultiIndex.from_product(
                [['SENTIMENT STATS:'],
                 ['Predicted Sentiment', 'Polarity Score',
                  'Positive', 'Negative', 'Neutral']]))
        print(sentiment_frame)

    return final_sentiment

## Predict sentiment for sample articles

In [26]:
# Print text, gold label and the detailed VADER table for each sampled article
for sample_id in sample_article_ids:
    article, sentiment = test_articles[sample_id], test_sentiments[sample_id]
    print('article:', article)
    print('Actual Sentiment:', sentiment)
    pred = analyze_sentiment_vader_lexicon(article, threshold=0.4, verbose=True)
    print('-' * 60)

article:  alexandria, va: united states patent and trademark office has received an application no. 20170281686 for us patent, published on october 5, 2017, by stembiosys, inc. (texas), titled as "bone marrow stromal cell derived extracellular matrix protein extract and uses thereof"  for the registration of patent.

 stembiosys, inc. (texas) applies for us patent titled as "bone marrow stromal cell derived extracellular matrix protein extract and uses thereof"
Actual Sentiment: neutral
     SENTIMENT STATS:                                         
  Predicted Sentiment Polarity Score Positive Negative Neutral
0            negative           0.15     4.0%     3.0%   93.0%
------------------------------------------------------------
article: fashion and apparel, a $110-130 billion market, offers 30-40% margins but it is crowded with the likes of the flipkart-jabong-myntra combine, amazon and more than 800 other players such as limeroad and voonik jostling for space. snapdeal will spend 

## Predict sentiment for test dataset

In [27]:
# Label every raw article with the VADER model at a 0.4 compound-score threshold
predicted_sentiments = list(map(
    lambda raw_article: analyze_sentiment_vader_lexicon(raw_article, threshold=0.4, verbose=False),
    test_articles))

## Evaluate model performance

In [28]:
# Report metrics of the VADER predictions against the gold labels.
# NOTE(review): the gold labels also contain 'neutral' (see the sample output
# above), which this positive/negative prediction can never match -- confirm
# that counting those rows as errors is intended before reading the accuracy.
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predicted_sentiments, 
                                  classes=['positive', 'negative'])

Model Performance metrics:
------------------------------
Accuracy: 0.0612
Precision: 0.0043
Recall: 0.0612
F1 Score: 0.0081

Model Classification report:
------------------------------
             precision    recall  f1-score   support

   positive       0.08      1.00      0.15         5
   negative       0.03      1.00      0.06         1

avg / total       0.07      1.00      0.13         6


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive          5        0
        negative          0        1


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
