In [24]:
#Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import nltk
from nltk.corpus.reader import ConllCorpusReader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy

In [25]:
# Load the tab-separated sentiment/topic test set and the CoNLL-2003 splits.
test_df = pd.read_csv('data/sentiment-topic-test.tsv', sep='\t')

# NOTE: despite the *_df names, these two are NLTK corpus readers, not DataFrames.
train_df = ConllCorpusReader(
    root='data/CONLL2003',
    fileids='train.txt',
    columntypes=['words', 'pos', 'ignore', 'chunk'],
)
valid_df = ConllCorpusReader(
    root='data/CONLL2003',
    fileids='valid.txt',
    columntypes=['words', 'pos', 'ignore', 'chunk'],
)

In [26]:
#Exploration
# Print the token sequence exposed by the training corpus reader.
print(train_df.words())

# Kept as the last expression of the cell so the notebook displays
# the first rows of the test set.
test_df.head()

['EU', 'rejects', 'German', 'call', 'to', 'boycott', ...]


Unnamed: 0,sentence id,text,sentiment,topic
0,0,I wouldn't be caught dead watching the NFL if ...,negative,sports
1,1,Chris O'Donnell stated that while filming for ...,neutral,movie
2,2,"The whole game was a rollercoaster ride, but L...",positive,sports
3,3,"Zendaya slayed in Dune 2, as she does in all h...",positive,movie
4,4,While my favorite player was playing this matc...,negative,sports


In [29]:
#Model Preparation
# Fetch the 'vader_lexicon' NLTK package before the analyzer is instantiated.
nltk.download('vader_lexicon')
# Module-level analyzer; run_vader calls vader_model.polarity_scores on it.
vader_model = SentimentIntensityAnalyzer()
# Module-level spaCy pipeline; run_vader uses it to split text into sentences and tokens.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [30]:
def run_vader(textual_unit,
              lemmatize=False,
              parts_of_speech_to_consider=None):
    """
    Score a piece of text with VADER after tokenizing it with spaCy.

    The text is parsed with the module-level ``nlp`` pipeline. The tokens of
    every sentence are optionally lemmatized and filtered by part of speech,
    then joined with single spaces and passed to the module-level
    ``vader_model``.

    :param str textual_unit: text to score, e.g. one or more sentences
        in a single string
    :param bool lemmatize: if True, pass lemmas to VADER instead of the
        surface forms (a token whose lemma is '-PRON-' keeps its text)
    :param set parts_of_speech_to_consider:
        - None or empty: tokens of every part of speech are kept
        - non-empty: only tokens whose ``pos_`` is in this collection are kept

    :rtype: dict
    :return: the dict returned by ``vader_model.polarity_scores``
    """
    def surface_form(tok):
        # '-PRON-' is presumably the placeholder lemma some spaCy versions
        # assign to pronouns; fall back to the raw text for those tokens.
        if lemmatize and tok.lemma_ != '-PRON-':
            return tok.lemma_
        return tok.text

    # A missing or empty filter means "keep every part of speech".
    keep_everything = not parts_of_speech_to_consider

    pieces = [
        surface_form(tok)
        for sent in nlp(textual_unit).sents
        for tok in sent
        if keep_everything or tok.pos_ in parts_of_speech_to_consider
    ]

    return vader_model.polarity_scores(' '.join(pieces))

In [31]:
#Begin VADER
def vader_output_to_label(vader_output):
    """
    Turn a VADER result dict into a sentiment label.

    Only the 'compound' entry is consulted, e.g. for
    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}
    the label is 'positive'.

    :param dict vader_output: output dict from vader; must contain 'compound'

    :rtype: str
    :return: 'positive' if compound > 0, 'neutral' if compound equals 0.0,
        'negative' if compound < 0
    """
    score = vader_output['compound']

    if score == 0.0:
        return 'neutral'
    if score > 0.0:
        return 'positive'
    if score < 0:
        return 'negative'
    # None of the comparisons held (e.g. NaN): falls through and
    # implicitly returns None.
    
# Sanity checks: one example per label, differing only in the compound score.
assert vader_output_to_label(dict(neg=0.0, neu=0.0, pos=1.0, compound=0.0)) == 'neutral'
assert vader_output_to_label(dict(neg=0.0, neu=0.0, pos=1.0, compound=0.01)) == 'positive'
assert vader_output_to_label(dict(neg=0.0, neu=0.0, pos=1.0, compound=-0.01)) == 'negative'