# Exploring the use of spaCy in NLP, general and aspect-based sentiment analysis  

#### Includes data collection, cleaning, and EDA

<i>Note: to make this notebook easier to read and follow for non-NLP practitioners, NLP/SA/spaCy-specific terminology is tagged as [word] where needed.</i>

### Third-party package imports
<b>spaCy</b> provides all the relevant NLP procedures, including several trained models, built-in tokenization, annotations, etc.

<b>Beautiful Soup 4</b> provides web scraping functionality

In [1]:
%time

import re
import requests

import nltk
import spacy
from spacy import displacy
from bs4 import BeautifulSoup as bs
from nltk.corpus import opinion_lexicon

# nltk.download('opinion_lexicon')

# load english small/medium model - models must be installed separately from the base spacy package
# python -m spacy download en_core_web_sm --user   (use en_core_web_md for the model this notebook loads)


CPU times: total: 0 ns
Wall time: 0 ns


#### We expose a utility function to explain what some spaCy terms mean

In [2]:
def explain(term):
    """Return spaCy's description of a tag or label, as given by ``spacy.explain``."""
    return spacy.explain(term)

In [3]:
# Example: look up what the dependency label 'dobj' stands for.
explain('dobj')

'direct object'

In [4]:


# Load the pipeline by package name rather than by an absolute site-packages
# path, so the notebook does not depend on one machine's directory layout.
# The model package must already be installed in the environment.
nlp = spacy.load('en_core_web_md')


In [5]:
# removable - identify different labels available in pos and dep stages
# labels = nlp.get_pipe("parser").labels 
# labels = nlp.get_pipe("tagger").labels
# len(labels)


In [6]:
# scrape web data from sample news article
link = "https://www.channelnewsasia.com/singapore/hiv-risk-transmission-man-did-not-inform-sexual-partner-jail-2732376"

resp = requests.get(link)
assert resp.status_code == 200

AssertionError: 

In [None]:
# get only text in the <p> tags
# The parser is named explicitly: otherwise BeautifulSoup falls back to
# whichever parser happens to be installed, so the extracted paragraph list
# could differ from one environment to another.
news_text = [paragraph.getText() for paragraph in bs(resp.content, 'html.parser').find_all('p')]

In [None]:
# manually filter news website boilerplate, legal disclaimers, etc.
# NOTE(review): the 3 / -5 offsets are hand-picked for this page; re-check them
# if the article or the site template changes.
non_boilerplate_text = news_text[3:-5]

# create spacy DOC object
# Paragraphs are stripped and joined with a single space. Joining with '' would
# fuse the last word of one paragraph to the first word of the next whenever a
# paragraph carries no trailing whitespace.
doc = nlp(' '.join(paragraph.strip() for paragraph in non_boilerplate_text))

## Analysis of textual data in this article

In [None]:
# Count the sentences by consuming the doc.sents iterator once.
print(f"Number of sentences in this news article: {sum(1 for _ in doc.sents)}")

Number of sentences in this news article: 27


In [None]:
# NOTE(review): this assertion always fails, so "Run All" stops at this cell
# and nothing below it executes. Presumably a deliberate stop marker - remove
# it once the cells below are meant to run end-to-end.
assert False, ""

AssertionError: 

### Token-wise analysis

#### POS (Part of Speech) tagging
Using [POS] tags, we can determine if an individual [token] is a noun, adjective, etc.

Skimming through the available UPOS tags, the following seem to be the most important for sentiment analysis:

| POS tag | full name | examples       |
|---------|-----------|----------------|
| ADJ     | adjective | enormous, fast |
| ADV     | adverb    | very, exactly  |
| VERB    | verb      | eat, running   |



In [None]:
# Gather the VERB-tagged tokens through a set, then materialise them as a list.
# NOTE(review): the elements are token objects rather than strings, so the set
# presumably does not merge repeated words - confirm whether unique verb texts
# were intended.
verbs = list({token for token in doc if token.pos_ == 'VERB'})

In [None]:
def extract_description(doc):
    """Collect every adjective in ``doc``, prefixed by its adverb children.

    Args:
        doc: Iterable of tokens exposing ``pos_``, ``text`` and ``children``
            (for example a spaCy ``Doc`` or ``Span``).

    Returns:
        list[str]: One entry per token whose ``pos_`` is ``'ADJ'``, in
        iteration order. When the adjective has children tagged ``'ADV'``
        their texts are placed in front of it, separated by single spaces
        (e.g. ``'very good'``); otherwise the entry is the adjective's text.
    """
    output = []
    for token in doc:
        if token.pos_ != 'ADJ':
            continue
        # Join the adverbs' *text*: str.join() raises TypeError when handed
        # token objects instead of strings.
        prepend = ' '.join(child.text for child in token.children if child.pos_ == 'ADV')
        output.append(f"{prepend} {token.text}" if prepend else token.text)
    return output

In [None]:
# extract_description(doc)[:10]

In [None]:
def extract_aspect_descriptions(doc):
    """Pair noun subjects with the most recently seen adjective phrase.

    The tokens are scanned as one stream. The current subject is the text of
    the latest token that is both ``nsubj`` and ``NOUN``; the current
    description is the latest ``ADJ`` token, prefixed by the texts of its
    ``ADV`` children. After every token, once either value is non-empty, the
    entry for the current subject is (re)written with the current description,
    so each subject ends up with the last description seen while it was
    current. Neither value is reset during the scan.

    Returns:
        dict: ``{subject_text: {"description": phrase}}``. The key ``''``
        holds descriptions seen before any subject was found.
    """
    result = {}
    current_subject = ''
    current_description = ''
    for tok in doc:
        if tok.dep_ == 'nsubj' and tok.pos_ == 'NOUN':
            current_subject = tok.text
        if tok.pos_ == 'ADJ':
            adverbs = ''.join(kid.text + ' ' for kid in tok.children if kid.pos_ == 'ADV')
            current_description = adverbs + tok.text
        if not (current_subject or current_description):
            continue
        result[current_subject] = {"description": current_description}

    return result


In [None]:
# Apply the extractor to the whole article. It scans the document as a single
# token stream, so subjects and descriptions carry over between sentences.
extract_aspect_descriptions(doc)

{'': {'description': 'old'},
 'man': {'description': 'unable'},
 'victim': {'description': 'unidentified'},
 'men': {'description': 'initial'},
 'offender': {'description': 'sexual'},
 'informant': {'description': 'unidentified'},
 'charge': {'description': 'earlier'},
 'test': {'description': 'viral'},
 'doctor': {'description': 'sexual'},
 'report': {'description': 'viral'},
 'load': {'description': 'viral'},
 'prosecution': {'description': 'likely'},
 'risk': {'description': 'viral'},
 'prosecutor': {'description': 'high'},
 'client': {'description': 'unable'}}

In [None]:
# First sentence of the article.
# NOTE(review): the next cell rebinds `sent` before this value is used.
sent = list(doc.sents)[0]

In [None]:
# Parse a short example sentence and draw its dependency tree.
sent = nlp("The food we had yesterday was delicious")
displacy.render(sent, style='dep')


In [None]:
def aspect_description(sentences):
    """Print one aspect/description pair per sentence.

    For each sentence the aspect is the text of its last token that is both
    ``nsubj`` and ``NOUN``, and the description is its last ``ADJ`` token
    prefixed by the texts of that token's ``ADV`` children. Either value is
    ``''`` when the sentence has no such token.

    The resulting list of ``{'aspect': ..., 'description': ...}`` dicts is
    printed; the function itself returns ``None``.
    """
    def _describe(sentence):
        # Summarise a single sentence; both fields start empty for each one.
        subject = ''
        description = ''
        for tok in sentence:
            if tok.dep_ == 'nsubj' and tok.pos_ == 'NOUN':
                subject = tok.text
            if tok.pos_ == 'ADJ':
                adverbs = ''.join(kid.text + ' ' for kid in tok.children if kid.pos_ == 'ADV')
                description = adverbs + tok.text
        return {'aspect': subject, 'description': description}

    print([_describe(sentence) for sentence in sentences])


In [None]:
# Run the per-sentence extractor over the article's sentences; it prints its result.
aspect_description(doc.sents)

[{'aspect': 'man', 'description': 'positive'}, {'aspect': '', 'description': 'due'}, {'aspect': 'victim', 'description': 'public'}, {'aspect': '', 'description': 'viral'}, {'aspect': '', 'description': 'last'}, {'aspect': '', 'description': 'different'}, {'aspect': 'man', 'description': 'private'}, {'aspect': 'men', 'description': ''}, {'aspect': '', 'description': 'initial'}, {'aspect': 'offender', 'description': 'voluntary'}, {'aspect': 'victim', 'description': 'unidentified'}, {'aspect': 'charge', 'description': 'earlier'}, {'aspect': 'test', 'description': 'less'}, {'aspect': '', 'description': 'viral'}, {'aspect': 'doctor', 'description': 'sexual'}, {'aspect': 'report', 'description': 'available'}, {'aspect': 'load', 'description': 'viral'}, {'aspect': 'load', 'description': 'likely'}, {'aspect': 'load', 'description': 'viral'}, {'aspect': 'offender', 'description': 'different'}, {'aspect': 'prosecutor', 'description': 'sexual'}, {'aspect': 'client', 'description': 'unable'}, {'as

In [None]:
# Print every sentence of the article, one per line, to compare against the
# aspect/description pairs above.
for sent in doc.sents:
    print(sent)

SINGAPORE: A 48-year-old man was on Tuesday (Jun 7) sentenced to a year's jail for not informing his sexual partner that he was HIV positive.
The offender cannot be named due to a gag order.
He was a public relations consultant at the time of the offence, and the victim was in his 20s.
The man was diagnosed with HIV in July 2017 and interviewed by a National Centre for Infectious Diseases official, who told him that he was required to inform sexual partners of his status regardless of his viral load and his perceived risk to others.
He pleaded guilty last week to one charge under the Infectious Diseases Act.
Another charge for the same offence against a different victim was considered for sentencing.
On Apr 23, 2021, the man offered the victim a ride home from work in a private-hire vehicle that he booked.
During the journey, both men decided to go to the offender's home instead.
They engaged in unprotected sex in the victim's room after initial resistance from the victim.
Before this,

In [None]:
# Show each token of the first sentence with its part-of-speech tag and
# dependency label.
for token in list(doc.sents)[0]:
    print(token, token.pos_, token.dep_)

In [None]:
def extract_nouns(doc):
    """Return, in order, the tokens of ``doc`` whose ``pos_`` is ``"NOUN"``."""
    nouns = []
    for token in doc:
        if token.pos_ == "NOUN":
            nouns.append(token)
    # Possible refinement (not applied): also drop tokens whose shape_ is
    # 'x', 'xx' or 'xxx'.
    return nouns


# Keep the article's NOUN tokens for the following cells.
extract_words = extract_nouns(doc)


In [None]:
# One vector per extracted noun token, in the same order as extract_words.
vectors = [noun.vector for noun in extract_words]


In [None]:
# create a list of globally defined positive and negative words to identify sentiment
# sentiment score based on the lexicon neg, pos words
def _feature_sentiment(span, pos, neg):
    '''
    Score the nouns ("features") of one sentence with the sentiment of the
    opinion words attached to them in the dependency parse.

    input:
        span - a spaCy Span or Doc holding a single sentence; any other value
               (typically a string) is parsed with the notebook-level `nlp`
               pipeline first
        pos  - iterable of positive opinion words
        neg  - iterable of negative opinion words
    function: every token whose text (case-sensitive) is an opinion word
              starts at +1 if it is in `pos`, otherwise -1. The score is
              multiplied by 1.5 for each amod/advmod modifier that is itself
              an opinion word and by -1 for each "neg" dependent, then
              assigned to the related noun(s).
    output: a new dictionary {feature text: sentiment score}
    '''
    sent_dict = {}
    # Sets give constant-time membership tests and accept any iterable
    # (plain lists as well as lazily-read corpus word lists).
    positive_words = set(pos)
    opinion_words = positive_words | set(neg)

    if isinstance(span, spacy.tokens.doc.Doc):
        # A Doc is already parsed: warn, then use it as-is instead of sending
        # it through the pipeline a second time.
        print("warning: arg should not be multiple sentences")
    elif not isinstance(span, spacy.tokens.span.Span):
        span = nlp(span)

    for token in span:
        # only opinion words carry sentiment
        if token.text not in opinion_words:
            continue
        sentiment = 1 if token.text in positive_words else -1

        # An opinion word acting as an adverb modifier (e.g. "pretty",
        # "highly") only intensifies another word; it is accounted for when
        # that other word is processed.
        # TODO maybe it would be a good idea to handle adverb modifiers by increasing the sentiment score?
        # eg. very       good         food
        #    (advmod)   (amod)       (noun)
        if token.dep_ == "advmod":
            continue

        # Adjective directly modifying a noun:
        # eg. good       food
        #   (amod)      (noun)
        # Known limitation: negation is not consulted on this path.
        if token.dep_ == "amod":
            sent_dict[token.head.text] = sentiment
            continue

        # Remaining opinion words (predicate adjectives, verbs, ...).
        # Modifiers hanging off the opinion word itself:
        for child in token.children:
            # an opinion-word modifier adds weight to the sentiment
            # This could be better updated for modifiers that either positively or negatively emphasize
            if child.dep_ in ("amod", "advmod") and child.text in opinion_words:
                sentiment *= 1.5
            # negation words flip the sign of the sentiment
            if child.dep_ == "neg":
                sentiment *= -1

        # An opinion verb passes its sentiment to its direct object ...
        for child in token.children:
            if token.pos_ == "VERB" and child.dep_ == "dobj":
                sent_dict[child.text] = sentiment
                # ... and to the dependent that directly follows an "and"
                # under that object (a AND b).
                take_next = False
                for subchild in child.children:
                    if subchild.text == "and":
                        take_next = True
                    elif take_next:
                        sent_dict[subchild.text] = sentiment
                        take_next = False

        # Modifiers and negation attached to the head of the opinion word
        # (e.g. "was not good": "not" hangs off "was"). In spaCy a sentence
        # root is its own head; its children were already handled above, so
        # scanning them again would apply every weight twice and cancel a
        # negation.
        if token.head.i != token.i:
            for child in token.head.children:
                if child.dep_ in ("amod", "advmod") and child.text in opinion_words:
                    sentiment *= 1.5
                if child.dep_ == "neg":
                    sentiment *= -1

        # Nouns sharing the head receive the sentiment unless already scored.
        for child in token.head.children:
            if child.pos_ == "NOUN" and child.text not in sent_dict:
                noun = child.text
                # Prefix compound parts so that e.g. "customer service" is
                # reported as a single feature.
                for subchild in child.children:
                    if subchild.dep_ == "compound":
                        noun = subchild.text + " " + noun
                sent_dict[noun] = sentiment
    return sent_dict
    
# example
# tweet = "food was good but service was disappointing"
# print(_feature_sentiment(tweet, pos, neg))
## Output: {'food': 1, 'service': -1}


def feature_sentiment(sentence):
    """Score ``sentence`` with ``_feature_sentiment`` using NLTK's opinion lexicon word lists."""
    positive_words = opinion_lexicon.positive()
    negative_words = opinion_lexicon.negative()
    return _feature_sentiment(sentence, positive_words, negative_words)


In [None]:
# Check which token the first "very" attaches to in the parse.
sent = nlp("very very good food")
sent[0].head

good

In [None]:
# Draw the dependency tree of the current `sent`.
displacy.render(sent, style='dep')

In [7]:
# Draw the dependency tree of a longer sentence containing a subordinate
# "because" clause.
displacy.render(nlp(
    "the bank manager was rude to me because I deposited only ten thousand dollars"), style='dep')


In [None]:
# Probe a negated adjective. The recorded result scores 'food' as +1, i.e. the
# "not" is not reflected in the score.
feature_sentiment("This place has not good food")


{'food': 1}