# Phrase-level sentiment analysis

Please make sure you have downloaded the polarity data using "Document-Level Sentiment Analysis.ipynb" before you run this notebook.

You may also need to first install SpaCy and the English model to be able to process English text. To do that, please run the following commands on the Anaconda prompt:

> conda install spacy

> python -m spacy download en_core_web_sm

In [1]:
import nltk
    
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\pekarv\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
from glob import glob
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

MAX_DIST = 2

In [3]:
def get_dep_graph(sentence):
    deps = {}
    for word in sentence:
        if word.i != word.head.i:
            deps[word.i] = word.head.i
    return deps

def get_path_to_root(w_id, deps, path):
    path.append(w_id) 
    if not deps.get(w_id):
        return path
    else:
        return get_path_to_root(deps[w_id], deps, path)

def build_all_paths(sentence, sdict):
    deps = get_dep_graph(sentence)
    e2paths = {}
    s2paths = {}
    for w in sentence:
        if w.ent_iob_ in "BI":
            path = get_path_to_root(w.i, deps, [])
            if w.ent_id in e2paths:
                prev_path = e2paths[w.ent_id]
                if len(path) < len(prev_path):
                    e2paths[w.ent_id] = path
            else:
                e2paths[w.ent_id] = path
        score = sdict.get(w.lemma_)
        if score:
            s2paths[w.i] = (get_path_to_root(w.i, deps, []), score, w.text)
    return e2paths, s2paths

def get_distance(p1, p2):
    """Measure the distance between two paths to the root in the number of edges
    """
    common = None
    for i in p1:
        if i in p2:
            common = i
            break
    if common:
        return p1.index(common) + p2.index(common)
    else:
        return None
    
def process_sentence(sentence, sdict):
    global MAX_DIST
    # entities
    ents = dict([(e.ent_id, (e.text, e.label_)) for e in sentence.ents])

    e2paths, s2paths = build_all_paths(sentence, sdict)

    for eid, p1 in e2paths.items():
        for sid, (p2, score, w_text) in s2paths.items():
            d = get_distance(p1, p2)
            if d is not None and d < MAX_DIST:
                e_text, e_label = ents[eid]
                yield e_text, e_label, score, w_text

In [4]:
sentiment_lexicon = SentimentIntensityAnalyzer().lexicon

In [6]:
import re
import spacy
nlp = spacy.load('en_core_web_sm')

df = pd.DataFrame(columns=['name', 'label', 'sentiment', 'description', 'text'])
ids = 0

for fn in glob(r"datasets\txt_sentoken\*\*"):
    try:
        raw = open(fn).read()
    except PermissionError:
        print(f"Permission error: {fn}")
        continue
    for text in raw.split("\n"):        
        for s in nlp(text).sents:
            parsed_sentence = nlp(s.text)
            for e_text, e_label, score, w_text in process_sentence(parsed_sentence, sentiment_lexicon):
                if e_label in ["PERSON"] and len(e_text.strip()) > 3:
                    ids += 1
                    df.loc[ids] = [e_text, e_label, score, w_text, s.text]
        if ids > 10000:
            break
    if ids > 10:
        break

In [7]:
df.tail(30)

Unnamed: 0,name,label,sentiment,description,text
1,king arthur's,PERSON,1.3,spirited,the story revolves around the adventures of fr...
2,williams,PERSON,-2.4,criminal,"instead of having the criminal "" napolean "" wi..."
3,george c,PERSON,1.5,like,the deeper welles digs into his investigation ...
4,hugh,PERSON,1.5,grant,it's a terrible mess of a movie starring a ter...
5,adam sandler-annoying,PERSON,-1.7,annoying,not just adam sandler-annoying
6,jim carrey,PERSON,-1.7,annoying,", we're talking jim carrey-annoying ."
7,dreadful hideaway,PERSON,-0.7,hid,the only interesting character in the movie is...
8,yakov smirnov,PERSON,1.2,joke,his is a one-joke character-- the old foreign-...
9,suvari,PERSON,-1.7,fault,now i'm not sure if this was ms . suvari's fau...
10,john harrigan,PERSON,1.4,plays,cox plays the role of big john harrigan in the...


In [10]:
actors = ["jim carrey", "john harrigan"]

for x in df.groupby("name")["sentiment"].mean().sort_values().iteritems():
    if x[0] in actors:
        print(x)

('jim carrey', -1.7)
('john harrigan', 1.4)
