In [31]:
import nltk
nltk.download("opinion_lexicon")
from nltk.corpus import opinion_lexicon


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\thain\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [32]:
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

In [33]:
pos_words = set(opinion_lexicon.positive())
neg_words = set(opinion_lexicon.negative())
negations = ["not","no","never","n't"]


In [34]:

print(f"Total positive words: {len(pos_words)}")
print(f"Total negative words: {len(neg_words)}")
target_words = "iphone"
print(f"{target_words} in positive words: {target_words in pos_words}")

Total positive words: 2006
Total negative words: 4783
iphone in positive words: False


In [35]:
dataset = load_dataset("cardiffnlp/tweet_eval", "sentiment")
print(f"Dataset keys: {dataset.keys()}")
print(dataset['train'][0].keys())
for i in range(5):
    print(dataset['train'][i])

Dataset keys: dict_keys(['train', 'test', 'validation'])
dict_keys(['text', 'label'])
{'text': '"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"', 'label': 2}
{'text': '"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"', 'label': 1}
{'text': 'Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.', 'label': 1}
{'text': "Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays", 'label': 1}
{'text': '@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017"', 'label': 2}


# Function

In [36]:
def lexicon_score(text):
    toks = text.lower().split()
    s, i = 0, 0
    while i < len(toks):
        w = toks[i]
        if w in negations and i + 1 < len(toks):
            nxt = toks[i+1]
            if nxt in pos_words: s -= 1
            elif nxt in neg_words: s += 1
            i += 2
            continue
        if w in pos_words: s += 1
        if w in neg_words: s -= 1
        i += 1
    return s

def score_to_label(s, pos_th=1, neg_th=-1):
    if s >= pos_th: return 2
    if s <= neg_th: return 0
    return 1

def lexicon_model(msg):
    return score_to_label(lexicon_score(msg))

In [37]:
def eval_split(ds,split_name="validation"):
    y_true, y_pred = [], []
    for r in ds[split_name]:
        y_true.append(int(r["label"]))
        sc = lexicon_score(r["text"])
        y_pred.append(score_to_label(sc))
    acc = accuracy_score(y_true, y_pred)
    f1  = f1_score(y_true, y_pred, average="macro")
    return {"split": split_name, "accuracy": acc, "f1_macro": f1}

In [38]:
print(eval_split(ds=dataset,split_name="validation"))
print(eval_split(ds=dataset,split_name="test"))

{'split': 'validation', 'accuracy': 0.548, 'f1_macro': 0.5196334595994507}
{'split': 'test', 'accuracy': 0.5244220123738196, 'f1_macro': 0.5016798274454954}
