In [1]:
import os

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

from config import INPUT_DIR

# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in `text` with fixed placeholders.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. All other tokens are kept unchanged, and the
    tokens are re-joined with single spaces, so runs of spaces survive.
    """
    def _placeholder(token):
        # Mentions are checked first; a bare '@' is left alone.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))

# Task suffixes that can be substituted into the checkpoint name below:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

# NOTE: the label list defined in a later cell is written for 'sentiment';
# switching `task` presumably requires a matching label list - confirm.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
# Tokenizer and model are loaded from the same identifier so they stay paired.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# PyTorch model (the captured output below shows the files being downloaded on first load)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]
Downloading: 100%|██████████| 747/747 [00:00<00:00, 194kB/s]
Downloading: 100%|██████████| 899k/899k [00:02<00:00, 321kB/s]  
Downloading: 100%|██████████| 456k/456k [00:02<00:00, 199kB/s]  
Downloading: 100%|██████████| 150/150 [00:00<00:00, 26.2kB/s]
Downloading: 100%|██████████| 499M/499M [00:50<00:00, 9.96MB/s]    


In [2]:
# Label mapping: a label's position in `labels` is the class index it stands for.
# NOTE(review): the order is assumed to match the model's output order - confirm
# against the checkpoint's model card.
labels = ['negative', 'neutral', 'positive']
# Both directions are derived from `labels` itself (rather than a hard-coded
# range(3)) so the mappings cannot silently truncate if the list changes length.
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = dict(enumerate(labels))

In [3]:
# Demo input scored further down; the commented-out lines are alternative inputs to swap in.
sentence = "Oh sh*t!! What an awesome goal, I nearly missed it…"
# sentence = "Yet call out all Muslims for the acts of a few will get you pilloried.   So why is it okay to smear an entire religion over these few idiots?  Or is this because it's okay to bash Christian sects?"
# sentence = "Sorry to have to do this, but just to see if profanity filtering is enabled"

def predict_sentiment(sentence, max_length=512):
    """Score a single sentence with the module-level `tokenizer` and `model`.

    Args:
        sentence: Raw text. It is passed through `preprocess` before being
            tokenized.
        max_length: Maximum number of tokens kept after tokenization; longer
            inputs are truncated. The default of 512 is assumed to be the
            limit of the loaded RoBERTa checkpoint - confirm against
            `model.config.max_position_embeddings`.

    Returns:
        A tuple `(sentiment, ranking)`:
        - `sentiment`: dict mapping each label in `id2label` to its softmax
          score.
        - `ranking`: list of labels ordered from highest to lowest score.
    """
    text = preprocess(sentence)
    # Truncate explicitly: without a length cap an over-long input would be
    # passed to the model unchanged.
    encoded_input = tokenizer(
        text, return_tensors='pt', truncation=True, max_length=max_length
    )

    # Inference only: skip autograd bookkeeping, which also makes `.detach()`
    # unnecessary before converting to numpy.
    with torch.no_grad():
        output = model(**encoded_input)

    # output[0] is the first element of the model output (presumably the
    # logits, since no labels are passed); the second [0] selects the single
    # sequence in the batch.
    scores = output[0][0].numpy()
    scores = softmax(scores)

    sentiment = {id2label[idx]: s for idx, s in enumerate(scores)}

    # argsort is ascending, so reverse it to list the best label first.
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    ranking = [id2label[idx] for idx in ranking]

    return sentiment, ranking

In [4]:
# Score the demo sentence; the cell displays the (scores, ranking) tuple.
predict_sentiment(sentence)

({'negative': 0.21319835, 'neutral': 0.23147419, 'positive': 0.5553274},
 ['positive', 'neutral', 'negative'])