In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
import math
import csv
import urllib.request
import pandas as pd
import torch
import textwrap

In [2]:
# Preprocess text split into 'wordcount' chunks
def preprocess(text):
    tokens = text.split(' ')
    new_texts = []

    wordcount = 384
    whole_parts = math.floor(len(tokens)/wordcount)
    for i in range(whole_parts):
        new_text = " ".join(tokens[i*wordcount:(i+1)*wordcount])
        new_texts.append(new_text)
    new_texts.append(" ".join(tokens[(whole_parts-1)*wordcount:]))

    return new_texts

In [6]:
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.save_pretrained(f"{MODEL}-tokenizer")

model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(f"{MODEL}-model")

In [7]:
# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

In [11]:
news_data = pd.read_csv('../datasets/all-the-news-2-1.csv')

In [13]:
print(news_data['article'][21544])

I started drinking when I went to college. In the UK, where I’m from, you can legally drink at 18—so I did. We all did. The bar opposite our apartment was where my new roommates became my best friends. Drinking turned me from an introvert into the life of the party, powered my dancing, and often inspired me to lay in the middle of a nightclub making “dance floor angels," blissed out. When I wasn’t drinking, I stayed up all night, panic-writing assignments, high on caffeine pills. In my second year, I felt exhausted all the time and blacked out after nights in the pub. One morning, while the room spun, I realized I’d lost my coat and keys and had dried vomit on my pillow.  Maybe I should drink less, I thought, although I couldn’t imagine how. Fortunately for my personal safety, fate intervened: My fatigue worsened, and I had to drop out. My doctor diagnosed me with ME/CFS, triggered by the mono I’d had in high school. Home with my family, where no one wanted to stay up until 3 am, I sto

In [7]:
print(news_data['article'].size)

2688878


In [8]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(device)

cuda:0


In [11]:
batchsize = 1
#batches = math.floor((news_data['article'].size)/batchsize)
batches = 1
scores = []
for i in range(batches):
    torch.cuda.empty_cache()
    #batch = news_data['article'][i*batchsize:(i+1)*batchsize]
    batch = news_data['article'][21544:21545]
    for article in batch:
        article_parts = preprocess(str(article))
        encoded_input = tokenizer(article_parts, return_tensors='pt', truncation=True, padding=True).to(device)
        print(encoded_input[0])
        output = model(**encoded_input)
        scores = output[0][0].detach().to('cpu').numpy()
        scores = torch.softmax(scores)

# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

Encoding(num_tokens=105, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [12]:
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) left 0.5525
2) center 0.4475
