In [19]:
import re
from enum import Enum
import pandas as pd
import plotly.express as px
import torch
from IPython.core.display import display, HTML
from dotenv import load_dotenv
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import pipeline
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device}')
# Load settings from a local .env file, if one exists, into the process environment.
# NOTE(review): none of the visible cells read an environment variable — confirm this call is still needed.
load_dotenv()

# Colour used for each sentiment label; passed to plotly as the pie chart's color_discrete_map.
S = dict(
    NEGATIVE='red',
    NEUTRAL='yellow',
    POSITIVE='green',
)

class Label(Enum):
    """Sentiment classes, numbered to match the trailing index of the classifier's raw labels."""

    NEGATIVE = 0
    NEUTRAL = 1
    POSITIVE = 2


def view(df_, *args, **kwargs):
    """Render the head and/or tail of a DataFrame as HTML tables.

    Supported call styles:
        view(df, -1)              -> every row
        view(df, n)               -> first n rows
        view(df, n, m)            -> first n rows, then last m rows
        view(df, head=n, tail=m)  -> same, by keyword; either keyword may be omitted

    Positional counts take precedence over keywords. When no row count is
    given at all, nothing is displayed.
    """
    if args and args[0] == -1:  # view all
        frames = [df_]
    else:
        # A missing count means "skip that table" instead of raising
        # IndexError / KeyError half-way through rendering.
        head = args[0] if args else kwargs.get('head')
        tail = args[1] if len(args) > 1 else kwargs.get('tail')
        frames = []
        if head is not None:
            frames.append(df_.head(head))
        if tail is not None:
            frames.append(df_.tail(tail))

    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None):
        for frame in frames:
            display(HTML(frame.to_html()))


def convert_label(clf_output):
    """Translate one classifier result into its sentiment name.

    clf_output: dict whose 'label' value ends in an integer class index after
                the last underscore (e.g. 'LABEL_2').

    Returns the name of the Label member with that index.
    """
    _, _, class_index = clf_output['label'].rpartition('_')
    member = Label(int(class_index))
    return member.name


def query_transcript(d, q, w, f=0):
    """
    Search the transcript for a pattern and return every hit with its surrounding context.

    d:  dataframe     - DataFrame of transcript entries; must have a `text` column.
    q:  query         - The keyword, phrase or regular expression to search for in `text`.
    w:  width         - How many neighbouring rows to include on each side of a match
                        as context for the query.
    f:  regex flag    - `re` flags (e.g. re.I); defaults to 0, i.e. no flags.

    Returns (match_text, match_df):
        match_text - one string per matching row: the `text` of the rows in its
                     context window joined with spaces, newlines replaced by spaces.
        match_df   - the matching rows of `d`.
    """
    texts = d['text']
    # na=False: rows without text simply do not match instead of poisoning the mask.
    hits = texts.str.contains(q, regex=True, flags=f, na=False)

    # Build each window by position, not by index label: after drop_duplicates()
    # the labels have gaps, and a label slice would then return fewer than `w`
    # neighbours (or raise on an unsorted index).
    match_text = []
    for pos in hits.to_numpy(dtype=bool).nonzero()[0]:
        window = texts.iloc[max(pos - w, 0):pos + w + 1].dropna()
        match_text.append(' '.join(window).replace('\n', ' '))

    match_df = d[hits]
    return match_text, match_df


def infer(clf, text):
    """Classify `text` with `clf` and return the results with readable labels.

    clf:   callable that takes `text` and yields dicts with 'label' and 'score'.
    text:  whatever input `clf` accepts.

    Returns a list of {'label': <sentiment name>, 'score': <score>} dicts, one
    per classifier result, in the same order.
    """
    results = []
    for prediction in clf(text):
        results.append({
            'label': convert_label(prediction),
            'score': prediction['score'],
        })
    return results


def sent_df(text, sent):
    """Pair each text with its sentiment result in a single DataFrame.

    text:  sequence of strings.
    sent:  sequence of dicts (one per text, same order), e.g. the output of infer.

    Returns a DataFrame with a `text` column followed by the keys of each dict.
    Pairing stops at the shorter of the two sequences.
    """
    records = []
    for passage, result in zip(text, sent):
        records.append({'text': passage, **result})
    return pd.DataFrame(records)


def colour_df(sent_slice):
    return sent_slice.style.apply(lambda x: [
        "background:red" if 'NEGATIVE' in x.iloc[0] else "background:green" if 'POSITIVE' in x.iloc[
            0] else "background:yellow" for v in x], axis=1, subset='label')

def viz(sent, sent_slice):
    """Show a pie chart of the sentiment label distribution and return a styled preview.

    sent:        DataFrame with a `label` column; drives the pie chart.
    sent_slice:  the rows handed to colour_df for the colour-coded table
                 (pass a slice so the whole frame is not rendered).

    Returns the Styler produced by colour_df, or None when there is nothing to
    plot or plotting failed (a message is printed in both cases).
    """
    # An empty result set is the expected "nothing to show" case: detect it
    # explicitly rather than by catching whatever plotly happens to raise.
    if sent is None or len(sent) == 0 or 'label' not in sent:
        print('no matches')
        return None

    try:
        fig = px.pie(sent, names='label', color='label', color_discrete_map=S, width=400, height=400)
        fig.update_traces(textposition='inside', textinfo='percent+label')
        fig.show()
        return colour_df(sent_slice)
    except Exception as exc:
        # Still best-effort (the cell keeps running), but report the real
        # problem; a renderer or styling error is not "no matches".
        print(f'viz failed: {exc!r}')
        return None

Using cpu


In [None]:
# Load the transcript and discard rows that are exact duplicates.
# The original index labels are kept, so the index may contain gaps afterwards.
df = pd.read_parquet('transcript.parquet').drop_duplicates()

In [6]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
clf = pipeline(
    task='sentiment-analysis',
    # Hand over the model instance loaded above; passing the name again would
    # load the same weights a second time and leave `model` unused.
    model=model,
    tokenizer=tokenizer,
    # Run on the device selected in the setup cell. The integer form
    # (0 = first GPU, -1 = CPU) is used for compatibility across library versions.
    device=0 if device == 'cuda' else -1,
    # Truncate long inputs to 512 tokens so they do not exceed the model's limit.
    max_length=512,
    truncation=True
)

In [20]:
# Match the whole word with \b word boundaries rather than padding it with spaces:
# the space-padded pattern missed the word at the start or end of an entry and
# next to punctuation.
match_text, match_df = query_transcript(df, r'\bfuck\b', 1, re.I)

In [24]:
# Score every matched passage, then line the passages up with their sentiment results.
data = infer(clf=clf, text=match_text)
sents = sent_df(text=match_text, sent=data)

In [25]:
# Chart every result, but only render the first ten rows as a table.
viz(sents, sents.head(10))

no matches
