In [8]:
import re
from enum import Enum
import pandas as pd
import plotly.express as px
import torch
from IPython.core.display import display, HTML
from dotenv import load_dotenv
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import pipeline
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device}')

# python-dotenv: load variables from a local .env file, if there is one.
load_dotenv()

# Sentiment classes numbered from 0, so Label(0) is NEGATIVE and Label(2) is POSITIVE.
Label = Enum('Label', 'NEGATIVE NEUTRAL POSITIVE', start=0)

def view(df_, *args, **kwargs):
    """Render the head and/or tail of a DataFrame as HTML with pandas display limits lifted.

    Supported call forms:
        view(df, -1)              -> every row
        view(df, n)               -> first n rows
        view(df, n, m)            -> first n rows, then last m rows
        view(df, head=n, tail=m)  -> same as above; either keyword may be omitted

    Positional arguments take precedence over keywords. When neither is given,
    nothing is displayed. Returns None.
    """
    if args:
        if args[0] == -1:  # view all
            head_n, tail_n = len(df_), None
        else:
            head_n = args[0]
            # A lone positional count means "head only" instead of an IndexError.
            tail_n = args[1] if len(args) > 1 else None
    else:
        # .get() lets callers pass only head= or only tail= without a KeyError.
        head_n, tail_n = kwargs.get('head'), kwargs.get('tail')

    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None):
        if head_n is not None:
            display(HTML(df_.head(head_n).to_html()))
        if tail_n is not None:
            display(HTML(df_.tail(tail_n).to_html()))


def convert_label(clf_output):
    """Translate one classifier prediction into the name of the matching `Label` member.

    The integer after the last underscore of `clf_output['label']` is used as
    the `Label` value, and that member's name is returned.
    """
    raw_label = clf_output['label']
    class_id = int(raw_label.rsplit('_', 1)[-1])
    return Label(class_id).name


def query_transcript(d, q, w, f=0):
    """Find transcript entries matching a pattern and gather surrounding context.

    d:  dataframe     - DataFrame containing transcript text (column `text`) and urls.
    q:  query         - The keyword or regular expression to search for in `text`.
    w:  width         - How many neighbouring rows to include on each side of a
                        match as context (clipped at the start/end of the frame).
    f:  regex flag    - `re` flags for the search, e.g. re.I. Defaults to 0 (none).

    Returns (match_text, match_df): match_text has one string per matching row,
    made of the context rows' text joined with spaces and with newlines replaced
    by spaces; match_df holds the matching rows themselves.
    """
    texts = d['text']
    # na=False: rows with missing text count as non-matches instead of breaking the mask.
    hits = texts.str.contains(q, regex=True, flags=f, na=False).to_numpy(dtype=bool)
    positions = hits.nonzero()[0]
    # Windows are taken by row position, not index label, so gaps in the index
    # (e.g. after drop_duplicates) or a non-integer index do not shrink the context.
    match_text = [
        ' '.join(texts.iloc[max(p - w, 0):p + w + 1].dropna().astype(str)).replace('\n', ' ')
        for p in positions
    ]
    match_df = d.iloc[positions]
    return match_text, match_df


def infer(clf, text):
    """Run `clf` on `text` and return its predictions with readable labels.

    Each item yielded by `clf(text)` becomes a dict with the label converted by
    `convert_label` and the original `score`.
    """
    results = []
    for prediction in clf(text):
        results.append({'label': convert_label(prediction), 'score': prediction['score']})
    return results


def sent_df(q, text, sent):
    """Build a DataFrame pairing each passage with its prediction.

    Every row merges {'text': passage} with the corresponding dict from `sent`
    (pairs are formed with zip, so the shorter input decides the row count),
    and a `keyword` column holding `q` is appended.
    """
    records = []
    for passage, prediction in zip(text, sent):
        records.append({'text': passage} | prediction)
    return pd.DataFrame(records).assign(keyword=q)

def colour_df(sent_slice):
    """Return a Styler for `sent_slice` with the `label` column colour-coded.

    Cells whose label contains the name of Label(0) get a red background, those
    containing the name of Label(2) a green one, and everything else yellow.
    """
    def _label_styles(row):
        # With subset='label' each row passed in holds just the label cell.
        label = row.iloc[0]
        if Label(0).name in label:
            css = "background:red"
        elif Label(2).name in label:
            css = "background:green"
        else:
            css = "background:yellow"
        return [css for _ in row]

    return sent_slice.style.apply(_label_styles, axis=1, subset='label')

def viz(sent, sent_slice):
    """Show a pie chart of the `label` column of `sent` and return `sent_slice` styled.

    When `sent` is empty nothing is plotted and None is returned. Otherwise the
    chart is shown and the result of `colour_df(sent_slice)` is returned.
    """
    if not len(sent):
        return None

    # Pair each Label member, in definition order, with its chart colour.
    palette = {}
    for member, colour in zip(Label, 'red yellow green'.split()):
        palette[member.name] = colour

    chart = px.pie(
        sent,
        names='label',
        color='label',
        color_discrete_map=palette,
        width=400,
        height=400,
    )
    chart.update_traces(textposition='inside', textinfo='percent+label')
    chart.show()
    return colour_df(sent_slice)


Using cpu


In [9]:
# Load the transcript and drop exact duplicate rows (the original index labels are kept).
df = pd.read_parquet('transcript.parquet').drop_duplicates()

In [10]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Give the pipeline the model object loaded above; passing the name again would
# load the weights a second time and leave `model` unused. Also run on the
# device selected in the setup cell (pipeline convention: -1 = CPU, 0 = first GPU).
clf = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    device=0 if device == 'cuda' else -1,
    # Inputs longer than 512 tokens are truncated rather than raising an error.
    max_length=512,
    truncation=True
)

In [29]:
# Search pattern; the surrounding spaces are part of the regex.
query_str = ' code '
# Context width 1, case-insensitive search.
match_text, match_df = query_transcript(df, q=query_str, w=1, f=re.IGNORECASE)

In [30]:
# Classify every matched passage, then tabulate the passages next to their predictions.
data = infer(clf, text=match_text)
sents = sent_df(q=query_str, text=match_text, sent=data)

In [31]:
viz(sents, sents.iloc[:10])  # chart uses every row; only the first ten are styled and displayed

Unnamed: 0,text,label,score,keyword
0,as like they're like software or lines of code for operating uh civilization that's the rules and,NEUTRAL,0.77366,code
1,regulations but you have code accumulation but no code removal um and so it just gets,NEGATIVE,0.485977,code
2,poor fastest who have to maintain that code okay that's a that's a pain that's pain not even for,NEGATIVE,0.863241,code
3,thousands of miles or millions a lot of code around the mobile eye thing it doesn't just work by itself yes,NEUTRAL,0.658596,code
4,lot of freaking software man a lot of smart lines of code um for sure in order to have,NEUTRAL,0.492158,code
5,scheduling across all those things and so you're compiling the code down yeah it does all okay,POSITIVE,0.70253,code
6,it's it it's basically taking a whole bunch of c c plus code and and deleting a massive amount of c plus plus go and,NEUTRAL,0.553232,code
7,bag of points in the c code and turn it into vectors,NEUTRAL,0.841833,code
8,the the cc plus less control control code as opposed to,NEUTRAL,0.727504,code
9,whole thing so uh reducing reducing lines of code will actually go lower yeah that's fascinating,POSITIVE,0.804086,code
