In [20]:
import re
from enum import Enum
import pandas as pd
import plotly.express as px
import torch
from IPython.core.display import display, HTML
from dotenv import load_dotenv
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import pipeline
# Prefer the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device}')

# python-dotenv: pick up settings from a .env file, if one is present.
load_dotenv()

# Colour used for each sentiment label (passed to the pie chart as its colour map).
S = dict(NEGATIVE='red', NEUTRAL='yellow', POSITIVE='green')


class Label(Enum):
    """Sentiment classes, numbered from 0 in the order NEGATIVE, NEUTRAL, POSITIVE."""
    NEGATIVE = 0
    NEUTRAL = 1
    POSITIVE = 2


def view(df_, *args, **kwargs):
    """
    Render rows of a DataFrame as HTML tables.

    Positional form:
        view(df, -1)      - show every row
        view(df, h)       - show the first h rows
        view(df, h, t)    - show the first h rows, then the last t rows
    Keyword form:
        view(df, head=h, tail=t)  - either keyword may be left out

    Positional counts take precedence over keywords. Called with no counts at all,
    nothing is displayed.
    """
    if args and args[0] == -1:  # view all
        head, tail = len(df_), None
    else:
        # Fall back to the keywords for whichever count was not given positionally,
        # so a lone `h` or a lone `head=`/`tail=` no longer raises.
        head = args[0] if args else kwargs.get('head')
        tail = args[1] if len(args) > 1 else kwargs.get('tail')

    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None):
        if head is not None:
            display(HTML(df_.head(head).to_html()))
        if tail is not None:
            display(HTML(df_.tail(tail).to_html()))


def convert_label(clf_output):
    """
    Translate one classifier prediction into a `Label` member name.

    clf_output: mapping with a 'label' entry. Two spellings are understood:
        - an index suffix after the last underscore, e.g. 'LABEL_2' -> 'POSITIVE'
        - the class name itself in any letter case, e.g. 'neutral' -> 'NEUTRAL'

    Raises ValueError when the label matches neither an index nor a member of `Label`.
    """
    raw = clf_output['label']
    suffix = raw.split('_')[-1]
    try:
        index = int(suffix)
    except ValueError:
        index = None

    if index is not None:
        return Label(index).name

    # Not an index: treat the whole label as a class name.
    try:
        return Label[raw.strip().upper()].name
    except KeyError:
        raise ValueError(f'Unrecognised classifier label: {raw!r}') from None


def query_transcript(d, q, w, f):
    """
    Search the transcript for a pattern and collect every hit together with its context.

    d:  dataframe     - DataFrame containing transcript text (column `text`) and urls.
    q:  query         - The keyword or regular expression to search for in the transcript.
    w:  width         - How many neighbouring text entries to include on each side of a
                        hit as context for the query.
    f:  regex flag    - `re` flags applied to the search (e.g. re.I).

    Returns (match_text, match_df):
        match_text - one string per hit: the hit and its neighbours joined with spaces,
                     with newlines replaced by spaces.
        match_df   - the rows of `d` whose text matched.
    """
    texts = d['text']
    # na=False: entries with missing text count as "no match" instead of poisoning the mask.
    hits = texts.str.contains(q, regex=True, flags=f, na=False)
    positions = [pos for pos, hit in enumerate(hits) if hit]

    # Windows are taken by row position, so each hit gets up to `w` neighbours on either
    # side even when the index labels have gaps (e.g. after drop_duplicates) or are not
    # integers. max(..., 0) stops a window near the top from wrapping around to the end.
    match_text = [
        ' '.join(texts.iloc[max(pos - w, 0):pos + w + 1].dropna()).replace('\n', ' ')
        for pos in positions
    ]
    match_df = d.iloc[positions]
    return match_text, match_df


def infer(clf, text):
    """
    Classify `text` with `clf` and return one dict per prediction, holding the
    label name produced by convert_label and the prediction's score.
    """
    results = []
    for prediction in clf(text):
        results.append({'label': convert_label(prediction), 'score': prediction['score']})
    return results


def sent_df(text, sent):
    """
    Pair each passage in `text` with its sentiment record in `sent` and return the
    pairs as a DataFrame: a `text` column followed by the record's own keys.
    Pairing stops at the shorter of the two inputs.
    """
    passages = [{'text': passage} for passage in text]
    rows = []
    for passage, record in zip(passages, sent):
        rows.append(passage | record)
    return pd.DataFrame(rows)


def colour_df(sent_slice):
    """
    Return `sent_slice.style` with the `label` column shaded by sentiment:
    red when the label contains 'NEGATIVE', green when it contains 'POSITIVE',
    yellow otherwise.
    """

    def _css_for(label):
        if 'NEGATIVE' in label:
            return "background:red"
        if 'POSITIVE' in label:
            return "background:green"
        return "background:yellow"

    def _shade(row):
        # The colour is decided by the row's first value and repeated for each cell
        # in the row; it is looked up per cell so an empty row is never indexed.
        return [_css_for(row.iloc[0]) for _ in row]

    return sent_slice.style.apply(_shade, axis=1, subset='label')

def viz(sent, sent_slice):
    """
    Show a 400x400 pie chart of the `label` distribution in `sent`, coloured via `S`,
    then return `sent_slice` styled by colour_df.
    """
    pie_options = dict(names='label', color='label', color_discrete_map=S, width=400, height=400)
    chart = px.pie(sent, **pie_options)
    # Put the percentage and the label text inside each wedge.
    chart.update_traces(textposition='inside', textinfo='percent+label')
    chart.show()

    styled_slice = colour_df(sent_slice)
    return styled_slice

# def get_avg_sents(sents):
#     return {k:v for d in[{s:np.mean(sents[sents.label==s].score)} for s in sents.label.unique()]for k,v in d.items()}

# def get_sent_ratio(sents):
#     return {k:v for d in[{s:sum(sents.label.str.count(s))/len(sents)} for s in sents.label.unique()]for k,v in d.items()}
# read transcript data



Using cpu


In [2]:
# Read the scraped transcript entries and drop exact duplicate rows. The original index
# labels are kept, so the index has gaps wherever a duplicate was removed.
TRANSCRIPTS_PATH = '/home/x/Documents/youtube_transcript_scraper/yt_transcripts_fastai.parquet'
df = pd.read_parquet(TRANSCRIPTS_PATH).drop_duplicates()

model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Give the pipeline the model object loaded above (passing the name again would load the
# weights a second time) and run it on the device selected in the setup cell.
clf = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_length=512,  # inputs longer than 512 tokens are truncated
    truncation=True
)

In [3]:
# Case-insensitive search for the word "machine" (space-delimited), context width 1.
match_text, match_df = query_transcript(d=df, q=' machine ', w=1, f=re.I)

In [18]:
# Classify every matched passage, then line up passage, label and score in one frame.
data = infer(clf=clf, text=match_text)
sents = sent_df(text=match_text, sent=data)

In [19]:
# Chart the label mix over all passages; style only the first ten rows instead of the entire df.
viz(sent=sents, sent_slice=sents.head(10))

Unnamed: 0,text,label,score
0,"Okay, so let me introduce everybody to everybody else, first of all So we're here at the University of San Francisco learning Machine Learning or you might be at home watching this on video",NEUTRAL,0.711917
1,"You can then turn off enable GPU and click start Jupyter, and you'll have a Jupyter notebook instantly that costs you some money. It's Three cents an hour, okay, so if you don't mind spending three cents an hour to learn machine learning Here's a good way, so I'm going to go ahead and say start Jupyter",POSITIVE,0.513862
2,And approaches to setting up lots of different environments for Jupyter notebook Both the deep learning and for regular machine learning so check them out because there's lots of options So if I then go open Jupyter in a new tab,POSITIVE,0.630385
3,Lessons are inside the courses folder and the machine learning part one is in the ml1 folder if,NEUTRAL,0.875287
4,Other people's research. This is more a summary of 25 years of work that I've been doing in machine learning So a lot of this is,NEUTRAL,0.784483
5,"We're going to be using, The other library we'll use a lot is scikit-learn Which kind of implements a lot of machine learning stuff in Python. The scikit-learn Source code is often pretty readable and so very often if I want to really understand something",POSITIVE,0.766511
6,"Right, so these are pretty authentic Experiences for applied machine learning now of course you're missing all the bit that went before Which was why did this company to start up the side that predicting the option sale price of bulldozers was important",POSITIVE,0.576044
7,I refer to this as structured data now I say I refer to this as structured data because like there have been many arguments in the machine learning community on Twitter about What is structured data?,NEUTRAL,0.732114
8,"That looks like this and data like images where every column is of the same type Like that's the most important distinction in machine learning Yet, we don't have",NEUTRAL,0.531
9,"The academic books I've read say that that's one of the biggest risks of everything, but the practical books say, let's do some EDA first Yeah, so that the truth is kind of somewhere in between and I generally I generally try to do machine learning driven EDA and that's what we're going to learn today",NEUTRAL,0.694325
