# BJJ Sentiment Analyzer GPT

The goal of this notebook is to create a fine-tuned version of the ChatGPT model that will analyze whether a YouTube video title is related to a BJJ instructional video or not by returning a boolean value. Previous attempts at prompt engineering yielded inconsistent results.

### 1. Setup

In [200]:
# Have ChatGPT Generate the initial labels
import os
from openai import OpenAI
from dotenv import load_dotenv
from googleapiclient.discovery import build
# Load API keys from a local .env file into the process environment.
load_dotenv()

# Initialize YouTube API Client
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

# Initialize OpenAI API Client
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Pass the key read on the line above; a bare `API_KEY` is not defined in this cell.
client = OpenAI(api_key=OPENAI_API_KEY)

### 2. Helper Methods

In [257]:
def yt_get_titles(query):
    '''
    Retrieve the top 10 video results from the YouTube Search API for a query.

    Parameters:
        query: Search string sent to the YouTube Search API.

    Returns:
        A list with one dict per returned video, shaped like
        {"prompt": <video title>, "completion": ""}.
    '''
    # Imported here so this helper does not rely on another cell for it.
    from html import unescape

    print(f"Getting Seach Results for {query}")
    youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
    request = youtube.search().list(part='snippet', type='video', q=query, maxResults=10)
    response = request.execute()

    # Titles arrive with HTML entities (&#39;, &amp;, &quot;, ...). Decode
    # every entity instead of only the two that were replaced by hand.
    return [
        {"prompt": unescape(video["snippet"]["title"]), "completion": ""}
        for video in response["items"]
    ]

def yt_query_search_terms(search_terms, all_titles=None):
    '''
    Collect YouTube search results for every term in a list.

    For each term, the top results are fetched with 'yt_get_titles' and
    added to the output list.

    Parameters:
        search_terms: Iterable of query strings. It is not modified.
        all_titles: Optional list of already-collected entries to start
            from. It is not modified.

    Returns:
        A new list holding the entries of 'all_titles' (if given) followed
        by the results for each term. Terms are visited last-to-first, so
        the results for the final term come first. An empty 'search_terms'
        returns just the starting entries.
    '''
    # Copy the seed so neither a shared default nor the caller's list is mutated.
    collected = list(all_titles) if all_titles is not None else []

    # Iterate over a snapshot rather than popping, leaving the caller's list intact.
    for term in reversed(list(search_terms)):
        collected.extend(yt_get_titles(term))

    return collected

def openai_perform_request(messages):
    '''
    Ask the chat model to answer every conversation and record its reply.

    Each entry of 'messages' is a dict whose "messages" key holds a chat
    history. The model's reply is appended to that history in place as an
    assistant turn, and the same 'messages' list is returned.
    '''
    print("Getting responses. This may take a few minutes...")

    for entry in messages:
        conversation = entry["messages"]
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=conversation,
        )
        reply_text = response.choices[0].message.content
        conversation.append({"role": "assistant", "content": reply_text})

    print("Finished")
    return messages

### 3. Collect & Create Dataset of Titles from YouTube

In [None]:
# Initialize list of search terms and array of video titles.
# Every term is used as one YouTube search query; the titles returned for
# all of them together form the raw dataset.
search_terms = [
    # Submissions, escapes and leg-lock queries
    "kimura trap attacks", 
    "ashi garami entries", 
    "bjj triangle escape", 
    "bjj americana", 
    "john danaher ankle lock",
    "guillotine from turtle",
    "darce choke",
    "double leg takedown",
    # NOTE(review): team / match queries, presumably included to surface
    # non-instructional titles as negative examples; confirm intent.
    "b team",
    "gordon ryan vs dillon danis",
    # Judo and wrestling takedown queries
    "uchi mata judo",
    "blast double leg takedown",
    "arm drag to single leg takedown",
    "ankle pick takedown",
    "cross collar takedown"
]

# Run every query and gather the returned titles into one list.
video_titles = yt_query_search_terms(search_terms)

### 4. Format and Split Data into Test & Training sets

In [294]:
# Format into the chat "messages" layout accepted for fine-tuning:
# one {"messages": [system, user, ...]} record per video title.
data = []
system_role = "You are helpful sentiment analysis assistant whose sole purpose is to determine if the provided YouTube video titles are Brazilian Ju-Jitsu, Judo, or Wrestling instructionial videos. I only want you to give 'True' or 'False' answers with no additional information."

for video in video_titles:
    # Entries may be {"prompt": title, "completion": ""} dicts (as built by
    # yt_get_titles) or plain title strings. The user turn must carry the
    # title text itself, never the whole dict.
    title = video["prompt"] if isinstance(video, dict) else video
    system = {"role": "system", "content": system_role}
    user = {"role": "user", "content": title}
    data.append({"messages": [system, user]})

# Use ChatGPT to generate initial label outputs (appended as assistant turns)
data = openai_perform_request(data)

Getting responses. This may take a few minutes...
Finished


In [306]:
# Separate into train and test sets and write each to its own JSONL file.
import json
import math
import os
import random

random.shuffle(data)

# 80/20 split. The test set is everything after the training slice, so the
# two sets can never overlap and together cover every example, whatever the
# dataset size.
train_size = math.ceil(len(data) * 0.80)
test_size = len(data) - train_size

train_data = data[:train_size]
test_data = data[train_size:]

# Make sure the output directory exists before opening files inside it.
os.makedirs("data", exist_ok=True)

# Create train & test jsonl files (one JSON object per line). The `with`
# block closes each file on exit, so no explicit close() is needed.
with open("data/test.jsonl", "w") as file:
    for line in test_data:
        file.write(json.dumps(line))
        file.write("\n")

with open("data/train.jsonl", "w") as file:
    for line in train_data:
        file.write(json.dumps(line))
        file.write("\n")

At this point, I have cleanly separated my test and training data. However, because I used ChatGPT to quickly create the initial labels, they may not be accurate. I will need to perform a manual check and correct them where applicable.

### 5. Setup

In [181]:
import os
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()



In [314]:
train_data

[{'messages': [{'role': 'system',
    'content': "You are helpful sentiment analysis assistant whose sole purpose is to determine if the provided YouTube video titles are Brazilian Ju-Jitsu, Judo, or Wrestling instructionial videos. I only want you to give 'True' or 'False' answers with no additional information."},
   {'role': 'user',
    'content': 'How hard is it to break a leg with a leglock? | John Danaher and Lex Fridman'},
   {'role': 'assistant', 'content': 'False'}]},
 {'messages': [{'role': 'system',
    'content': "You are helpful sentiment analysis assistant whose sole purpose is to determine if the provided YouTube video titles are Brazilian Ju-Jitsu, Judo, or Wrestling instructionial videos. I only want you to give 'True' or 'False' answers with no additional information."},
   {'role': 'user',
    'content': 'Double Leg Takedown Technique - MMA Manchester'},
   {'role': 'assistant', 'content': 'True'}]},
 {'messages': [{'role': 'system',
    'content': "You are helpful s

In [313]:
test_data

[{'messages': [{'role': 'system',
    'content': "You are helpful sentiment analysis assistant whose sole purpose is to determine if the provided YouTube video titles are Brazilian Ju-Jitsu, Judo, or Wrestling instructionial videos. I only want you to give 'True' or 'False' answers with no additional information."},
   {'role': 'user', 'content': 'How To Do The Darce Choke In Jiu Jitsu!'},
   {'role': 'assistant', 'content': 'True'}]},
 {'messages': [{'role': 'system',
    'content': "You are helpful sentiment analysis assistant whose sole purpose is to determine if the provided YouTube video titles are Brazilian Ju-Jitsu, Judo, or Wrestling instructionial videos. I only want you to give 'True' or 'False' answers with no additional information."},
   {'role': 'user',
    'content': 'Kouchi Gari Fake to Collar Drag to Single Leg Takedown'},
   {'role': 'assistant', 'content': 'False'}]},
 {'messages': [{'role': 'system',
    'content': "You are helpful sentiment analysis assistant whose

In [315]:
data

[{'messages': [{'role': 'system',
    'content': "You are helpful sentiment analysis assistant whose sole purpose is to determine if the provided YouTube video titles are Brazilian Ju-Jitsu, Judo, or Wrestling instructionial videos. I only want you to give 'True' or 'False' answers with no additional information."},
   {'role': 'user',
    'content': 'How hard is it to break a leg with a leglock? | John Danaher and Lex Fridman'},
   {'role': 'assistant', 'content': 'False'}]},
 {'messages': [{'role': 'system',
    'content': "You are helpful sentiment analysis assistant whose sole purpose is to determine if the provided YouTube video titles are Brazilian Ju-Jitsu, Judo, or Wrestling instructionial videos. I only want you to give 'True' or 'False' answers with no additional information."},
   {'role': 'user',
    'content': 'Double Leg Takedown Technique - MMA Manchester'},
   {'role': 'assistant', 'content': 'True'}]},
 {'messages': [{'role': 'system',
    'content': "You are helpful s