In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import sys
sys.path.append('../../')

In [96]:
from src.preprocessor.preprocessing import FileIO, Utilities
from src.llm.llm_interface import LLM
from src.llm.llm_utils import get_token_count
from tqdm import tqdm

import os
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [28]:
# Load the raw podcast episode records from disk.
# NOTE(review): presumably a list of per-episode dicts, since `data` is later
# passed to extract_fields(data: list[dict]) — confirm against FileIO.load_json.
data_path = '../../data/huberman_labs.json'
data = FileIO().load_json(data_path)

In [29]:
def extract_fields(data: list[dict],
                   extract_keys: list[str]=['title','short_description', 'content'],
                   content_len: int=2500
                  ) -> list[dict]:
    """Project each record onto a subset of keys, truncating long values.

    Args:
        data: Records to process; each record is a dict.
        extract_keys: Keys to keep. Keys missing from a record are skipped
            silently. The default list is only read, never mutated.
        content_len: Maximum length kept for every sliceable value
            (strings, bytes, lists, tuples).

    Returns:
        A new list with one dict per input record, holding only the requested
        keys in the record's own key order. Input records are not modified.
    """
    # Only sequence-like values can be truncated by slicing. Numbers, None,
    # bools and nested dicts are passed through untouched instead of raising
    # TypeError on `value[:content_len]`.
    sliceable = (str, bytes, list, tuple)

    def _truncate(value):
        return value[:content_len] if isinstance(value, sliceable) else value

    return [
        {key: _truncate(value) for key, value in record.items() if key in extract_keys}
        for record in data
    ]

In [36]:
# Keep only the default fields (title, short_description, content) of every
# episode, each value cut to the default 2500 characters, to bound prompt size.
episodes = extract_fields(data)

In [20]:
# System prompt: passed as the first argument to llm.achat_completion in get_guest.
guest_extraction_system = '''
Your primary goal is to extract guest information from pieces of text.  
Specifically you are an expert at extracting guest names from podcast shows.
'''

In [34]:
# Per-episode prompt template. get_guest fills the {title}, {summary} and
# {transcript} placeholders via str.format, so any other literal braces added
# to this text would need to be doubled ({{ }}).
# Because this is a non-raw string, each "\n" below is a real newline in
# addition to the line break that follows it.
assistant_prompt = '''
You will be given the title, a summary, and an initial snippet of text from a podcast transcript from the Huberman Labs podcast.  
Using the title, summary, and context text, extract the guest of the show.  If the guest is present simply return the guest name. 
Do not provide any other information or explanation, simply return the extracted guests name. 
If there is no guest name given the information provided or if the title starts with the word AMA (which stands for Ask Me Anything), 
then return the name "Andrew Huberman". Again if no guest information is found, then return the name "Andrew Huberman". Do not reply
with any other information other than either the guest name or the name "Andrew Huberman". 
```
Title:\n
{title}
---------
Summary:\n
{summary}
---------
Transcript:\n
{transcript}
---------

Guest:
```
'''

In [106]:
# Shared LLM client used by get_guest; the model is pinned by name.
llm = LLM(model_name='gpt-3.5-turbo-0125')

In [107]:
# Token count for one request's ingredients: both prompts plus the first
# episode's content and short description (the title is not included).
get_token_count([guest_extraction_system, assistant_prompt, episodes[0]['content'], episodes[0]['short_description']])

1327

In [108]:
async def get_guest(episode: dict, assistant_base: str):
    """Ask the LLM which guest appears in a single episode.

    Args:
        episode: Record with ``title``, ``short_description`` and ``content``
            keys; a missing key raises ``KeyError``.
        assistant_base: Template containing ``{title}``, ``{summary}`` and
            ``{transcript}`` placeholders.

    Returns:
        Whatever ``llm.achat_completion`` returns with ``raw_response=False``
        (presumably the guest name as text — confirm against the LLM class).
    """
    # Map template placeholder names to the episode fields that fill them.
    prompt_fields = {
        'title': episode['title'],
        'summary': episode['short_description'],
        'transcript': episode['content'],
    }
    filled_prompt = assistant_base.format(**prompt_fields)

    # Uses the notebook-level `llm` client and `guest_extraction_system` prompt.
    return await llm.achat_completion(
        guest_extraction_system,
        filled_prompt,
        max_tokens=25,
        temperature=0.0,
        raw_response=False,
    )

In [109]:
async def guest_tasks(episodes: list[dict], assistant_base: str):
    """Run ``get_guest`` for every episode concurrently.

    Args:
        episodes: Episode records, forwarded one by one to ``get_guest``.
        assistant_base: Prompt template forwarded unchanged to ``get_guest``.

    Returns:
        A list of ``get_guest`` results in the same order as ``episodes``.
    """
    # All coroutines are created up front and awaited together; gather
    # preserves input order in its result list.
    pending = [get_guest(item, assistant_base) for item in episodes]
    return await asyncio.gather(*pending)

In [110]:
%%time
import time
time.sleep(60)
last_100 = asyncio.run(guest_tasks(episodes[100:], assistant_prompt))

CPU times: user 3.89 s, sys: 220 ms, total: 4.11 s
Wall time: 1min 6s


In [111]:
# Number of results returned for episodes[100:].
len(last_100)

93

In [114]:
# Total number of extracted guests across both batches.
len(first_100 + last_100)

193

In [116]:
# Concatenate the two batches into one list.
# NOTE(review): assumes first_100 holds the results for episodes[:100] so that
# guests[i] lines up with episode i — confirm before relying on the order.
guests = first_100 + last_100

In [117]:
import json

# Persist the extracted guest list as pretty-printed JSON (4-space indent).
with open('../../data/guests.json', 'w') as guests_file:
    guests_json = json.dumps(guests, indent=4)
    guests_file.write(guests_json)

### Combine Metadata

In [118]:
# Reload the full, untruncated episode records from the same file that was
# read into `data` earlier, so the guest field is added to complete records.
raw_data = FileIO().load_json('../../data/huberman_labs.json')
# NOTE(review): `utils` is not used by any cell in this notebook.
utils = Utilities()

In [124]:
# Attach each extracted guest to its episode record by position.
# Check the lengths first: with fewer guests than records the loop would fail
# with IndexError after already mutating some records, and with more guests
# the extras would be dropped silently.
if len(guests) != len(raw_data):
    raise ValueError(
        f'guest/episode count mismatch: {len(guests)} guests for {len(raw_data)} episodes'
    )

for d, guest in zip(raw_data, guests):
    d['guest'] = guest

In [125]:
# Write the enriched records back over the source data file.
# Serialize BEFORE opening anything for writing: opening the target with 'w'
# truncates it immediately, so a failure inside json.dumps would otherwise
# leave the original data file empty.
output_path = '../../data/huberman_labs.json'
serialized = json.dumps(raw_data, indent=2)

# Write to a sibling temp file and swap it in, so an interrupted write never
# leaves a half-written data file behind.
tmp_path = output_path + '.tmp'
with open(tmp_path, 'w') as f:
    f.write(serialized)
os.replace(tmp_path, output_path)

Bad pipe message: %s [b"W!L}\xe7J\xb4\xf26\xa5\xb9B\xfe_\xeb4\x01\xa3\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00", b'\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00']
Bad pipe message: %s [b'\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18']
Bad pipe message: %s [b'\x89\x14N\x88cD\x05\xd7\x8c\xee\x8a\xbf\xfc\xbbs\xbd\x12{\x00\x00>\xc0\x14\xc0\n\x009\x008\x007\x006\xc0\x0f\xc0\x05\x005\xc0\x13\xc0\t\x003\x002\x001\x000\xc0\x0e\xc0\x04\x00/\x00\x9a\x00\x99\x00\x98\x00\x97\x00\x96\x00\x07\xc0\x11\xc0\x07\xc0\x0c\xc0\x02\x00\x05\x00\x04\x00\xff\x02\x01\x00\x00C\x00\x00\x00\x0e\x00\x0c\x00

In [123]:
# Spot-check one extracted name (this one contains a non-ASCII character).
guests[62]

'Dr. Susanna Søberg'