In [None]:
import datetime
import os
from time import sleep
from uuid import uuid4

import openai
import pandas
import pinecone
import pysrt
import tiktoken
from IPython.display import Markdown, display
from langchain.document_loaders import SRTLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.auto import tqdm



## Loading Subtitle Files into Langchain Documents

In [None]:
# Load one transcript on its own as a quick check of the SRT loader
sample_srt_path = "transcripts/Behind the Scenes of Yes Theory ft Thomas Dajer.srt"
loader = SRTLoader(sample_srt_path)
doc = loader.load()

In [None]:
#Load in all the transcripts into langchain documents
direct = "path/to/transcripts"
docs = []
for filename in os.listdir(direct):
  title, extension = os.path.splitext(filename)
  if extension.lower() != ".srt":
    # Skip anything in the folder that is not a subtitle file
    continue
  full_path = os.path.join(direct, filename)
  loader = SRTLoader(full_path)
  result = loader.load()
  # Tag every returned document with the episode title (file name without
  # extension); looping avoids an IndexError if the loader returns nothing
  for document in result:
    document.metadata["name"] = title
  docs.extend(result)

In [None]:
# Number of loaded documents (the loading loop adds the documents of each transcript file)
len(docs)

199

## Splitting the Documents into Chunks

In [None]:
import tiktoken

# Encoder for the 'cl100k_base' encoding, used below to count tokens
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with `tokenizer`.

    `disallowed_special=()` is passed so that no special-token text in the
    input is treated as disallowed during encoding.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [None]:
# Token count of one sample document (index 10)
sample_text = docs[10].page_content
leng = tiktoken_len(sample_text)
print(leng)

16968


In [None]:
# Splitter configuration: lengths are computed with tiktoken_len
splitter_settings = {
    'chunk_size': 250,
    'chunk_overlap': 25,  # overlap between neighbouring chunks
    'length_function': tiktoken_len,
    'separators': ['.', '?', " ", ""],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [None]:
# Character count of the same sample document (index 10) whose tokens were counted above
len(docs[10].page_content)

80392

In [None]:
#Split every document into chunks and store each chunk as a dictionary
chunks = []

# NOTE(review): docs[1:] leaves out the first loaded document - confirm this is intentional
for record in tqdm(docs[1:]):
    texts = text_splitter.split_text(record.page_content)
    for position, piece in enumerate(texts):
        chunks.append({
            'id': str(uuid4()), #unique id to each chunk
            'text': piece,
            'chunk': position, #index of the chunk within its document
            'name': record.metadata['name'] #title of podcast
        })
     

In [None]:
# Inspect the first chunk; each chunk is a dict, so it has no `.text` attribute
chunks[0]

{'id': 'a5c12d75-17eb-4efc-8d51-89d2f7b31937',
 'text': "Hey podcast listeners, it's Samir. I want to tell you about a company that's supporting the show this year, Spotter. Spotter has paid out $740 million to creators like MrBeast destroying Dude Perfect, Airac, and 400 more through catalog licensing deals. Their goal is to accelerate creators through capital and knowledge. So if you want to learn more about Spotter and the work that we're doing with them this year, go to spotter.com slash Colin and Samir. All right, enjoy the show. Hey podcast listeners, this is not a full episode. Colin and I are both still on break, but we did want to ask you for your help. Launching this podcast has been such an amazing experience, and a lot of that has to do with all of you. All the messages, emails, tweets, and reviews we get really help us understand the type of community we're building, and honestly, this is something we've always wanted. A community of like-minded and thoughtful individuals.

## Embedding text and Upserting Data to Pinecone

In [None]:

# Connecting to OpenAI
# The key is read from the environment so it is never saved in the notebook
openai.api_key = os.environ["OPENAI_API_KEY"]  #platform.openai.com

embed_model = "text-embedding-ada-002"

# Example embedding request; the Pinecone setup reads the vector dimension from `res`
res = openai.Embedding.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], engine=embed_model
)
     

In [None]:
#Connecting to Pinecone Index
index_name = 'colinsamir'

# initialize connection to pinecone
# Credentials come from the environment so they are never saved in the notebook
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],  # app.pinecone.io (console)
    environment=os.environ["PINECONE_ENVIRONMENT"]  # next to API key in console
)

# check if index already exists
if index_name not in pinecone.list_indexes():
    # if does not exist, create index sized to the example embedding in `res`
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        metric='dotproduct'
    )
# connect to index
index = pinecone.GRPCIndex(index_name)
# view index stats
index.describe_index_stats()


In [None]:
#Uploading batches of data into pinecone

batch_size = 100  # how many embeddings we create and insert at once
max_retries = 8   # upper bound on retries per batch for transient API errors


def _embed_with_retry(texts, retries=max_retries, base_delay=5):
    """Embed `texts` with `embed_model`, retrying only on transient errors.

    Rate-limit, connection, timeout and service-unavailable errors are
    retried up to `retries` times, waiting `base_delay * attempt` seconds
    between attempts. Any other exception (bad API key, invalid request,
    KeyboardInterrupt, ...) propagates immediately instead of being retried
    forever. The last transient error is re-raised once retries run out.
    """
    transient_errors = (
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
        openai.error.APIConnectionError,
        openai.error.Timeout,
    )
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            return openai.Embedding.create(input=texts, engine=embed_model)
        except transient_errors:
            if attempt == attempts:
                raise
            sleep(base_delay * attempt)


for i in tqdm(range(0, len(chunks), batch_size)):
    # slicing clamps at the end of the list, so the last batch may be shorter
    meta_batch = chunks[i:i + batch_size]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings (bounded retry on transient errors such as RateLimitError)
    res = _embed_with_retry(texts)
    embeds = [record['embedding'] for record in res['data']]
    # cleanup metadata: keep only the fields stored alongside each vector
    meta_batch = [{
        'text': x['text'],
        'chunk': x['chunk'],
        'name': x['name']
    } for x in meta_batch]
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)
     

  0%|          | 0/102 [00:00<?, ?it/s]

## Search Function

In [None]:

def creatorAI(query, top_k=5):
  """Answer `query` from the indexed podcast chunks and render the reply.

  Steps:
    1. Embed the query with `embed_model`.
    2. Query the Pinecone `index` for the `top_k` closest chunks.
    3. Send the retrieved chunk texts plus the query to the chat model,
       together with a system prompt telling it to answer only from the
       supplied context.

  Args:
    query: the user's question as a string.
    top_k: number of matches requested from the index (default 5).

  Returns:
    The result of `display(...)`; the answer is rendered as Markdown in the
    notebook output rather than returned as a string.
  """
  res = openai.Embedding.create( # 1. Embed user query
    input=[query],
    engine=embed_model
  )
  xq = res['data'][0]['embedding']
  # 2. Search the Pinecone index with the embedded query
  res = index.query(xq, top_k=top_k, include_metadata=True)

  contexts = [item['metadata']['text'] for item in res['matches']]

  # Retrieved contexts first, then the question, separated by rule lines
  augmented_query = "\n\n---\n\n".join(contexts)+"\n\n-----\n\n"+query #Context + Query passed to LLM

  # highly specific instructions for the model to follow:
  primer = """You are the Creative Companion, a highly sophisticated AI system 
  that is crucial for answering user questions based on the context supplied by the user above 
  each query about a podcast about digital media creation and the creator economy. 
  It is of utmost importance that if the necessary information 
  is not available within the user-provided context, you respond simply with 'I don't know.' 
  Your knowledge is derived from segments of a podcast called 'The Colin and Samir Podcast,' 
  which delves into the creator economy.

  """

  res = openai.ChatCompletion.create( #Create the chat by passing in instructions and Contexts+Query
      model="gpt-4",
      messages=[
          {"role": "system", "content": primer},
          {"role": "user", "content": augmented_query}
      ],
      temperature = .2
  )

  answer = res['choices'][0]['message']['content']
  return display(Markdown(answer)) #Render the model's response as Markdown





In [None]:
# Example question sent through the search function defined above
creatorAI("How did ludwig grow on twitch and why did he switch to youtube?")

Ludwig grew on Twitch by creating content on other platforms like YouTube and then bringing those audiences to his Twitch streams. As mentioned in the podcast, he said, "You don't grow on Twitch. You grow on YouTube or you grow elsewhere. And then you bring people to Twitch." He also utilized the strategy of cutting up his Twitch streams and posting them on YouTube, which helped him gain more visibility and monetization.

Ludwig decided to switch to YouTube because of the exclusive deals offered to creators and the potential for better monetization. He also appreciated the open communication between creators and YouTube executives, which allowed him to voice his concerns and suggestions for improvements on the platform. This level of engagement with creators seemed to be lacking on Twitch.