In [1]:
import ast
import os

from dotenv import load_dotenv
from googleapiclient.discovery import build
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import CSVLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma

from logger import logger, log_function, log_rag_query
from rag import search_youtube_song


In [2]:
import pandas as pd
import numpy as np

# Read the ideas sheet. pandas represents empty cells as NaN, which is truthy in
# Python, so swap NaN for None to let plain truthiness checks work on the rows.
ideas_raw = pd.read_csv("../ideas.csv")
df = ideas_raw.replace({np.nan: None})

# One plain dict per CSV row, keyed by column name.
records = df.to_dict(orient='records')

def _nan_to_none(value):
    """Return None when *value* is a float NaN (pandas' empty-cell marker), else *value* unchanged."""
    # NaN is the only float that compares unequal to itself.
    if isinstance(value, float) and value != value:
        return None
    return value


def _parse_list_literal(value):
    """Return the list encoded in *value* (e.g. "['a', 'b']"), or None if it is not a list literal.

    The text comes straight from a CSV cell, so it is parsed with
    ``ast.literal_eval`` (literals only) instead of ``eval`` (arbitrary code).
    """
    if not (isinstance(value, str) and value.startswith('[')):
        return None
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not a literal after all, e.g. a single title such as "[Untitled] - Artist".
        return None
    return parsed if isinstance(parsed, list) else None


def update_youtube_links(records, csv_path="../ideas.csv"):
    """
    Update the YouTube links for the ideas where a song is listed and a link doesn't already exist.

    Args:
        records: list of row dicts with at least the keys 'Song', 'Timestamp'
            and 'Link'. Rows are modified in place. 'Song' (and optionally
            'Timestamp') may hold the string form of a Python list, in which
            case one link per song is looked up and 'Link' is set to the
            string form of the resulting list.
        csv_path: where the updated rows are written. Defaults to the original
            hard-coded location.

    Returns:
        None. The results are the mutated ``records`` and the CSV file.
    """
    for record in records:
        song = _nan_to_none(record['Song'])
        # Skip rows with no song, or with a link already filled in. NaN is
        # normalised first because it is truthy and would defeat both checks.
        if not song or _nan_to_none(record['Link']):
            continue

        timestamp = _nan_to_none(record['Timestamp'])
        songs = _parse_list_literal(song)

        if songs is None:
            # Single song in this row.
            record['Link'] = search_youtube_song(song, timestamp)
            logger.info(f"Found YouTube link for {song}")
            continue

        # Several songs in this row. A missing or non-list timestamp means
        # "no timestamp" for every song. A list shorter than `songs` is padded
        # with None so zip() cannot silently drop the trailing songs.
        timestamps = _parse_list_literal(timestamp) or []
        timestamps = (timestamps + [None] * len(songs))[:len(songs)]

        links = []
        for title, start in zip(songs, timestamps):
            links.append(search_youtube_song(title, start))
            logger.info(f"Found YouTube link for {title}")
        record['Link'] = str(links)

    # Save the updated rows back to CSV
    pd.DataFrame(records).to_csv(csv_path, index=False)
    logger.info(f"Saved updated {os.path.basename(csv_path)}")




2024-10-25 17:10:14,660 - INFO - file_cache is only supported with oauth2client<4.0.0
2024-10-25 17:10:14,663 - INFO - Searching YouTube for: Flume - Ezra official
2024-10-25 17:10:15,381 - INFO - Found YouTube video: https://www.youtube.com/watch?v=MGtKETJIcZs
2024-10-25 17:10:15,384 - INFO - Found YouTube link for Flume - Ezra
2024-10-25 17:10:15,386 - INFO - file_cache is only supported with oauth2client<4.0.0
2024-10-25 17:10:15,391 - INFO - Searching YouTube for: Flume - More than you thought official
2024-10-25 17:10:16,129 - INFO - Found YouTube video: https://www.youtube.com/watch?v=Nf5qRewILzc
2024-10-25 17:10:16,131 - INFO - Found YouTube link for Flume - More than you thought
2024-10-25 17:10:16,133 - INFO - file_cache is only supported with oauth2client<4.0.0
2024-10-25 17:10:16,136 - INFO - Searching YouTube for: #1 Dads - Camberwall official
2024-10-25 17:10:16,890 - INFO - Found YouTube video: https://www.youtube.com/watch?v=ZQolhBeaqqQ
2024-10-25 17:10:16,892 - INFO - F

In [5]:
# NOTE(review): leftover debugging cell. `record` is never assigned at notebook
# scope in the cells visible here, so this raises NameError on a fresh
# "Restart & Run All" — confirm and remove.
record


{'Category': 'Lyrics',
 'Technique': 'Write a character/event profile',
 'Description': 'Expand the story separate to the lyrics and see if anything else comes out of it',
 'Example': nan,
 'Song': 'Bleachers - Dizzy',
 'Timestamp': nan,
 'Link': nan}

In [6]:
# NOTE(review): leftover debugging cell. `link` is never assigned at notebook
# scope in the cells visible here, so this raises NameError on a fresh
# "Restart & Run All" — confirm and remove.
link

'https://www.youtube.com/watch?v=wIT_LoegOv4'

In [None]:


# Load the ideas CSV into LangChain documents for indexing.
loader = CSVLoader(file_path="../ideas.csv")
documents = loader.load()

# Show a short preview instead of dumping the whole list into the cell output.
documents[:3]


In [None]:

# Embedding model used to vectorise the documents; embeddings are normalised.
embedding_model = HuggingFaceBgeEmbeddings(
    model_name="BAAI/llm-embedder",
    # NOTE(review): "cuda" requires a GPU-enabled environment; use "cpu" where none is available.
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)

# `texts` was referenced below without being defined anywhere in the visible
# cells (NameError on a fresh kernel). Derive it from the loaded `documents`
# via the already-imported CharacterTextSplitter.
# NOTE(review): default splitter settings are an assumption — confirm the
# intended chunk size / overlap.
texts = CharacterTextSplitter().split_documents(documents)

vectorstore = Chroma.from_documents(texts, embedding_model)