# Lex Podcast Dataset Processing

In [None]:
# Source parquet:
# https://huggingface.co/datasets/Whispering-GPT/lex-fridman-podcast/tree/main/data
# Lex dataset conversion: parquet -> CSV
import pandas as pd
import numpy as np
from pathlib import Path

# Write the CSV where the later cells read it from ("./data/lex.csv"),
# creating the directory first so a fresh checkout can run top to bottom.
csv_path = Path('./data/lex.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)

df = pd.read_parquet('train-00000-of-00001-25f40520d4548308.parquet')
# The DataFrame index is written too (to_csv default), so the CSV starts with an
# unnamed index column.
df.to_csv(csv_path)

In [1]:
# Setup env
# Statement order matters in this cell: the sqlite3 swap has to happen before
# anything imports `sqlite3`.
import pysqlite3
import sys
# Register the already-imported pysqlite3 module under the name "sqlite3" (and drop
# its "pysqlite3" entry), so any later `import sqlite3` resolves to pysqlite3.
# NOTE(review): presumably needed because chromadb wants a newer SQLite than the
# system one provides - confirm.
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
import os
import google.generativeai as genai
# The API key is read from the environment; a missing GOOGLE_API_KEY raises KeyError here.
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Imports
import csv
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from chromadb import Documents, EmbeddingFunction, Embeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
import chromadb
import time

In [3]:
# Load data
# Each transcript is one very large CSV field, so raise the csv module's field size cap.
csv.field_size_limit(100000000)

# newline="" lets the csv module handle newlines embedded in quoted fields itself;
# the encoding is pinned so the result does not depend on the platform default.
with open("./data/lex.csv", "r", newline="", encoding="utf-8") as csvfile:
    datareader = csv.reader(csvfile)
    header = next(datareader)
    # Locate the transcript column by name instead of by position.
    text_col = header.index("text")
    all_text = [row[text_col] for row in datareader]

# Preview only the beginning of the first transcript rather than dumping all of it.
print(all_text[0][:1000])
        

 The following is a conversation with Jed Buckwald, a professor of history and a philosopher of science at Caltech. Interested especially in the development of scientific concepts and the instruments used to create and explore new effects and ideas in science. To support this podcast, please check out our sponsors in the description. This is the Lex Friedman podcast and here is my conversation with Jed Buckwald. Does science progress via paradigm shifts and revolutions as philosopher Thomas Kuhn said, or does it progress gradually? What do you think? Well, I got into this field because I was Tom Kuhn's research assistant 50 years ago, 52 years ago. He pulled me into it out of physics instead. So I know his work pretty well. And in the years when I was at MIT running an institute, he was then in the philosophy department, used to come over all the time to the talks we held and so on. So what would I say about that? He, of course, developed his ideas a lot over the years. The thing that 

In [4]:
# Split data
# Cut every transcript into chunks of at most 800 characters, with 50 characters
# shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=50,
)
# One Document per chunk, across all transcripts.
all_chunks = text_splitter.create_documents(all_text)


In [10]:

# Batch data
# Number of chunks sent to chromadb per add() call; kept at 1300 because of the
# embedding API rate limit.
BATCH_SIZE = 1300

batched_list = []  # one entry per batch: list of chunk texts
index_list = []    # parallel to batched_list: list of string ids for that batch

for start in range(0, len(all_chunks), BATCH_SIZE):
    # Texts of the chunks in this batch (the last batch may be shorter).
    batch = [item.page_content for item in all_chunks[start:start + BATCH_SIZE]]
    batched_list.append(batch)

    # Ids are the chunks' positions in all_chunks, as strings. Deriving the range
    # from len(batch) keeps ids and documents the same length by construction.
    index_list.append([str(idx) for idx in range(start, start + len(batch))])


In [6]:
# Sanity check on the final batch: number of ids, number of texts,
# then the first text and the first id, one per line.
print(
    len(index_list[-1]),
    len(batched_list[-1]),
    batched_list[-1][0],
    index_list[-1][0],
    sep="\n",
)

1044
1044
is there something beyond mere equilibrium state biology and chemistry and biochemistry that drives what makes things work? I think there is. So Adrian Bajan has this Constructo Law. There's other things you look at when you look at the life sciences and you look at any kind of statistical physics and statistical mechanics. When you look at things far out of equilibrium, when you have excess energy, what happens then? Life doesn't just make a hot soup. It starts making structure. There's something there. The poetry of reaches for order when there's an excess of energy because you brought up game of life. You did it, not me. I love cellular automata, so I have to sort of linger on that for a little bit. So cellular automata, I guess, or game of life is a very simple example of reaching for
55900


In [7]:
# Setup embedding function
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to ``genai.embed_content``.

    Every call uses the ``models/text-embedding-004`` model with the
    ``retrieval_document`` task type and the fixed title ``"Custom query"``.
    """

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return the ``"embedding"`` entry of the response."""
        response = genai.embed_content(
            model="models/text-embedding-004",
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

In [12]:

# Get client and process files
# Pause between add() calls; the batches are sized and spaced for the rate limit.
SECONDS_BETWEEN_BATCHES = 60

chroma_client = chromadb.PersistentClient(path="contexts")
db = chroma_client.get_or_create_collection(name="lex", embedding_function=GeminiEmbeddingFunction())

last_batch_no = len(index_list) - 1
for batch_no, (batch_ids, batch_docs) in enumerate(zip(index_list, batched_list)):
    print(f"Start: {batch_ids[0]}")
    db.add(
        documents=batch_docs,
        ids=batch_ids,
    )
    # Report completion as soon as the batch is stored, not after the pause.
    print(f"Finish: {batch_ids[0]}")
    # No need to wait once the final batch has been added.
    if batch_no < last_batch_no:
        time.sleep(SECONDS_BETWEEN_BATCHES)



Start: 0
Finish: 0
Start: 1300
Finish: 1300
Start: 2600
Finish: 2600
Start: 3900
Finish: 3900
Start: 5200
Finish: 5200
Start: 6500
Finish: 6500
Start: 7800
Finish: 7800
Start: 9100
Finish: 9100
Start: 10400
Finish: 10400
Start: 11700
Finish: 11700
Start: 13000
Finish: 13000
Start: 14300
Finish: 14300
Start: 15600
Finish: 15600
Start: 16900
Finish: 16900
Start: 18200
Finish: 18200
Start: 19500
Finish: 19500
Start: 20800
Finish: 20800
Start: 22100
Finish: 22100
Start: 23400
Finish: 23400
Start: 24700
Finish: 24700
Start: 26000
Finish: 26000
Start: 27300
Finish: 27300
Start: 28600
Finish: 28600
Start: 29900
Finish: 29900
Start: 31200
Finish: 31200
Start: 32500
Finish: 32500
Start: 33800
Finish: 33800
Start: 35100
Finish: 35100
Start: 36400
Finish: 36400
Start: 37700
Finish: 37700
Start: 39000
Finish: 39000
Start: 40300
Finish: 40300
Start: 41600
Finish: 41600
Start: 42900
Finish: 42900
Start: 44200
Finish: 44200
Start: 45500
Finish: 45500
Start: 46800
Finish: 46800
Start: 48100
Finish: 48

In [13]:
# Number of records in the collection. count() asks the collection directly instead
# of fetching every record with get() just to measure the id list.
print(db.count())
# Show the first 3 stored records.
db.peek(3)

56944


{'ids': ['0', '1', '2'],
 'embeddings': array([[-0.00012366, -0.02054776, -0.0640145 , ..., -0.02011356,
          0.03430229,  0.01769254],
        [-0.04914856,  0.00249596, -0.03986755, ..., -0.00587873,
          0.03537952,  0.00661766],
        [ 0.00388132, -0.02367176, -0.07034157, ..., -0.01211566,
          0.03348244, -0.02950112]]),
 'metadatas': [None, None, None],
 'documents': ["The following is a conversation with Jed Buckwald, a professor of history and a philosopher of science at Caltech. Interested especially in the development of scientific concepts and the instruments used to create and explore new effects and ideas in science. To support this podcast, please check out our sponsors in the description. This is the Lex Friedman podcast and here is my conversation with Jed Buckwald. Does science progress via paradigm shifts and revolutions as philosopher Thomas Kuhn said, or does it progress gradually? What do you think? Well, I got into this field because I was Tom K

# PDF File Processing

In [6]:
# Embedding model used for the PDF chunks.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
)

# Load the PDF into documents.
file_path = "LLM_ discussion.pdf"
loader = PyPDFLoader(file_path)
data = loader.load()

# Split the loaded documents into chunks of at most 500 characters with 50 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(data)

# Store the chunks in the "temp" collection of the persistent client at "contexts".
chroma_client = chromadb.PersistentClient(path="contexts")
db = Chroma.from_documents(
    client=chroma_client,
    collection_name="temp",
    documents=texts,
    embedding=embeddings,
)

In [21]:
# Retriever over the vector store using the "mmr" search type, returning 5 results per query.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 5})

# Example query.
retriever.invoke("Power")

[]

In [11]:
# WARNING: destructive. Removes the "lex" collection from the persistent client.
# Running this after the ingestion cell discards everything that cell added, so
# the embeddings would have to be rebuilt.
chroma_client.delete_collection(name="lex")

In [3]:
import pandas as pd

# Load the lex.csv file
file_path = './data/lex.csv'
df = pd.read_csv(file_path)

# Whitespace-separated word count per row of the 'text' column.
# Missing text is replaced by "" first so it counts as 0 words; str(NaN).split()
# would otherwise count the literal "nan" as one word.
df['word_count'] = df['text'].fillna('').astype(str).str.split().str.len()

# Total and average word count over all rows
total_word_count = df['word_count'].sum()
average_word_count = df['word_count'].mean()

# Print the results
print(f"Total word count in the 'text' column: {total_word_count}")
print(f"Average word count in the 'text' column: {average_word_count}")



Total word count in the 'text' column: 7770912
Average word count in the 'text' column: 22459.28323699422


In [5]:
# Non-null value count for every column.
df.count(axis="index")

Unnamed: 0     346
id             346
channel        346
channel_id     346
title          346
categories     346
tags           346
description    281
text           346
segments         0
word_count     346
dtype: int64