In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
import pandas as pd

from get_api_key import get_api_key
openai, anthropic, gemini = get_api_key()

OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AI


In [None]:
# imports for langchain and Chroma and plotly

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import pandas as pd

In [None]:
dataset_url = "hf://datasets/spencer/python_slack/data/train-00000-of-00001.parquet"

df = pd.read_parquet(dataset_url)

In [46]:
df_essential = df[['sentences', 'user', 'channel']]
print(type(df_essential))

dict_array = df_essential.to_dict('records')
print(type(dict_array))
print(f"Total number of data: {len(dict_array)}")
print(f"First data: {type(dict_array[0])} / {dict_array[0]}")

<class 'pandas.core.frame.DataFrame'>
<class 'list'>
Total number of data: 106262
First data: <class 'dict'> / {'sentences': 'Is it possible to switch between conda and virtualenv? That is I want to switch the actual environment managers not just environments in them … I typically use conda but want to try something out that requires virtual env and is not compatible with conda.  Thanks.', 'user': 'Melvin', 'channel': 'help'}


In [31]:
# 딕셔너리 배열을 LangChain Document 객체 배열로 변환
# Document 객체는 page_content(텍스트 내용)와 metadata(추가 정보)를 가집니다
documents = []
for item in dict_array:
    doc = Document(
        page_content=item['sentences'],  # 텍스트 내용
        metadata={
            'user': item['user'],
            'channel': item['channel']
        }
    )
    documents.append(doc)


print(f"First document: {documents[0]}")
print(f"metadata: {documents[0].metadata}")

First document: page_content='Is it possible to switch between conda and virtualenv? That is I want to switch the actual environment managers not just environments in them … I typically use conda but want to try something out that requires virtual env and is not compatible with conda.  Thanks.' metadata={'user': 'Melvin', 'channel': 'help'}
metadata: {'user': 'Melvin', 'channel': 'help'}


In [27]:
# 이제 Document 객체들을 텍스트로 분할할 수 있습니다
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"Number of documents: {len(documents)}")
print(f"Number of chunks: {len(chunks)}")
print(f"\nFirst chunk:")
print(chunks[0])

Created a chunk of size 2550, which is longer than the specified 2000
Created a chunk of size 3443, which is longer than the specified 2000
Created a chunk of size 2812, which is longer than the specified 2000
Created a chunk of size 2402, which is longer than the specified 2000
Created a chunk of size 3166, which is longer than the specified 2000
Created a chunk of size 2939, which is longer than the specified 2000
Created a chunk of size 3929, which is longer than the specified 2000


Number of documents: 106262
Number of chunks: 106330

First chunk:
page_content='Is it possible to switch between conda and virtualenv? That is I want to switch the actual environment managers not just environments in them … I typically use conda but want to try something out that requires virtual env and is not compatible with conda.  Thanks.' metadata={'user': 'Melvin', 'channel': 'help'}


In [43]:
channel_names = set(chunk.metadata['channel'] for chunk in chunks)
users = set(chunk.metadata['user'] for chunk in chunks)
print(f"Channel Names found: {', '.join(channel_names)}")
print(f"Total Number of users found: {len(users)}\nUsers found: {', '.join(users)}")

Channel Names found: help
Total Number of users found: 2663
Users found: Fidelia, Shad, Inger, Jessie, Porsha, Tandra, Victor, Gerri, Derick, Verline, Digna, Louella, Lavette, Raphael, Royce, Myra, Kristyn, Marquita, Prudence, Vinita, Syble, Heriberto, Myung, Loida, Briana, Joann, Nohemi, Natosha, Ena, Star, Junita, Chaya, Susanne, Merlin, Merilyn, Tammi, Ellen, Kelli, Lovella, Collen, Glinda, Diana, Harrison, Leone, Broderick, Titus, Felicita, Annemarie, Coral, Ela, Dee, Mellisa, Ben, Felisa, Layla, Renetta, Romona, Catherine, Shona, Tara, Errol, Marlon, Amie, Luci, Jed, Ray, Val, Tien, Gertrudis, Rhona, Catina, Darcie, Carisa, Samuel, Mandi, Noreen, Yesenia, Iluminada, Luciano, Tami, Vanda, Lashunda, Bethany, Valery, Cleveland, Rochel, Piper, Doug, Ashlee, Marcos, Emory, Bernita, Mariano, Flor, Irish, Oswaldo, Dominque, Delilah, Kristine, Simone, Josefina, Desire, Leonie, Malia, Florene, Lavinia, Maddie, Tora, Jerry, Melanie, Sana, Li, Kali, Dortha, Roscoe, Lanelle, Sherice, Camie, B

In [38]:
MODEL = "gpt-4o-mini"
db_name = "slack_db"

# Load environment variables in a file called .env
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

embeddings = OpenAIEmbeddings()

In [39]:
# Check if a Chroma Datastore already exists - if so, delete the collection to start from scratch

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create our Chroma vectorstore!

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 106330 documents


In [40]:
# Get one vector and find how many dimensions it has

collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")
print(f"\nSample Embedding: {sample_embedding}")

The vectors have 1,536 dimensions

Sample Embedding: [-0.0099178  -0.01245001  0.02408233 ...  0.00627776 -0.01184993
 -0.01436235]


In [44]:
# Prework

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
channel_names = [metadata['channel'] for metadata in result['metadatas']]
colors = [['blue'][['help'].index(t)] for t in channel_names]

In [45]:
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()