# Create AI-Tutor vector database

In [1]:
import os

# The OpenAI client reads "OPENAI_API_KEY" from the environment later on.
# Use setdefault so that a real key already exported in the environment is not
# overwritten by the "sk-" placeholder; replace the placeholder only if no key is set.
os.environ.setdefault("OPENAI_API_KEY", "sk-")

In [2]:
import nest_asyncio

# Patch asyncio so an event loop can be re-entered from inside the notebook's own
# running loop (presumably required by the use_async=True index build further down).
nest_asyncio.apply()

In [3]:
from llama_index.llms.openai import OpenAI

# Chat model configuration: gpt-3.5-turbo, temperature 0.9, answers capped at 512 tokens.
# NOTE(review): `llm` is not passed to any query engine in the cells shown here —
# confirm whether it is meant to be used (e.g. as the engine's LLM) or can be removed.
llm = OpenAI(temperature=0.9, model="gpt-3.5-turbo", max_tokens=512)

In [4]:
import chromadb

# Create a client that persists the collection to disk under ./ai-tutor-db
# (chromadb.EphemeralClient would keep the data in memory only).
chroma_client = chromadb.PersistentClient(path="./ai-tutor-db")
# create_collection() raises when the collection already exists, so re-running this
# cell against an existing database failed. get_or_create_collection() reuses the
# existing collection and matches how the collection is opened when loading from disk.
chroma_collection = chroma_client.get_or_create_collection("ai-tutor-db")

In [5]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# Wrap the Chroma collection in a LlamaIndex vector store and expose it through a
# storage context; the storage context is what gets handed to the index build later.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)



In [6]:
import os
import csv
from llama_index.core.schema import TextNode

def load_csv_files_from_directory(directory):
    """Load every ``.csv`` file in ``directory`` into a flat list of ``TextNode`` objects.

    Each data row becomes one node: the ``content`` column is the node text and the
    ``title``, ``url`` and ``source`` columns become its metadata. Columns are located
    by header name, so their order may differ from file to file.

    Files are read in sorted filename order so that the sequential node IDs
    (``node_0``, ``node_1``, ...) are reproducible between runs; ``os.listdir`` on
    its own returns entries in arbitrary order.

    Args:
        directory: Path of the directory to scan (sub-directories are not visited).

    Returns:
        A list of ``TextNode`` objects in file order, then row order. Files that are
        empty or lack any of the four required columns contribute no nodes, and rows
        too short to hold every required column (e.g. blank lines) are skipped.
    """
    required_columns = ("title", "url", "content", "source")
    nodes = []

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(directory, filename)
        # newline='' is what the csv module expects for file objects, so that
        # newlines embedded in quoted fields are parsed correctly.
        with open(filepath, mode='r', encoding='utf-8', newline='') as file:
            csv_reader = csv.reader(file)
            headers = next(csv_reader, None)  # None when the file is empty

            # Skip files with no header row or without every required column.
            if headers is None or not all(col in headers for col in required_columns):
                continue

            # Resolve the column positions once per file rather than once per row.
            title_idx, url_idx, content_idx, source_idx = (
                headers.index(col) for col in required_columns
            )
            min_row_length = max(title_idx, url_idx, content_idx, source_idx) + 1

            for row in csv_reader:
                # csv.reader yields [] for blank lines; truncated rows are skipped too.
                if len(row) < min_row_length:
                    continue
                nodes.append(
                    TextNode(
                        text=row[content_idx],
                        metadata={
                            "title": row[title_idx],
                            "url": row[url_idx],
                            "source": row[source_idx],
                        },
                        # IDs are sequential across all files, not per file.
                        id_=f"node_{len(nodes)}",
                    )
                )

    return nodes

In [7]:
directory_path = '../data/ai-tutor-csv-files'
nodes = load_csv_files_from_directory(directory_path)

# Fail with a clear message instead of an IndexError when nothing was loaded.
assert nodes, f"No nodes were loaded from {directory_path!r}"

# Spot-check the first node and one from further back in the list. The offset is capped
# at len(nodes) so the cell also runs on datasets with fewer than 5000 nodes.
sample_offset = min(5000, len(nodes))
for node in (nodes[0], nodes[-sample_offset]):
    print(f"ID: {node.id_} \nText: {node.text}, \nMetadata: {node.metadata}")

ID: node_0 
Text: # Introduction
This lesson will explore the powerful concept of LangChain memory, which is designed to help chatbots maintain context and improve their conversational capabilities in more details. The traditional approach to chatbot development involves processing user prompts independently and without considering the history of interactions. This can lead to disjointed and unsatisfactory user experiences. LangChain provides memory components to manage and manipulate previous chat messages and incorporate them into chains. This is crucial for chatbots, which require remembering the prior interactions. ![ Image by Midjourney](Mastering%20Memory%20Types%20in%20LangChain%20A%20Comprehensiv%209a0515e0407345888439a8c036e47e43/membot.png) Image by Midjourney By default, LLMs are stateless, which means they process each incoming query in isolation, without considering previous interactions. To overcome this limitation, LangChain offers a standard interface for memory, a vari

In [8]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex

# Generate OpenAI embeddings for every node and build the index on top of the
# storage context defined earlier; nodes are inserted in batches of 1000.
index = VectorStoreIndex(
    nodes=nodes,
    show_progress=True,
    use_async=True,
    storage_context=storage_context,
    embed_model=OpenAIEmbedding(),
    insert_batch_size=1000,
)

  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  5.27it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  7.23it/s]
Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 10.93it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  6.51it/s]
Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 10.74it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  9.41it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  8.36it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  6.57it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  7.08it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  9.90it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  8.22it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  6.77it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  6.02it/s]
Generating embeddings: 100%

In [9]:
# `similarity_top_k` is the option that controls how many nodes are retrieved per query.
# The previous `top_k=5` keyword is not the retriever's parameter name, so the default
# retrieval count was used instead of 5.
query_engine = index.as_query_engine(similarity_top_k=5)

In [31]:
# Run a sample question through the query engine; `res` keeps the full response object.
res = query_engine.query("what can you tell me about the llama2 llm")

In [32]:
# Display the answer text of the last query.
res.response

'I cannot provide an answer to the query as there is no relevant information or context provided about "llama2 llm" in the given text.'

In [28]:
# List the chunks the answer was built from, one delimited record per source node.
for src in res.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {src.metadata['title']}")
    print(f"Text\t {src.text}")
    print(f"Score\t {src.score}")
    print("-_" * 20)

Node ID	 node_1708
Title	 The Generative AI Revolution: Exploring the Current Landscape
Text	 1. OpenAI's GPT Models Notable Models Task specific models Find model information here: https://platform.openai.com/docs/models/gpt-3 Image & Audio Models OpenAI, the company behind the GPT models, is an AI research and deployment company. The San Francisco-based lab was founded in 2015 as a nonprofit with the goal of building "artificial general intelligence" (AGI), which is essentially software as smart as humans. OpenAI conducts innovative research in various fields of AI, such as deep learning, natural language processing, computer vision, and robotics, and develops AI technologies and products intended to solve real-world problems. OpenAI transitioned into a for-profit company in 2019. The company plans to cap the profit of the investors at a fixed multiple of their investment (noted by Sam Altman as currently ranging between 7x and 100x depending on the investment round date and risk). A

# Load DB from disk

In [33]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
# Re-open the persisted Chroma database and wrap its collection in a vector store.
# NOTE(review): get_or_create_collection presumably creates an empty collection when
# none exists at this path, so a wrong working directory would not fail here — confirm
# this is intended for a "load from disk" step.
db2 = chromadb.PersistentClient(path="ai-tutor-db")
chroma_collection = db2.get_or_create_collection("ai-tutor-db")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [34]:
# Build the index object directly on top of the existing vector store
# (presumably reusing the stored embeddings rather than re-embedding — TODO confirm).
from llama_index.core import VectorStoreIndex
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [35]:
# Query engine over the reloaded index; no retrieval options are passed, so library defaults apply.
query_engine = index.as_query_engine()

In [36]:
# Run a sample question against the index that was reloaded from disk.
res = query_engine.query("How many parameters LLaMA2 model has?")

In [37]:
# Display the answer text of the last query.
res.response

'The LLaMA2 model has 13 billion parameters.'

In [39]:
# List the chunks the answer was built from, including which source each one came from.
for src in res.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Source\t {src.metadata['source']}")
    print(f"Title\t {src.metadata['title']}")
    print(f"Text\t {src.text}")
    print(f"Score\t {src.score}")
    print("-_" * 20)

Node ID	 node_3662
Source	 towards_ai
Title	 Sorting & Analytics Pane in Tableau: A Road to Tableau Desktop Specialist Certification
Text	 Sample Certification Questions from this Topic Sorting from field label gives ______ sort by default.a. Nestedb. Non-Nestedc. Manuald. Data Source order Solution: Non-nested 
Score	 0.7556534272859884
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 node_16411
Source	 hf_transformers
Title	 Overview
Text	 The LLaMA model was proposed in LLaMA: Open and Efficient Foundation Language Models by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters.
The abstract from the paper is the following:
We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train 