## Installations

In [9]:
# %pip installs into the running kernel's environment; -q keeps the output short.
# Upper bounds keep the pre-0.10 llama_index import layout and the v3 weaviate
# client API (weaviate.Client) that the cells below are written against.
%pip install -q "llama_index<0.10" llama_hub "weaviate_client<4" urllib3 llama-cpp-python

In [11]:
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    StorageContext,
    SQLDatabase,
    download_loader
)
from llama_index.vector_stores import WeaviateVectorStore

import weaviate
import openai

## Connect to Weaviate

In [5]:
# Start an embedded Weaviate instance with default options and connect a client to it.
embedded_options = weaviate.embedded.EmbeddedOptions()
client = weaviate.Client(embedded_options=embedded_options)

Binary /root/.cache/weaviate-embedded did not exist. Downloading binary from https://github.com/weaviate/weaviate/releases/download/v1.21.1/weaviate-v1.21.1-Linux-amd64.tar.gz
Started /root/.cache/weaviate-embedded: process ID 1699


### Create Schema

In [6]:
# The single property of the class: a text field holding podcast content.
content_property = {
    "name": "Content",
    "dataType": ["text"],
    "description": "Content from the podcasts.",
}

# Definition of the "Podcast" class; its vectorizer is set to "text2vec-openai".
podcast_class = {
    "class": "Podcast",
    "description": "Weaviate podcast",
    "vectorizer": "text2vec-openai",
    "properties": [content_property],
}

# Top-level schema payload: a list of class definitions.
podcast_schema = {"classes": [podcast_class]}

# Create the class only when Weaviate does not already contain it, so this
# cell can be re-run without attempting to create "Podcast" a second time.
if client.schema.contains(podcast_schema):
    print("Podcast schema already exists.")
else:
    client.schema.create(podcast_schema)
    print("Podcast schema was created.")

Podcast schema was created.


## Load in Data

In [7]:
from llama_index import download_loader

# Resolve the transcript reader class by name through download_loader.
YouTubeTranscriptReader = download_loader("YoutubeTranscriptReader")

# Videos whose transcripts are loaded as documents.
podcast_urls = [
    'https://www.youtube.com/watch?v=xk28RMhRy1U',
    'https://www.youtube.com/watch?v=Du6IphCcCec',
    'https://www.youtube.com/watch?v=Q7f2JeuMN7E',
    'https://www.youtube.com/watch?v=nSCUk5pHXlo',
]

loader = YouTubeTranscriptReader()
podcasts = loader.load_data(ytlinks=podcast_urls)

## Build the Weaviate Index

In [12]:
import os
from getpass import getpass

# Keep credentials out of the notebook: read the key from the environment and
# only prompt (without echoing) when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Back the index with the Weaviate client created earlier in this notebook.
vector_store = WeaviateVectorStore(weaviate_client=client, class_prefix="Podcasts_index")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the vector index from the loaded podcast documents.
podcast_index = VectorStoreIndex.from_documents(podcasts, storage_context=storage_context)



Embedded weaviate wasn't listening on port 6666, so starting embedded weaviate again
Started /root/.cache/weaviate-embedded: process ID 3416


[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Create SQL Table

In [13]:
from sqlalchemy import (
    create_engine,
    MetaData,
    Table,
    Column,
    String,
    Integer,
    select,
    column,
)

# metadata_obj collects table definitions; engine points at an in-memory SQLite database.
metadata_obj = MetaData()
engine = create_engine("sqlite:///:memory:", future=True)

In [14]:
table_name = "podcast_stats"

# Columns of the stats table: the title is the primary key, plus two integer columns.
stats_columns = (
    Column("podcast_title", String(16), primary_key=True),
    Column("views", Integer),
    Column("duration", Integer),
)
podcast_stats_table = Table(table_name, metadata_obj, *stats_columns)

# Create every table registered on metadata_obj in the engine's database.
metadata_obj.create_all(engine)

In [15]:
# Sanity check: show the names of the tables registered on metadata_obj.
metadata_obj.tables.keys()

dict_keys(['podcast_stats'])

In [16]:
from sqlalchemy import insert

# Column names, in the order each record tuple below supplies them.
stat_fields = ("podcast_title", "views", "duration")

# One dict per podcast, keyed by column name.
rows = [
    dict(zip(stat_fields, record))
    for record in (
        ("Weaviate 1.20", 328, 65),
        ("Weaviate 1.19", 280, 27),
        ("Weaviate 1.18", 428, 65),
        ("Weaviate 1.17", 257, 43),
    )
]

# Insert all rows in a single transaction instead of one connection and commit
# per row: engine.begin() commits when the block exits cleanly and rolls back
# on error, and passing the list of dicts executes the INSERT once per row.
with engine.begin() as connection:
    connection.execute(insert(podcast_stats_table), rows)

## Create SQL Table in LlamaIndex

In [18]:
# Wrap the SQLAlchemy engine for LlamaIndex, restricted to the podcast_stats table.
sql_database = SQLDatabase(engine, include_tables=["podcast_stats"])

In [19]:
from llama_index.indices.struct_store.sql_query import NLSQLTableQueryEngine

In [20]:
# Set up the text-to-SQL query engine over sql_database, limited to the
# podcast_stats table.
sql_query_engine = NLSQLTableQueryEngine(
    sql_database=sql_database,
    tables=["podcast_stats"],
)

## Build Query Engine

In [21]:
# Query engine over the podcast vector index, using the default settings.
vector_query_engine = podcast_index.as_query_engine()

## Tell LlamaIndex about the Tools

In [22]:
from llama_index.tools.query_engine import QueryEngineTool

# The descriptions are the text attached to each tool; keep them specific to
# what each engine can answer.
sql_tool_description = (
    "Useful for translating a natural language query into a SQL query over a table containing: "
    "podcast_stats, containing the views/duration of each podcast"
)
vector_tool_description = "Useful for answering semantic questions about Weaviate release podcasts"

# Tool over the SQL query engine (podcast_stats table).
sql_tool = QueryEngineTool.from_defaults(
    query_engine=sql_query_engine,
    description=sql_tool_description,
)

# Tool over the vector query engine (podcast index).
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=vector_tool_description,
)

In [23]:
from llama_index.query_engine.router_query_engine import RouterQueryEngine
from llama_index.selectors.llm_selectors import LLMSingleSelector

# Router over both tools; an LLM-based single selector picks one tool per query.
single_selector = LLMSingleSelector.from_defaults()
query_engine = RouterQueryEngine(
    selector=single_selector,
    query_engine_tools=[sql_tool, vector_tool],
)

## Query

In [24]:
# A question about view counts, which the SQL table holds.
response = query_engine.query("Which release podcast had the most views?")
print(response)

The podcast episode titled "Weaviate 1.18" had the most views with a total of 428 views.


In [25]:
# A question about podcast content rather than the stats table.
response = query_engine.query("Tell me about a new feature in Weaviate 1.20")
print(response)

In Weaviate 1.20, a new feature called multi-tenancy was introduced. This feature allows users to separate and isolate their data within the application. For example, if a user has an application that indexes documents from their hard drive, they can ensure that only they have access to search through those documents and that other users cannot access them. Multi-tenancy in Weaviate helps to limit the vector space and allows for efficient searching and filtering of data. It provides a technical solution to the problem of managing a large graph of vectors spread across multiple tenants.
