In [2]:
#from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
from openai import OpenAI
import os
import pandas as pd
import numpy as np
from uuid import uuid4

# Load environment variables from the .env file (if present)
load_dotenv()

True

In [3]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

## Batching Upsert

### Defining a function for chunking

To be able to batch upserts in a reproducible way, you'll need to define a function to split your list of vectors into chunks.

In [19]:
import itertools
def chunks(iterable, batch_size=100):
    """A helper function to break an iterable into chunks of size batch_size."""
    # Convert the iterable into an iterator
    it = iter(iterable)
    # Slice the iterator into chunks of size batch_size
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        # Yield the chunk
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

### Batching upserts in chunks

In [3]:

import time
index_name = "my-second-index"
# Create your Pinecone index
pc.create_index(
    name=index_name,
    dimension=1536,
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)
    
# Print a list of your indexes
print(pc.list_indexes())     

[{
    "name": "my-second-index",
    "metric": "cosine",
    "host": "my-second-index-tbbbaqp.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}, {
    "name": "dotproduct-index",
    "metric": "dotproduct",
    "host": "dotproduct-index-tbbbaqp.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}, {
    "name": "my-first-index",
    "metric": "cosine",
    "host": "my-first-index-tbbbaqp.svc.aped-4627-b74a.pinecone.io",
    "spec": {
    

In [6]:
import json

# Open and read the JSON file
with open('../data/vectors.json', 'r') as file:
    vectors = json.load(file)

index = pc.Index("my-second-index")

for chunk in chunks(vectors):
    index.upsert(vectors=chunk)
    
# Print the index statistics
print(index.describe_index_stats())    

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}


In [7]:
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 100}},
 'total_vector_count': 100}


### Batching upserts in parallel

In [18]:
# Delete and create index
#pc.delete_index("my-second-index")
import time
index_name = "my-second-index"
# Create your Pinecone index
pc.create_index(
    name=index_name,
    dimension=1536,
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)
    
# Print a list of your indexes
print(pc.list_indexes())    

[{
    "name": "my-first-index",
    "metric": "cosine",
    "host": "my-first-index-tbbbaqp.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}, {
    "name": "test-image-index",
    "metric": "cosine",
    "host": "test-image-index-tbbbaqp.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1280,
    "deletion_protection": "disabled",
    "tags": null
}, {
    "name": "my-second-index",
    "metric": "cosine",
    "host": "my-second-index-tbbbaqp.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        

In [12]:
# Initialize the Pinecone client to allow 20 simultaneous requests.
#pc = Pinecone(api_key=PINECONE_API_KEY, pool_threads=20)

In [20]:
# Upsert the vectors in vectors in batches of 200 vectors per request asynchronously, configuring 20 simultaneous requests.
import json

# Open and read the JSON file
with open('../data/vectors.json', 'r') as file:
    vectors = json.load(file)

index = pc.Index("my-second-index")

with pc.Index('my-second-index', pool_threads=20) as index:
    async_results = [index.upsert(vectors=chunk, async_req=True) for chunk in chunks(vectors, batch_size=200)]
    [async_result.get() for async_result in async_results]
    

In [22]:
# Retrieve statistics of the connected Pinecone index
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 100}},
 'total_vector_count': 100,
 'vector_type': 'dense'}


## Multitenancy and Namespaces

### Namespaces

In [23]:
# Upsert the vectors in vector_set1 into the namespace namespace1.

index = pc.Index("my-second-index")

# Open and read the JSON file
with open('../data/vector_set1.json', 'r') as file:
    vector_set1 = json.load(file)

index.upsert(vectors=vector_set1, namespace="Namespace1")

# Upsert the vectors in vector_set1 into the namespace namespace2.

# Open and read the JSON file
with open('../data/vector_set2.json', 'r') as file:
    vector_set2 = json.load(file)
    
index.upsert(vectors=vector_set2, namespace="Namespace2")

{'upserted_count': 50}

In [24]:
# Print the index statistics
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 100},
                'Namespace1': {'vector_count': 50},
                'Namespace2': {'vector_count': 50}},
 'total_vector_count': 200,
 'vector_type': 'dense'}

### Querying namespaces

In [25]:
search_vector = [0.30340956141764597, 0.12139923696095312, 0.5161532585888067, -0.3342778958665069, -0.9001706203222355, 0.10082498306576215, 0.6429446285654061, 0.6741472730969968, 0.5995241253036068, 0.1176186131035315, 0.46488289785350934, -0.7587124097745286, -0.6127611376918998, 0.9439869585822185, -0.1729400507058052, -0.1958358229715187, -0.8454698079483147, -0.10339456399279423, 0.3177489506161437, 0.4808334746267675, -0.2548388951511882, 0.9106516710117112, -0.07740962009812535, -0.026532773488110672, 0.31822641758092973, -0.7551392047188192, 0.33583880665193155, -0.27251163455881855, -0.9436715445668895, 0.5585109832551127, 0.8786490729832233, 0.14930849139537217, -0.2739372039412975, 0.3928660963661892, 0.8100738914100429, -0.3700351829175308, 0.12202772882830626, -0.5187468509479523, 0.2847166468287139, 0.5809452496904854, -0.5727076985452668, 0.10935023524699639, -0.6427168943706403, -0.05238104586738901, 0.35099656755179187, -0.9345729914315137, 0.4857467688515005, -0.2871343207746455, 0.12380908085718678, -0.07087717803680804, 0.5582870671870102, 0.48541642484016356, -0.9949854124693744, -0.7072150178598293, -0.5217029324304066, 0.027096958907800284, -0.39731783201843696, -0.7630788009769838, -0.8330174317321237, -0.2260408093771804, -0.213564416255817, 0.3482833698645551, -0.7174720347840862, -0.22519682349329018, -0.36936639528570625, -0.04009622048388617, 0.6348001706597044, -0.32734166283336896, 0.9871860751753763, 0.8927435896925917, -0.555543976697622, 0.030034512323814377, 0.8252795835492002, 0.42849269180960636, 0.7469849451358115, 0.599124721154302, 0.24071996662926454, 0.31555383833046746, -0.4840604411265237, -0.6278745064909577, 0.14269869538050672, -0.8288405974414366, -0.4936198749993024, 0.21657674731586551, -0.8112436037588346, 0.25786013676714825, 0.066360155270075, -0.7248937718941084, -0.0018462450197092029, 0.42652573976378894, 0.13714243008756632, 0.7702400810441943, 0.8974496180741867, -0.2724890316657491, -0.48002299513439617, -0.18866732467518288, -0.07869046332659035, -0.6276387133174794, 0.48438144983623976, 0.8288549380912167, -0.04841115957287867, -0.8909054065345572, 0.9686555595490423, -0.31085053035478016, -0.5611428796041309, -0.24361895243180043, -0.08091602657159003, -0.40163258990371964, -0.1254320575458694, 0.8572757263738662, 0.25716461432999105, -0.35509171997421607, -0.9853014210732447, 0.22265121725091186, -0.19917532638646507, -0.06642210887168098, 0.7446811185532649, -0.10476686353554698, -0.7209189407090615, 0.08264937322333998, 0.41065531119743093, -0.39944973941462103, -0.5018780392590279, -0.25313801802196845, -0.2102311392727103, -0.6324297875978309, 0.2701112112367692, 0.5442949429440389, 0.23786892945531335, 0.006577193801201142, 0.14625246905013212, 0.027066019121444018, -0.07233281131816738, -0.9436948320416509, 0.0817409305095036, -0.8503205517922627, 0.09318195036605115, -0.007252745795490512, 0.7091017560923802, -0.1773167454388329, 0.9059965337704516, -0.08333693167792999, 0.8964912258261115, -0.68570582308496, 0.7489350585424699, 0.5581721924949572, -0.5151741964594152, 0.9924110395480703, -0.47754929206442687, -0.00633806253316882, 0.44910847472190296, -0.9921578909419053, -0.30330876393000294, 0.14046658103306076, 0.18913962382482774, 0.526888356064906, 0.08511732024086283, 0.9798509300190439, 0.30357501438912826, 0.2972343596395337, -0.11491949920205147, -0.0750959278196397, -0.015128655550114, 0.919893688093931, 0.16143939556697084, -0.8914636611190554, 0.31978601402129403, -0.66031769023489, -0.16344651736597093, -0.8152900210197918, -0.6885521290540333, 0.5114244305717248, -0.26678184267174854, -0.9042478831398575, 0.44737032371707297, -0.9872615268210598, 0.6130712834106136, 0.11592229479155147, 0.7096203186804451, -0.44068939645089955, -0.6598061340795975, -0.865576676428577, 0.3995451896316742, 0.8196487170471152, 0.40081670424893323, 0.2232354481150396, -0.9882627392526491, -0.27557561456166146, 0.8854209676190881, 0.09203060574035238, -0.4729051050837647, -0.7570882036464484, -0.21520068954250227, -0.7173265403833677, 0.9162175097491019, 0.6807219780532379, -0.050785944248782355, 0.9807458223593883, -0.6980985507644013, -0.675190651593212, -0.680311969096842, -0.7213649371914761, -0.014243025848110102, 0.5463883224408539, 0.2844489750771275, -0.1589854754260953, -0.941758987257036, -0.3131227291555079, -0.45303767901841807, -0.5362015038403882, -0.21493923172180218, 0.3769837734940593, 0.4880966547509751, -0.622250377912253, 0.23108467259990162, -0.35422677718815665, -0.5270664539972456, 0.9947272238578873, -0.8741719678353514, 0.70583005025546, -0.2025851659616862, 0.8836528818890377, -0.6059470915424647, 0.9917688929516062, 0.5063085658776061, 0.12572990524946892, 0.5176960947606553, 0.47066758060799474, -0.31779738492730414, 0.8657396416955829, 0.8751514148597728, -0.4542832715289471, -0.48575958719546986, 0.7175692421701707, -0.38853580596638393, 0.8350409404902337, 0.3220870127316966, 0.9299963931187245, 0.6334470928091047, 0.6826542212352784, -0.8437409449616236, -0.002556654787214674, -0.18741949814048264, 0.5639638688702511, -0.35486931548037215, 0.3539701043147896, 0.38449866701125246, -0.38720301557414616, 0.1359303438474897, 0.40646124405239203, 0.27599889863362903, -0.3424158931023935, -0.45274636765105547, -0.38348474349670436, -0.8923879801698269, 0.27766353321391035, 0.4441710216276986, -0.9083363359121917, 0.9460457808899638, 0.574379630438776, 0.4243691127447822, 0.6706148437630546, 0.7742991270655621, -0.4137245302716852, -0.03481397164020317, 0.28058964116487495, 0.7660403962034348, 0.438985966056578, 0.2958861056693036, 0.08054368770152553, -0.3633295225628548, 0.4297221270442326, -0.0953341883447325, -0.7894013092270895, 0.6120995164747993, 0.34920236017611916, -0.012084558983241012, 0.7699262590315583, 0.4530746542641346, 0.17140993172521912, 0.5259600707454477, 0.7262483248029865, 0.17971589179569247, 0.7523259312287933, 0.34112433842064105, 0.12786844768906902, -0.10369865729147643, -0.706364373012496, -0.9671822747415277, -0.561960116016845, -0.61436375557789, -0.5486964623759334, -0.2926010843442022, -0.07592795134477437, 0.6629179964632801, -0.2607087713236158, -0.5332932409058089, 0.9774371404615871, 0.9957802501970701, 0.5265893651218565, -0.2668240128561403, -0.19088846635109102, -0.6395807246618357, -0.4739712227349706, 0.3494869204431563, 0.16993841810256805, -0.07920119473862997, 0.23997766607207227, -0.03248350409123146, 0.9122667608033865, 0.711491564707412, 0.5202015698365605, -0.7981186167530634, -0.05362318218675255, 0.22606197715584764, -0.8232166995469494, 0.8227020502324602, -0.5685962481838931, -0.9897242882781556, -0.1719308816239815, -0.7512209803797159, 0.4549848129521943, 0.6898391799006609, -0.1306036350870361, 0.9461171258245304, 0.02884218672311145, 0.053436777605615715, 0.5627555109905089, -0.6541808361085051, -0.6109674731316257, 0.6564787561987953, -0.9028203038742559, -0.4731414616549643, 0.20041947572944663, 0.24725796198180183, 0.3200252495995539, 0.8466328759142141, -0.7370997451859878, -0.21223311994200222, -0.559830703057087, 0.6340113969389385, -0.5983892959545019, 0.39040749863599866, 0.9934762919011921, -0.20106842354652388, -0.06350901801072606, -0.1632620099786417, 0.08752750072503046, -0.10533253239647489, 0.45999263688092795, -0.9807417805726881, -0.05981344296329749, 0.15013790316946407, -0.9385329442769099, -0.7835188785239495, -0.24727239382298416, -0.3167827120595317, 0.8409445033648573, -0.9759866746158818, 0.6338947720014394, -0.8918622149425088, 0.6788751839114506, -0.1270147063401723, 0.2834144182023475, -0.6422762732859075, 0.5965899555927558, -0.02630355462709244, -0.9060275032268232, 0.5975118570554188, 0.48146565146961384, 0.5320262228111068, -0.05432289778539734, -0.964476937893431, 0.7505010416011859, 0.2690221194509641, 0.7781187365600326, -0.5680734191603056, 0.3909630248724303, 0.8932881591437496, 0.9926432696851024, -0.35589715589755344, 0.31264336187792985, -0.17593542856598288, 0.9356435413456345, -0.2673602546186318, -0.31358310016994606, -0.953454090458874, -0.2923116848443654, -0.3852299831894055, -0.742290218157504, -0.9312464409223216, -0.9686878479069136, -0.24251937112817945, -0.2472706736047925, -0.41581385542638816, -0.07171022883066658, 0.8464349621075917, -0.4061626903545146, 0.5565734041595689, -0.9185975894082705, 0.8613621701206258, -0.36130185682814386, -0.29418922815461657, -0.11006891857023171, -0.4434715094809314, 0.797741491264752, -0.9038344712275961, -0.8597902151053838, -0.2258854370834935, -0.6708275848760978, 0.5058653641479058, -0.5298367079351061, -0.4315881927894307, -0.4841455409272464, 0.7987733606418916, 0.6462306062855694, 0.07915429602904234, -0.9957420957331216, -0.34328926445408814, -0.0458332665667871, 0.04140819638116944, -0.5064491739984398, 0.2668060848236842, -0.6179254151086495, -0.0508551614098538, -0.2316620417461832, -0.2970255811048812, -0.248177172486427, -0.10686153717048374, -0.5088732922354915, -0.5046326868012789, -0.588537325441177, 0.33074920613334147, -0.48889395657738266, -0.9484592775104914, -0.12759865976997653, -0.010443060920020342, 0.9146022749562841, 0.26212624853836153, -0.7883913649063687, -0.09895993916235701, 0.49797798252525527, 0.6553961807943636, 0.24933504643880888, 0.7926676390149756, 0.06639375210856935, -0.23862855539621508, -0.027713423938239057, -0.90560386035953, -0.9805102677490709, 0.20702911804079238, -0.542524094325934, -0.17329964516280505, 0.37413628362941265, -0.23832329926303664, 0.3282953153311816, -0.14814592862306042, -0.9407006325822738, 0.6863924530760621, 0.5711758544476089, 0.22230680052740714, -0.009860359043486033, -0.8056959079078054, 0.3458422068267262, -0.999986856004796, -0.2974649684281985, 0.0543791112552785, 0.44566184554546995, -0.26891961954977495, 0.2095053631432191, -0.8723479580934064, 0.4240987246690553, -0.6025011626313712, -0.9897284083793465, 0.293859019629225, -0.8758907310802362, -0.09742356454512557, 0.7161840180888208, -0.2701113571376794, 0.3105784880925959, 0.7810074842248229, 0.4659508099051082, 0.04599655088145971, -0.22659891699399992, 0.8023974461469809, -0.49603457960399444, 0.1314421101430896, 0.6807539907235001, -0.5467310231548947, -0.09793000545889008, 0.591647890930203, -0.03300160204457914, 0.023003797737979337, -0.06103937645267443, -0.0804472482397598, 0.9013016884307559, 0.5835399237994212, 0.30732139154612437, 0.19164596158681602, -0.931739634230359, 0.5510186291864403, 0.0026226301333989177, 0.993931943262288, 0.03777630071791749, 0.35762471925727923, 0.6616646699574846, -0.6285805984087778, 0.10365369409202274, 0.9568246545111425, -0.4962457321268625, 0.93940119149087, 0.7017595512513022, 0.6004041412446277, -0.7042539741503073, -0.7304850979010273, 0.6699491028262481, 0.26270595830538146, 0.08406764170752101, 0.8873926796465843, -0.7028903418024595, 0.890898973427098, 0.1016751539440186, -0.7109356789989165, -0.1266047896758613, 0.5207668435739938, -0.815882161640521, 0.8418503442565457, -0.5339296515516649, -0.995010620692649, 0.388864438421211, -0.11699627179139527, -0.4346045283533426, 0.09473399875544097, -0.5021168082576897, -0.9870827576412609, -0.08872836700724362, -0.05806121670066977, -0.5582645991294095, -0.1917659414877122, 0.5107792496654613, 0.8706417561796844, 0.4673905536906342, -0.22251588227916752, -0.22623819364802955, -0.8068155659602565, -0.7624792030910805, 0.8290285261352652, 0.8774813403112225, 0.15217940126605356, -0.5604321968844597, 0.2614713818614889, 0.1838827861143535, 0.7318950711287999, -0.09268380884024974, -0.6330055450548335, -0.20942567794422207, -0.3538071465889414, -0.7077505436144687, -0.8897306220545309, -0.4528079519949042, -0.27256732160230435, 0.8591278890158147, -0.4110801508621589, 0.5947665995378557, 0.4733652021677839, -0.11734157074776408, 0.06005731761736244, -0.05079223517982512, 0.09154664593857231, 0.015079096676577697, -0.09664777658284773, 0.6128811175826663, 0.2832814900908096, -0.23154992696163146, 0.13238233891597662, -0.45932375141179604, -0.995273828464786, 0.7132999819110946, 0.5049682208070205, -0.9406940967724831, -0.8031394139076093, 0.5979750368559509, 0.4538883275745682, -0.44334384467313703, -0.7220006905800329, -0.7027502106209678, 0.6105656292370196, 0.8617872917709235, 0.1959273250697917, -0.40704027741818605, 0.9567371437788319, -0.37126413497514066, 0.3917996931796608, 0.8420925888490676, 0.4613580810721696, 0.8333166550284736, -0.5071426161634576, -0.31314243607747216, 0.3541679393214652, -0.021773319571900984, -0.6334949563772938, 0.5051313253512315, -0.31499413272606547, 0.7980729224834817, 0.4691947664620897, -0.32967938373526073, -0.026757200364937672, 0.8792089009520403, 0.17041160510900166, 0.5104131266768663, 0.9149994691489403, 0.5526433434715083, -0.9170063503576602, -0.48726208020769013, -0.4418602260750406, 0.055577311635570004, -0.27721292007751375, 0.23057119775822588, 0.8427631771227921, 0.47880922189703856, -0.9898123593308117, 0.8949136031996399, -0.6313846039330853, -0.2937669584095257, -0.8662845894332754, -0.6512171019542172, 0.8853949262991854, 0.1875781572059121, -0.5324791206170492, -0.6254564585479243, 0.7126825084362371, -0.7492791299562163, -0.3036944104761181, -0.21174898012456045, -0.8509894032544225, -0.1749968304812346, 0.5693078337242938, 0.562107721954491, -0.6177446511372608, 0.5165463117894338, -0.4503965601794684, 0.44715283047328436, 0.4102204268561125, 0.6796687556325056, 0.9758278373921461, -0.8072578015227287, -0.48903330459690064, 0.2405528086427149, 0.08922532173493058, 0.6675905541861376, -0.7970559393829382, -0.023721922588082256, -0.30140827490075695, -0.8537613991711588, 0.4914520926762658, -0.060259719438144455, -0.8637187615208934, -0.9840508050143129, -0.9526860980617056, -0.21241106028407697, 0.8124282230115583, -0.4940074629254052, -0.7336846085999993, 0.1615552274129095, -0.05358767372252582, 0.4167254319597671, -0.25748109922288265, -0.5237347014019349, 0.49863779580894496, -0.012670797749064766, 0.6600464278906337, -0.928399406227989, -0.7897799664529466, -0.2296849772497862, -0.34277387824950867, -0.9043462646982754, -0.45787637702371486, -0.038730593618965026, 0.41589646343818254, -0.5117210090998194, 0.6133863722283155, 0.7033952592889201, -0.483063228404903, -0.19324806781259896, 0.931062699130901, 0.21551874601589915, -0.1509152946253658, 0.33183997394283504, -0.21025915367477155, 0.9832242635328527, -0.9379410874776632, -0.7248341289630846, 0.5462190027351503, 0.6270382220307236, 0.06954045611011694, -0.4106921972830857, 0.41872363486535624, -0.639277577487527, -0.5227088910957782, 0.5808105193936548, 0.3906991024458135, 0.08531175962193749, -0.6563750687362158, 0.6847719859806516, -0.8565068738042878, 0.31574518273787877, 0.4978995377920252, -0.4195286972512817, 0.5443845645215948, -0.9470264543466731, 0.8681426036540985, -0.23919936261293961, 0.6773000905630229, -0.36242346801142866, 0.4138002633065796, 0.13603130293028576, -0.10272599594445997, 0.9832913596962196, 0.5952997091298851, -0.7411130918136328, 0.9520578956981034, -0.06157794062712152, 0.27384135320264047, -0.7634999573678762, 0.9748353747320904, -0.047285313246904215, -0.0790983977125983, -0.4010351247000288, -0.22652306437760172, 0.5298234008506657, -0.8135531310809658, -0.4432379666540951, -0.31425161475824903, -0.4977764815323096, -0.5726035671323138, 0.9976903989658383, 0.6323556149718859, -0.5638362468832774, 0.7918092380111277, -0.2116771549995926, -0.8406760246808083, 0.190276362063021, 0.06171091813820295, -0.08423150653188216, -0.6362494750025427, 0.5215568254278156, 0.03753968842093136, 0.6423620230839615, -0.6159771678309387, 0.9515838028329622, 0.34711825561504717, -0.02149750400863537, 0.511472886472927, 0.19447514619716633, -0.15292081441486105, 0.5546153712904796, -0.32698933516941575, -0.9488344045515904, 0.5838051188203384, -0.4011447193358024, 0.2190424963197326, 0.7863712521078852, -0.48092873988664, 0.7877497669995062, -0.6400340880837143, -0.5012063808010858, 0.11962425036521251, 0.06700784488545963, 0.7640884163970671, -0.5252841594320625, 0.9196277709007636, 0.1642615847350899, 0.2580487334987507, -0.9727809651019397, 0.9925734212143569, -0.30543871907360476, -0.03099841551318261, -0.912661931196217, -0.05310606061672063, -0.8194960198598653, 0.9324856507382215, 0.6824788769709398, 0.43087984831836823, 0.0778074970770759, 0.5824993345291836, -0.6530418825611055, -0.7653149977305889, -0.9627677678805977, 0.0013209363540895858, 0.3747921960484488, -0.35337591656697054, -0.9526265369868869, -0.8662244723155239, -0.687791812138409, -0.10229097189645553, -0.13903267508059058, -0.7270715022610301, 0.6258221220201718, 0.29419570165170095, -0.07192314543565947, 0.4553162073310255, 0.9360677217306226, -0.8833950944720526, 0.989120011871994, 0.9668664619206371, -0.7287939006943176, 0.79999953406787, 0.46105015464066756, -0.584460742333256, 0.6119528594482169, -0.7234915966957587, -0.7048897908510396, -0.6859819673784633, 0.4050894434748966, -0.9623620800520922, 0.020331300145395748, -0.13789443671160861, -0.6511585630665828, -0.8373167982960252, -0.29088728314276135, -0.33776708879098516, 0.9468528564699856, -0.608032475004151, -0.013494924827449406, -0.019211386911966333, -0.020086370650759022, 0.8968703466853223, -0.26708729355941974, -0.824432270862822, -0.8376443965673195, 0.7017431279269026, 0.1435132662685168, -0.5859377358974787, 0.7090647790371212, 0.5465763808226931, -0.24957661557091315, 0.03263733425280413, 0.2740080910631142, -0.5376880856579165, 0.29378478104445316, 0.210106734262423, -0.24436123340098237, 0.20846657140599412, -0.4134780316178466, 0.1338569530578717, 0.1577191093910224, 0.9673849163282247, -0.7758042334618847, -0.8502469126084682, -0.4226688568872954, -0.38990874688479993, 0.9321873978510831, 0.9975938662070309, 0.4403376604389564, -0.7181423076091338, 0.13993528758460205, 0.7502042165769687, -0.07310382556123463, 0.0923803475179461, 0.17811368546854878, 0.32818179258913793, 0.2575559012637907, 0.013932914300899713, -0.29609817333803745, -0.24692190074887632, -0.34320493661160323, 0.557284213420614, -0.062423078086386585, -0.7897627199101487, -0.43137503231032515, -0.6230772102095019, -0.8303881953625911, -0.37545982836247394, -0.9224408453687665, -0.2945732732283741, -0.9799423050022755, 0.23644095075660188, -0.2999693775440304, -0.9943240446198562, 0.6283912796484137, -0.8165939892315697, -0.1743065388817968, -0.9709652699561988, 0.7098969475036951, -0.47716607776756814, 0.4322666878720245, -0.6338990732121181, 0.910450966513094, 0.635276743566173, -0.8851931957484402, -0.8045786635637986, -0.7270993854988952, 0.025973107943074725, 0.8837421439339972, 0.18622295288658997, -0.4883665481397581, -0.05720131064557976, -0.5612254343957721, 0.8928298139725528, 0.8047748628550224, 0.9095445310051897, 0.9471532364160029, -0.7841942923294307, 0.9114936259808506, 0.2363701301989627, -0.442428435285938, -0.007461958817522429, -0.8279375375493718, -0.4591436037629135, 0.3166969291981605, -0.44051840875617354, -0.3319878538986407, 0.3798738418967964, -0.3603462235028474, -0.9398716568370744, 0.16945131345386644, 0.7968271755831784, 0.6425133364837734, -0.2675327112436281, -0.6405271424143624, -0.7289392457242891, -0.33260659677399795, -0.7286618149729247, 0.2967780577421535, 0.9509870301411036, -0.8705012627156541, -0.268867529628533, 0.9768836528283542, -0.958145814608151, -0.895415773134143, -0.2689740232246276, -0.3067314347526582, 0.5596395184741421, -0.2076469201679516, -0.24317925789552208, 0.2359374231194209, 0.4233791074444917, 0.9969776936311388, -0.2085308589475685, -0.08848853746938601, -0.8416582126815289, -0.2317303438845908, -0.764381320717199, 0.21432528547959584, 0.30812941213945444, 0.7213032672834536, -0.7963871593709833, -0.19242911826479925, -0.0014302094783842367, 0.2943784065810966, 0.9494355244747694, -0.9181469880981401, 0.5696733980122992, -0.7784716055508383, -0.11276355909577918, -0.32003676686396454, 0.5831168478537545, 0.4710339444309779, 0.7202041009245801, 0.7249221756723081, -0.280069707579202, 0.06526147158021423, -0.7997243135216818, -0.1707492119219245, 0.280355341431042, 0.9007177618588273, -0.3547109278694074, 0.7452500503660375, 0.5931112311871694, 0.033756709692330134, -0.7966722828842012, 0.7666941890938757, -0.7929329005571442, 0.9938545003987549, 0.23303078902870933, 0.3632492019442757, -0.10958057900112861, -0.4208216988464031, -0.020126066457254677, 0.6463807494329938, 0.1903239008013129, -0.8586607238986828, 0.21788012829091663, -0.8750156878980759, -0.879972538033422, 0.2462413151287619, -0.8583485594691931, 0.380895115872395, 0.4734357854809237, -0.78634095434472, 0.06553603849291822, 0.9115210922260963, -0.39330694174446146, -0.44861773367757496, -0.11251873217649067, 0.64009584946771, 0.3179655715170977, -0.1092857326209975, 0.30714477905414417, 0.3758063422626723, -0.9862009255144213, 0.03464967044562739, -0.7961314635546484, -0.542715226590452, 0.7029484924611413, 0.3635200800772018, -0.04930004863044557, -0.13946266220795622, -0.41678705747569444, -0.1528271013782947, -0.6857205076993662, 0.11842069980496772, -0.08768578855057396, 0.6947597102491834, -0.46867090250454746, 0.9837053974502112, 0.8458374811702492, 0.10929927191787714, 0.32417919441510823, 0.9181477607444499, -0.37862209111894907, -0.9278487292507012, -0.46176437021684547, -0.8371480682284709, 0.5664172768874844, -0.43369088984460524, -0.7344220654313638, 0.9552491767671767, -0.9528360812707197, 0.9709024203877723, 0.319195794281216, 0.07879180068240843, 0.2832355386458132, 0.883862557687082, -0.8090724840243069, 0.32058642763156, -0.09140019084099116, 0.686463418137857, -0.11712389466378714, -0.17700587576751525, 0.8624512715992199, 0.5960201740335553, -0.5570661788020463, -0.0633897693507004, 0.4199311437215656, -0.5245445211239703, 0.44921642490771907, 0.6084845773717293, 0.5813846855345692, -0.8249600805228237, -0.30188098526076934, 0.9794349020703719, 0.49285101519910346, 0.5970887232769573, 0.37977688305776924, 0.7470279439540302, 0.4430853740105447, 0.15755059820021877, 0.26706204805015354, -0.02779973452734419, -0.7292958854751699, -0.551244056024333, -0.7486446318716775, 0.5621778913595321, -0.09460829751263633, -0.22996417567727012, -0.49136933276426076, -0.7294196679408473, -0.15112554432832326, -0.3708874331572103, -0.6025458784795561, -0.5148138821303452, 0.743233113475148, -0.3299920775312568, 0.7464340350263714, 0.7714137500875737, 0.214796726061798, 0.7704846868269191, 0.8456421851038509, -0.5524844081919251, 0.6911598736337972, 0.9333693438966904, 0.03931111372208562, 0.7581332057046062, 0.22866353333808642, 0.6621315970911941, -0.5552543350613148, -0.5879157851523493, 0.6443553001502442, 0.4741438898226389, 0.7703554587754287, -0.9459279212938312, 0.24440633485851504, -0.12030785816030698, -0.32224600871966524, 0.711599366666197, -0.35853368583934264, -0.24108587501345324, -0.5174438278578952, 0.9457845155333835, 0.2643781876074509, 0.4522312740610652, 0.9636600490139029, 0.10116987639693509, -0.05531700832169428, -0.05357651572612476, 0.4428179286564231, 0.816300998622044, 0.5512254929125886, 0.05039426905287403, -0.47335236171801687, 0.6775359174612083, 0.6603050163596187, -0.5568572263843985, 0.3731543331007874, 0.10962782090929846, -0.5626587783780319, -0.4170155884650475, 0.45723375231946606, 0.9003177126302757, 0.2972682145408305, 0.947601654129081, 0.09110681261134612, 0.7092813468215216, 0.9483881241408769, -0.9416572365276079, -0.9151127361384928, -0.4971336908988202, -0.43483271645661037, 0.11246064594871208, 0.00018139226027336086, 0.6703410258839719, 0.10569662992527196, -0.44808331336225193, -0.561022918200301, -0.41698257008242634, -0.7182712494226224, 0.2137206723440075, -0.516685430614918, 0.7319689330678716, 0.9301647402970603, -0.22050190431513816, 0.8599292876513223, 0.7391775936125526, 0.03373952103123257, -0.6368888556066976, 0.16444761084723813, 0.5474000450815473, -0.7510758143134029, 0.8716877643878294, 0.5159872843334106, 0.6652753278603993, 0.13595681059318343, -0.5125579917354897, -0.9395131415337514, -0.06573541897008717, -0.8395937964021079, -0.6322017644997955, -0.055209107763354304, 0.03793638444039482, 0.013330325310385138, 0.04893222151692522, -0.4104197398897256, 0.2410920169550843, -0.27991402323361103, -0.04361852779120712, -0.3864664545042653, -0.7529344727130818, -0.7306992373265291, 0.580994265960874, 0.23532217463610472, -0.9592029088847156, 0.6479953510214138, -0.9970157398471617, -0.7267458948016743, -0.8318146570602123, -0.25556972869529115, -0.6791576844043599, -0.4117944492217491, -0.6255049326563327, 0.6020277382534953, -0.7424572374127576, -0.8099952934129433, -0.4836102397344071, -0.32190537709346456, -0.27222535183002905, -0.9416399944782947, -0.2086411079080157, -0.47116291242103947, 0.10974112428376759, -0.5711103989900961, 0.5195796715747998, 0.4507199307388612, -0.830571376155234, -0.46220328383077924, -0.21571335877618547, -0.6199910694388631, -0.14900864768014488, 0.5452084802466686, -0.9755988233929451, -0.866044695827664, 0.648676392474951, 0.19788970819383778, -0.46490834456776864, 0.9760510036960928, 0.10739767799356215, 0.4377120352429631, 0.7877739525301286, -0.2436754093619402, -0.8468237260965041, 0.9748322505804587, -0.42324606994765235, 0.33329937162467504, -0.7702123803397714, -0.2991254840335673, -0.44856399599804964, 0.9267237549532756, -0.7962226713743068, -0.9632796440051214, 0.19388833891239687, -0.08624144978041604, -0.031604333037703825, -0.14841833169688634, -0.9483943201064633, -0.4999875011459167, 0.7318451163273776, 0.45296975972652964, 0.5212007414003841, 0.8434887631689576, -0.2809282670859292, -0.41243772975262494, 0.6748962260893201, 0.827593872262504, 0.7544175818331931, 0.3420255245393682, -0.8568100859071912, -0.5470586127747021, 0.8589458574549098, 0.3843353078005176, -0.07668658883280743, 0.3352093021640221, 0.796565179729902, 0.42528360517312147, -0.13928520945648293, 0.9272732457344368, 0.41761946278978623, 0.5976550663295366, -0.6877244248830221, 0.2761707570559466, 0.6569871537802583, 0.25480269361864893, -0.8366080056375642, -0.8565196359169378, -0.307297757991458, 0.1585355631068337, 0.6082574714850579, 0.49600591084641543, -0.9438646705185629, -0.4276708685610331, 0.6818405144530095, -0.17216136481991584, -0.9523555461284685, 0.5447139033885096, 0.13679778379870733, -0.7253288769043909, -0.02502892037507043, -0.8969263166873194, 0.8099572942598801, -0.5282323264248816, 0.6700511440785872, 0.5138745984057156, -0.060674732883021054, -0.6732603407988116, 0.7842147170162215, -0.5770085674334062, -0.6355823124418909, -0.5609433213902575, -0.41665643955704157, -0.22107261497513853, 0.9814156385038157, 0.3263659130635068, -0.8350841977755521, -0.8447762130583827, -0.8786974531654757, -0.07117837358792767, -0.20654285619904011, 0.7333148456551357, -0.22766230289789302, -0.1323623385262238, 0.270628952587064, 0.5848837131334252, 0.6205236457119239, 0.46185662326401045, -0.01830245885399462, 0.8771772663428494, -0.16210365513242664, -0.11744169242490443, 0.7081767553386746, -0.319836744254123, -0.21621914096669936, -0.5638360989152695, -0.9647058438357563, 0.4673767657201011, -0.9450287188295348, 0.05162250370691179, 0.9497448525950343, 0.20189430958816312, -0.9395973799598802, 0.38364213023028837, 0.5290560333111212, -0.3458777531522421, -0.9027747592526794, -0.39438613793511146, -0.9323247966356623, -0.3417716081857818, 0.5805230364913114, 0.8221607010472998, -0.17053852110603018, 0.8309549653393709, 0.7174557166223854, -0.08559334231703786, 0.7900131688398448, 0.02540069602031325, 0.3622559302182853, -0.9862256461836192, 0.2774256817888161, 0.3125248844530937, -0.36641844641289656, 0.655394012090498, -0.5086202884116291, 0.1572890558919089, 0.011351283185139005, -0.07057618844903346, -0.9577283032239312, 0.9355231293367048, 0.48720210448267753, -0.9495498578601587, -0.2632156656390561, -0.33443834325864175, -0.3166487232869484, 0.07949323612048098, -0.9689473277637601, -0.6227448003823568, -0.6760002024155765, -0.4867557454969753, 0.7010174037626444, -0.9150124666455461, -0.42244437777030397, 0.8240855825852915, 0.6555546223259459, 0.21018241767563461, -0.49497496270220576, 0.31361990094027026, 0.614019117678499, -0.3310980854277721, -0.8918585297691302, 0.976943500767862, -0.27199796096082496, 0.6898647678318832, -0.6059389667340198, 0.7109699868668236, 0.8596047062420602, 0.2146553129521498, 0.6407906819637448, 0.7610111230671326, 0.670165066702642, 0.8743696259687936, -0.48278789659205334, -0.7611598419477361, 0.7514250790519597, 0.965810676130691, -0.6771750974569688, 0.4098971143768326, 0.09096614144029136, -0.528445379900073, 0.42606962725945796, 0.3107713409489914, -0.16667539327903036, -0.9666223389099406, 0.65933429259566, -0.5536712338252805, 0.3853213649995033, 0.9961930108012855, 0.8621130185277563, -0.03092573595828685, -0.8312727002432092, 0.7863282589302314, -0.9753286847935569, -0.3703661942405847, 0.45873395004890005, 0.5770565252399202, 0.15170347066226242, -0.3534110391913734, -0.768866430449519, -0.009944041469533094, 0.22263861957634168, 0.9566192571815253, 0.9505596121932158, 0.6344960525399967, -0.7667930728625192, 0.8463704694683054, -0.12090892754594251, -0.8308668844924783, 0.40158599522244987, 0.43262199231861276, -0.7893500874895616, -0.8740582843364753, 0.9177671877857587, 0.33960227242534646, -0.47636140335003274, 0.02731026793581459, 0.2871826591440372, -0.05021788194846932, -0.5850078062147912, -0.8638085091296264, 0.6384335706916011, 0.21091219615454926, -0.9217449317938091, 0.6728168236336536, -0.01423621674316955, -0.10224955121248347, -0.935888849405891, 0.8360212769208899, -0.7161539008263924, 0.274541879614578, -0.7379092398675808, 0.43389007780544064, 0.2729494838836475, -0.8546352755594282, -0.639179541960601, 0.5276184016077305, 0.7865207982532798, -0.2630835277891781, -0.09526872382223228, -0.44619935482101614, 0.9388032782075895, -0.3603369223557811, -0.8380303160170155, -0.5237878973092456, -0.038402429225397494, -0.3089871151594821, -0.04010064987292683, 0.316753341489896, -0.37429487888054114, -0.07327437117624092, 0.6411062311845839, 0.8093184722172098, 0.4946785761891723, -0.7218675852997838, 0.3149858415868514, 0.8334267687912003, 0.5101657732980054, -0.8185233088885877, 0.24273952467271243, -0.4701926282068998, 0.9477247898321133, -0.12331561925249246, -0.7039340326807813, -0.2673222611727457, -0.45909161794281506, 0.6514798111564921, 0.2806760388900713, -0.45303492343149876, 0.5904359301109381, -0.69441449430604, -0.586132690243885, -0.7990881017456097, 0.18115827323139855, -0.9098137800211585, 0.4058891008664669, -0.5908848413672392, 0.3111852061308107, 0.23375428291528144, -0.7407733599900899, -0.9095128699982917, -0.9190905970935372, -0.710710852407066, 0.07792591040911478, 0.10540612856585607, -0.37625893294141943, -0.8830470917929842, 0.4281605856476427, -0.6703072974111182, 0.426550694077231, -0.6627321144489513, -0.771364812468643, 0.6711502825553421, 0.2595329073159054, -0.8623885952649608, -0.8292965872602129, -0.5890780912376616, 0.16915939206674424, 0.7870972000255143, 0.9672282599979349, -0.46938366180147506, 0.37692229290366686, 0.7087746094755216, -0.59661774907329, -0.05795064366033298, 0.8687770313400593, -0.12383339467224252, 0.21164216578737505, -0.10553838278015126, 0.12088016456875761, 0.8719168966228164, -0.8606345651789984, -0.5508267506500326, -0.015436674243734183, -0.5247416919641223, 0.11219723569875262, -0.6960305515810263, 0.5078712272119699, 0.17369249181734037, -0.19998804050771568, 0.49279817230838985, 0.6736970590160292, 0.9476517032452285, 0.8087767672947619, 0.9216783992313053, 0.43705728542417255, -0.7186688931591598, -0.9700417956483616, -0.09827951811206814, 0.4008528655411585, -0.3453385112595846, 0.40957947823034946, -0.5149597520406966, 0.002564899852486091, -0.052027246882375255, 0.4100878913634969, 0.3231623007099471, -0.12766163933009822, -0.5958789421531423, 0.5032031909187331, 0.31665468550628795, -0.8573642368333949]

# Query namespace1 with the vector provided
query_result = index.query(
    vector=search_vector,
    namespace="Namespace1",
    top_k=3
)
print(query_result)


{'matches': [{'id': '1033', 'score': 0.0681690648, 'values': []},
             {'id': '1041', 'score': 0.0520279706, 'values': []},
             {'id': '1013', 'score': 0.0510935709, 'values': []}],
 'namespace': 'Namespace1',
 'usage': {'read_units': 5}}


## Semantic Search

### Creating and configuring a Pinecone index

In [55]:
# Create Pinecone index
pc.create_index(
    name='semantic-search-squad', 
    dimension=1536,
    spec=ServerlessSpec(cloud='aws', region='us-east-1')
)

# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

# Connect to index and print the index statistics
index = pc.Index("semantic-search-squad")
   
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


### Upserting vectors for semantic search

In [56]:
# load the data into a dataframe
df = pd.read_csv("../data/squad_dataset.csv")
df

Unnamed: 0,id,text,title
0,5733be284776f41900661182,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame
1,5733bf84d058e614000b61be,"As at most other universities, Notre Dame's st...",University_of_Notre_Dame
2,5733bed24776f41900661188,The university is the major seat of the Congre...,University_of_Notre_Dame
3,5733a6424776f41900660f51,The College of Engineering was established in ...,University_of_Notre_Dame
4,5733a70c4776f41900660f64,All of Notre Dame's undergraduate students are...,University_of_Notre_Dame
...,...,...,...
1995,56ddea8366d3e219004dae05,"In early modern times, cardinals often had imp...",Cardinal_(Catholicism)
1996,56ddeb0c9a695914005b96b8,Pope Sixtus V limited the number of cardinals ...,Cardinal_(Catholicism)
1997,56ddeba09a695914005b96c2,"Each cardinal takes on a titular church, eithe...",Cardinal_(Catholicism)
1998,56ddec019a695914005b96ca,The Dean of the College of Cardinals in additi...,Cardinal_(Catholicism)


In [57]:
# Ingesting documents to Pinecone index

batch_limit = 100

# Extract the 'id', 'text', and 'title' metadata from each row in the batch
for batch in np.array_split(df, len(df) / batch_limit):
    # Extract the metadata from each row
    metadatas = [{
      "text_id": row['id'],
      "text": row['text'],
      "title": row['title']} for _, row in batch.iterrows()]
    
    texts = batch['text'].tolist()
    ids = [str(uuid4()) for _ in range(len(texts))]


    # Encode texts using 'text-embedding-3-small' from OpenAI with dimensionality 1536
    response = client.embeddings.create(input=texts, model="text-embedding-3-small")
    embeds = [np.array(x.embedding) for x in response.data]
    
    # Upsert the vectors and metadatas to a namespace called 'squad_dataset'.
    index.upsert(
        vectors= zip(ids,embeds,metadatas), namespace='squad_dataset'
    )

  return bound(*args, **kwds)


In [58]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'squad_dataset': {'vector_count': 2000}},
 'total_vector_count': 2000,
 'vector_type': 'dense'}

### Querying vectors for semantic search

In [61]:
query = "What is in front of the Notre Dame Main Building?"

# Create a query vector by embedding the query provided with the same OpenAI embedding model you used for embedding the other vectors.

query_response = client.embeddings.create(
    input=query,
    model="text-embedding-3-small"
)
query_emb = query_response.data[0].embedding

# Query the "squad_dataset" namespace using query_emb, returning the top five most similar results.
retrieved_docs = index.query(vector=query_emb, top_k=3,namespace="squad_dataset",include_metadata=True)


for result in retrieved_docs['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")
    print('\n')

0.54: The library system of the university is divided between the main library and each of the colleges and schools. The main building is the 14-story Theodore M. Hesburgh Library, completed in 1963, which is the third building to house the main collection of books. The front of the library is adorned with the Word of Life mural designed by artist Millard Sheets. This mural is popularly known as "Touchdown Jesus" because of its proximity to Notre Dame Stadium and Jesus' arms appearing to make the signal for a touchdown.


0.53: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, Fr

## RAG Chatbot

### Craeting New Pinecone Index

In [4]:
# Create Pinecone index
pc.create_index(
    name='rag-index', 
    dimension=1536,
    spec=ServerlessSpec(cloud='aws', region='us-east-1')
)

# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

# Connect to index and print the index statistics
index = pc.Index("rag-index")
   
print(index.describe_index_stats())

NameError: name 'index_name' is not defined

### Upserting YouTube Transcripts

In [7]:
# load the data into a dataframe
youtube_df = pd.read_csv("../data/youtube_rag_data.csv")
youtube_df

Unnamed: 0,id,blob,channel_id,end,published,start,text,title,url
0,35Pdoyi6ZoQ-t0.0,"{'channel_id': 'UCv83tO5cePwHMt1952IVVHw', 'en...",UCv83tO5cePwHMt1952IVVHw,74,2021-07-06 13:00:03 UTC,0,"Hi, welcome to the video. So this is the fourt...",Training and Testing an Italian BERT - Transfo...,https://youtu.be/35Pdoyi6ZoQ
1,35Pdoyi6ZoQ-t18.48,"{'channel_id': 'UCv83tO5cePwHMt1952IVVHw', 'en...",UCv83tO5cePwHMt1952IVVHw,94,2021-07-06 13:00:03 UTC,18,So we got some data. We built a tokenizer with...,Training and Testing an Italian BERT - Transfo...,https://youtu.be/35Pdoyi6ZoQ
2,35Pdoyi6ZoQ-t32.36,"{'channel_id': 'UCv83tO5cePwHMt1952IVVHw', 'en...",UCv83tO5cePwHMt1952IVVHw,108,2021-07-06 13:00:03 UTC,32,So let's move over to the code. And we see her...,Training and Testing an Italian BERT - Transfo...,https://youtu.be/35Pdoyi6ZoQ
3,35Pdoyi6ZoQ-t51.519999999999996,"{'channel_id': 'UCv83tO5cePwHMt1952IVVHw', 'en...",UCv83tO5cePwHMt1952IVVHw,125,2021-07-06 13:00:03 UTC,51,"PyTorch data loader, ready. And we can begin t...",Training and Testing an Italian BERT - Transfo...,https://youtu.be/35Pdoyi6ZoQ
4,35Pdoyi6ZoQ-t67.28,"{'channel_id': 'UCv83tO5cePwHMt1952IVVHw', 'en...",UCv83tO5cePwHMt1952IVVHw,140,2021-07-06 13:00:03 UTC,67,So when we're training a model for mass langua...,Training and Testing an Italian BERT - Transfo...,https://youtu.be/35Pdoyi6ZoQ
...,...,...,...,...,...,...,...,...,...
2280,coaaSxys5so-t779.0,"{'channel_id': 'UCv83tO5cePwHMt1952IVVHw', 'en...",UCv83tO5cePwHMt1952IVVHw,964,2022-07-07 13:24:35 UTC,779,And then we use this get embedding to actually...,How to build next-level Q&A with OpenAI,https://youtu.be/coaaSxys5so
2281,coaaSxys5so-t815.0,"{'channel_id': 'UCv83tO5cePwHMt1952IVVHw', 'en...",UCv83tO5cePwHMt1952IVVHw,1003,2022-07-07 13:24:35 UTC,815,"OK, so get the embedding. Where is that? So he...",How to build next-level Q&A with OpenAI,https://youtu.be/coaaSxys5so
2282,coaaSxys5so-t857.0,"{'channel_id': 'UCv83tO5cePwHMt1952IVVHw', 'en...",UCv83tO5cePwHMt1952IVVHw,1051,2022-07-07 13:24:35 UTC,857,This is just all of the source context that we...,How to build next-level Q&A with OpenAI,https://youtu.be/coaaSxys5so
2283,coaaSxys5so-t898.0,"{'channel_id': 'UCv83tO5cePwHMt1952IVVHw', 'en...",UCv83tO5cePwHMt1952IVVHw,1100,2022-07-07 13:24:35 UTC,898,"So let's go with, let's restrict everything to...",How to build next-level Q&A with OpenAI,https://youtu.be/coaaSxys5so


In [8]:
# Inserting documents into Pinecone

# Connect to index and print the index statistics
index = pc.Index("rag-index")
   
batch_limit = 100

for batch in np.array_split(youtube_df, len(youtube_df) / batch_limit):
    # Extract the metadata from each row
    metadatas = [{
      "text_id": row['id'],
      "text": row['text'],
      "title": row['title'],
      "url": row['url'],
      "published": row['published']} for _, row in batch.iterrows()]
    texts = batch['text'].tolist()
    
    ids = [str(uuid4()) for _ in range(len(texts))]
    
    # Encode texts using OpenAI
    response = client.embeddings.create(input=texts, model="text-embedding-3-small")
    embeds = [np.array(x.embedding) for x in response.data]
    
    # Upsert vectors to the correct namespace
    index.upsert(vectors=zip(ids, embeds, metadatas), namespace='youtube_rag_dataset')

print(index.describe_index_stats())#print(index.describe_index_stats())

  return bound(*args, **kwds)


{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'youtube_rag_dataset': {'vector_count': 2285}},
 'total_vector_count': 2285,
 'vector_type': 'dense'}


### Building a retrieval function

In [9]:
# Define a retrieve function that takes four arguments: query, top_k, namespace, and emb_model
def retrieve(query, top_k, namespace, emb_model):
    # Encode the input query using OpenAI
    query_response = client.embeddings.create(
        input=query,
        model=emb_model
    )
    
    query_emb = query_response.data[0].embedding
    
    # Query the index using the query_emb
    docs = index.query(vector=query_emb, top_k=top_k, namespace=namespace, include_metadata=True)
    
    retrieved_docs = []
    sources = []
    for doc in docs['matches']:
        retrieved_docs.append(doc['metadata']['text'])
        sources.append((doc['metadata']['title'], doc['metadata']['url']))
        
    return retrieved_docs, sources

documents, sources = retrieve(
  query="How to build next-level Q&A with OpenAI",
  top_k=3,
  namespace='youtube_rag_dataset',
  emb_model="text-embedding-3-small"
)
print(documents)
print(sources)

["to use for Open Domain Question Answering. We're going to start with a few examples. Over here we have Google and we can ask Google questions like we would a normal person. So we can say, how do I tie my shoelaces? So what we have right here is three components to the question and answer. And I want you to remember these because these are relevant for what we are going to be building. We have the query at the top. We have what we can refer to as a context, which is the video, which is where we're getting this small, more specific answer from. And we can ask another question. Is Google SkyNet? So we have our question at the top. We have this paragraph, which is our context. And then we have the answer, which is yes, which is highlighted here. So it's slightly different to the previous one where we had the video. This time we have actual text, which is our context. And this is more aligned with what we will see throughout this video as well. Now, what we really want to be asking here i

### RAG Q-A Function

In [11]:
def prompt_with_context_builder(query, docs):
    delim = '\n\n---\n\n'
    prompt_start = 'Answer the question based on the context below.\n\nContext:\n'
    prompt_end = f'\n\nQuestion: {query}\nAnswer:'

    prompt = prompt_start + delim.join(docs) + prompt_end
    return prompt

query = "How to build next-level Q&A with OpenAI"

# Retrieve the top three most similar documents and their sources
documents, sources = retrieve(query, top_k=3, namespace='youtube_rag_dataset', emb_model="text-embedding-3-small")

prompt_with_context = prompt_with_context_builder(query, documents)
print(prompt_with_context)

def question_answering(prompt, sources, chat_model):
    sys_prompt = "You are a helpful assistant that always answers questions."
    
    # Use OpenAI chat completions to generate a response
    res = client.chat.completions.create(
        model=chat_model,
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    answer = res.choices[0].message.content.strip()
    answer += "\n\nSources:"
    for source in sources:
        answer += "\n" + source[0] + ": " + source[1]
    
    return answer

answer = question_answering(
  prompt=prompt_with_context,
  sources=sources,
  chat_model='gpt-4o-mini')
print(answer)

Answer the question based on the context below.

Context:
to use for Open Domain Question Answering. We're going to start with a few examples. Over here we have Google and we can ask Google questions like we would a normal person. So we can say, how do I tie my shoelaces? So what we have right here is three components to the question and answer. And I want you to remember these because these are relevant for what we are going to be building. We have the query at the top. We have what we can refer to as a context, which is the video, which is where we're getting this small, more specific answer from. And we can ask another question. Is Google SkyNet? So we have our question at the top. We have this paragraph, which is our context. And then we have the answer, which is yes, which is highlighted here. So it's slightly different to the previous one where we had the video. This time we have actual text, which is our context. And this is more aligned with what we will see throughout this vid