### References for using Pinecone and Chat Retrieval

- https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html
- https://blog.langchain.dev/retrieval/
- https://github.com/PineappleExpress808/lex-gpt/blob/main/scripts/get_data.ipynb

### Imports

In [144]:
import langchain 
import importlib
from langchain.chains import ConversationalRetrievalChain

from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains import LLMChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT


In [145]:
from sentence_transformers import SentenceTransformer


In [4]:
from datasets import load_dataset
import pinecone
from sentence_transformers import SentenceTransformer

# load the squad dataset into a pandas dataframe
df = load_dataset("squad", split="train").to_pandas()
# select only title and context column
df = df[["title", "context"]]
# drop rows containing duplicate context passages
df = df.drop_duplicates(subset="context")
df

Found cached dataset squad (/Users/tanguyrenaudie/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


Unnamed: 0,title,context
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha..."
5,University_of_Notre_Dame,"As at most other universities, Notre Dame's st..."
10,University_of_Notre_Dame,The university is the major seat of the Congre...
15,University_of_Notre_Dame,The College of Engineering was established in ...
20,University_of_Notre_Dame,All of Notre Dame's undergraduate students are...
...,...,...
87574,Kathmandu,"Institute of Medicine, the central college of ..."
87579,Kathmandu,Football and Cricket are the most popular spor...
87584,Kathmandu,The total length of roads in Nepal is recorded...
87589,Kathmandu,The main international airport serving Kathman...


In [3]:
df.iloc[10, 1]

"Father Joseph Carrier, C.S.C. was Director of the Science Museum and the Library and Professor of Chemistry and Physics until 1874. Carrier taught that scientific research and its promise for progress were not antagonistic to the ideals of intellectual and moral culture endorsed by the Church. One of Carrier's students was Father John Augustine Zahm (1851–1921) who was made Professor and Co-Director of the Science Department at age 23 and by 1900 was a nationally prominent scientist and naturalist. Zahm was active in the Catholic Summer School movement, which introduced Catholic laity to contemporary intellectual issues. His book Evolution and Dogma (1896) defended certain aspects of evolutionary theory as true, and argued, moreover, that even the great Church teachers Thomas Aquinas and Augustine taught something like it. The intervention of Irish American Catholics in Rome prevented Zahm's censure by the Vatican. In 1913, Zahm and former President Theodore Roosevelt embarked on a ma

### Index and Retriever

In [8]:
pinecone.init(
    api_key="8fdad3f2-158b-474a-813d-372dc9d218ce",
    environment="us-east4-gcp"
)
pinecone.list_indexes()
index_name = "extractive-question-answering"

# check if the extractive-question-answering index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=384,
        metric="cosine"
    )
# connect to extractive-question-answering index we created
index = pinecone.Index(index_name)
retriever = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device='cpu')
retriever

Downloading (…)5fedf/.gitattributes: 100%|██████████| 737/737 [00:00<00:00, 400kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 113kB/s]
Downloading (…)2cb455fedf/README.md: 100%|██████████| 11.5k/11.5k [00:00<00:00, 4.45MB/s]
Downloading (…)b455fedf/config.json: 100%|██████████| 612/612 [00:00<00:00, 232kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 34.1kB/s]
Downloading (…)edf/data_config.json: 100%|██████████| 25.5k/25.5k [00:00<00:00, 9.52MB/s]
Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:13<00:00, 6.66MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 21.3kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 45.6kB/s]
Downloading (…)5fedf/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 6.55MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 383/383 [00:00<00:00, 153kB/s]
Downloading (…)fedf/train_script.py: 100%|██████

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [9]:
from tqdm.auto import tqdm

# we will use batches of 64
batch_size = 64

for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(df))
    # extract batch
    batch = df.iloc[i:i_end]
    # generate embeddings for batch
    emb = retriever.encode(batch["context"].tolist()).tolist()
    # get metadata
    meta = batch.to_dict(orient="records")
    # create unique IDs
    ids = [f"{idx}" for idx in range(i, i_end)]
    # add all to upsert list
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

# check that we have all vectors in index
index.describe_index_stats()

100%|██████████| 296/296 [16:55<00:00,  3.43s/it]


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 18891}},
 'total_vector_count': 18891}

In [17]:
matchdict = index.query(
  vector=[0.3]*384,
  top_k=3,
  include_values=True
)
print(type(matchdict).mro())


[<class 'pinecone.core.client.model.query_response.QueryResponse'>, <class 'pinecone.core.client.model_utils.ModelNormal'>, <class 'pinecone.core.client.model_utils.OpenApiModel'>, <class 'object'>]


### Batching upserts
For clients upserting larger amounts of data, you should insert data into an index in batches of 100 vectors or fewer over multiple upsert requests.

Articles:
- what is the size of one vectors 
- 

In [32]:
for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(df))
    # extract batch
    batch = df.iloc[i:i_end]
    # generate embeddings for batch
    emb = retriever.encode(batch["context"].tolist()).tolist()
    print(len(batch))
    print(len(emb))
    print("embedding shape: ", len(emb[0]))
    # get metadata
    meta = batch.to_dict(orient="records") #batch : pd.DataFrame -> [{title1: , context1:}, {title2: , context2:} ...]
    # create unique IDs
    ids = [f"{idx}" for idx in range(i, i_end)]
    # add all to upsert list
    to_upsert = list(zip(ids, emb, meta))
    upsert0 = to_upsert[0]
    print("meta", upsert0[2], to_upsert[1][2], to_upsert[2][2])
    # # upsert/insert these records to pinecone
    # _ = index.upsert(vectors=to_upsert)
    break

  0%|          | 0/296 [00:02<?, ?it/s]

64
64
embedding shape:  384
meta {'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'} {'title': 'University_of_Notre_Dame', 'context': "As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and televis




In [46]:
namespaces, dim, _, vcount = index.describe_index_stats().to_dict()

In [41]:
getattributes = lambda obj: [k for k in dir(obj) if not k.startswith('_')]

In [60]:
import random
import itertools
index2 =  pinecone.Index("extractive-question-answering")
print(getattributes(index2))
print(index2.describe_index_stats())
print(id(index2))
print(id(index))
print(index==index2)
dim = index.describe_index_stats().get("dimension")

def chunks(iterable, batch_size=100):
    """A helper function to break an iterable into chunks of size batch_size."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

vector_dim = int(dim)
vector_count = 1000
chars = 'abcdefghijklmnopqrstuvwxyz'
# Example generator that generates many (id, vector) pairs
example_data_generator = map(lambda i: (f'id-{i}', [random.random() for _ in range(vector_dim)] ), range(vector_count)) #(id : str,emb : (bz,384 ), but no metadata : dict)

print(next(example_data_generator)[1])
#can i add metadata to the upsert?
#i want batches of size 100 vectors, so i want to upsert 100 vectors at a time
# Upsert data with 100 vectors per upsert request
for ids_vectors_chunk in tqdm(chunks(example_data_generator, batch_size=100)):
    index2.upsert(vectors=ids_vectors_chunk)  # Assuming `index` defined elsewhere




['call_api', 'close', 'configuration', 'cookie', 'default_headers', 'delete', 'describe_index_stats', 'deserialize', 'fetch', 'files_parameters', 'get_file_data_and_close_file', 'parameters_to_multipart', 'parameters_to_tuples', 'pool', 'pool_threads', 'query', 'request', 'rest_client', 'sanitize_for_serialization', 'select_header_accept', 'select_header_content_type', 'set_default_header', 'update', 'update_params_for_auth', 'upsert', 'upsert_from_dataframe', 'user_agent']
{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 19890}},
 'total_vector_count': 19890}
5980346480
5676015936
False
[0.9874296963714088, 0.4573333156705156, 0.801802522376049, 0.22166061172826634, 0.3527397609671158, 0.31673023760138597, 0.8972480312118875, 0.8730149597797916, 0.7365002704881274, 0.6704337005474801, 0.2477178324195629, 0.18059122309522735, 0.4055164095652404, 0.5390656873564539, 0.37416966530851314, 0.934641660391679, 0.10035276506416546, 0.17644310650358275, 0.7603411

10it [03:53, 23.38s/it]


### Benchmarking
Benchmarking upserts:
- including metadata for 1000 vectors : 3 min
- no metadata: 3 min environ 

Benhmarking query time:
- including values and metadata takes more time (20X more)
- 

In [58]:
import numpy as np
queryVector = np.abs(np.random.randn(dim,))
print(queryVector[:5])
queryRes = index2.query(vector = queryVector.tolist(),top_k=2, include_values=True, include_metadata=True)


[0.78027482 1.89486151 0.47939304 0.12452854 0.81482226]


In [64]:
queryRes['matches'][1]["metadata"]

{'context': 'w', 'title': 'title-854'}

### Testing querying 

In [1]:
# Update - 
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain import OpenAI
from langchain.chains import ConversationalRetrievalChain
import config
import os
import importlib
importlib.reload(config)
from langchain.chat_models import ChatOpenAI
import config,os, pinecone

from langchain.vectorstores import Pinecone
#add_documents, add_texts
import re



  from tqdm.autonotebook import tqdm


### Testing Conversational Retrieval Chain with Return Sources 

We cannot use the Embedding provided by the retriever, because it is not compatible with the Embedding used by ChatGPT.

In [2]:
#recreating index
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter

def getDocs():
    if 'testarticles' in os.listdir():
        path_to_articles = './testarticles'
    else: 
        path_to_articles = 'backend/testarticles'
        # iterate over files in testarticles directory
    for file in os.listdir(path_to_articles):
        if file.endswith(".txt"):
            with open(os.path.join(path_to_articles,file), "r") as f:
                github_url = f"{file}"
                yield {"page_content":f.read(), "metadata":{"source": github_url}}

chunksize = 512 #important parameter
sources:dict = getDocs()
source_chunks = []
splitter = CharacterTextSplitter(separator=" ", chunk_size=chunksize, chunk_overlap=0)
for source in sources:
    print(source.get('metadata'))
    for chunk in splitter.split_text(source.get("page_content")):
        newdoc = Document(page_content=chunk, metadata=source.get('metadata').copy())
        source_chunks.append(newdoc)
    print("----------")


print([doc.metadata for doc in source_chunks])


{'source': 'articleIncome.txt'}
----------
{'source': 'articleIsrael.txt'}
----------
{'source': 'articleWorldBank.txt'}
----------
[{'source': 'articleIncome.txt'}, {'source': 'articleIncome.txt'}, {'source': 'articleIncome.txt'}, {'source': 'articleIncome.txt'}, {'source': 'articleIncome.txt'}, {'source': 'articleIncome.txt'}, {'source': 'articleIncome.txt'}, {'source': 'articleIncome.txt'}, {'source': 'articleIncome.txt'}, {'source': 'articleIncome.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleIsrael.txt'}, {'source': 'articleWorldBank.txt'}, {'source': 'articleWorldBank.txt'}, {'source': 'articleWorldBank.txt'}, 

In [3]:
#checking whethet get relevant documents looks for the top k specific passages then gets the documents relevant to those passages
#or if it just gets the top k documents relevant to the query
source_chunks[22]
#it seeems like it get the top k passages then gets the documents relevant to those passages

Document(page_content='allies bound together by shared values stand together in times of need — not only to\nsupport each other but to reaffirm the inviolable obligations we have to defend those values. And that is\nwhy I am standing up again now.', metadata={'source': 'articleIsrael.txt'})

In [4]:
source_chunks

[Document(page_content='The Income Gap Is Becoming a Physical-Activity Divide\nNationwide, poor children and adolescents are participating far less in sports and fitness\nactivities than their more affluent peers.\nOn a sunny day, teens in running gear gather at the starting line of a track.\nBy Matt Richtel\nUpdated March 25, 2023\nOver the last two decades, technology companies and policymakers warned of a “digital divide”\nin which poor children could fall behind their more affluent peers without equal access to\ntechnology. Today,', metadata={'source': 'articleIncome.txt'}),
 Document(page_content='with widespread internet access and smartphone ownership, the gap has\nnarrowed sharply.\nBut with less fanfare a different division has appeared: Across the country, poor children and\nadolescents are participating far less in sports and fitness activities than more affluent\nyoungsters are. Call it the physical divide.\nData from multiple sources reveal a significant gap in sports part

In [5]:
[doc.metadata for doc in source_chunks]

[{'source': 'articleIncome.txt'},
 {'source': 'articleIncome.txt'},
 {'source': 'articleIncome.txt'},
 {'source': 'articleIncome.txt'},
 {'source': 'articleIncome.txt'},
 {'source': 'articleIncome.txt'},
 {'source': 'articleIncome.txt'},
 {'source': 'articleIncome.txt'},
 {'source': 'articleIncome.txt'},
 {'source': 'articleIncome.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleIsrael.txt'},
 {'source': 'articleWorldBank.txt'},
 {'source': 'articleWorldBank.txt'},
 {'source': 'articleWorldBank.txt'},
 {'source': 'articleWorldBank.txt'},
 {'source': 'articleWorldBank.txt'},
 {'source': 'articleWorldBank.txt

In [6]:
metadatas = [doc.metadata for doc in source_chunks]
[id(obj) for obj in metadatas]

[4895061696,
 4895062080,
 4895061440,
 4895062336,
 4570327232,
 4895062272,
 4895062464,
 4570382528,
 4895062144,
 4895062848,
 4891517120,
 4895061824,
 4570702400,
 4895062976,
 4570710336,
 4895062720,
 4895063488,
 4895063680,
 4895063808,
 4895063936,
 4895064128,
 4895064256,
 4895064384,
 4889641536,
 4895061632,
 4570712576,
 4895064512,
 4894972032,
 4895065088,
 4895065216,
 4895065344,
 4895065472,
 4895065600]

In [12]:
importlib.reload(pinecone)
os.environ['OPENAI_API_KEY'] = config.Config.openai_api_key
os.environ['PINECONE_API_KEY'] = config.Config.pinecone_api_key

pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),  
    environment="us-east4-gcp",
)

pinecone.delete_index('extractive-qa2')


In [11]:
#https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html
#https://blog.langchain.dev/retrieval/

os.environ['OPENAI_API_KEY'] = config.Config.openai_api_key
os.environ['PINECONE_API_KEY'] = config.Config.pinecone_api_key

pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),  
    environment="us-east4-gcp",
)


index_name = 'extractive-qa2'
if not index_name in pinecone.list_indexes():
    # Create a new index with the given name and dimension
    pinecone.create_index(index_name, dimension=1536)
    print(f"The {index_name} index has been created")
index = pinecone.Index(index_name)



p = Pinecone.from_existing_index(index_name, embedding=OpenAIEmbeddings())

#upsert takes (id,emb,meta) in batches of size 100 or a list of documents of type Document

# un comment this line to upsert the data


p._index.describe_index_stats()
# indexes = p.add_documents(source_chunks) #metadatas = [doc.metadata for doc in documents]
#(id, emb, meta)

#how is the metadata becoming {source: 'articleIsrael.txt', text : 'allies...'}
# print(indexes[:5])




{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 33}},
 'total_vector_count': 33}

In [14]:
p._index.fetch(indexes[:2])

{'namespace': '',
 'vectors': {'b8687f6d-ea0b-4e94-8089-6a8dc204c9e4': {'id': 'b8687f6d-ea0b-4e94-8089-6a8dc204c9e4',
                                                      'metadata': {'source': 'articleIncome.txt',
                                                                   'text': 'The '
                                                                           'Income '
                                                                           'Gap '
                                                                           'Is '
                                                                           'Becoming '
                                                                           'a '
                                                                           'Physical-Activity '
                                                                           'Divide\n'
                                                                           'Nationwide, '
               

In [30]:
vectors_fetched = p._index.fetch(ids = indexes[:5])['vectors']
vectors_fetched
for k in range(5):
    print(vectors_fetched[indexes[k]]['metadata']['text'])

The Income Gap Is Becoming a Physical-Activity Divide
Nationwide, poor children and adolescents are participating far less in sports and fitness
activities than their more affluent peers.
On a sunny day, teens in running gear gather at the starting line of a track.
By Matt Richtel
Updated March 25, 2023
Over the last two decades, technology companies and policymakers warned of a “digital divide”
in which poor children could fall behind their more affluent peers without equal access to
technology. Today,
with widespread internet access and smartphone ownership, the gap has
narrowed sharply.
But with less fanfare a different division has appeared: Across the country, poor children and
adolescents are participating far less in sports and fitness activities than more affluent
youngsters are. Call it the physical divide.
Data from multiple sources reveal a significant gap in sports participation by income level. A
Centers for Disease Control and Prevention study found that 70 percent of chi

### Creating Chain

In [9]:
# print(type(p).mro())
def runchain(vectorstore):
     # qa_chain = load_qa_with_sources_chain(OpenAI(), chain_type="stuff")
    # qa = RetrievalQAWithSourcesChain(combine_documents_chain=qa_chain, retriever=vectorstore.as_retriever())
    llm =ChatOpenAI(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    temperature=0,
    model_name='gpt-3.5-turbo'
)
    #path_to_ggml = "/Users/tanguyrenaudie/Documents/TanguyML/gpt4all_model/gpt4all-converted.bin"
    #llmGPT4all = GPT4All(model=path_to_ggml)
    qa = ConversationalRetrievalChain.from_llm(
        llm, vectorstore.as_retriever(), return_source_documents=True)
    return qa

chain = runchain(p)


In [11]:
chat_history = []
query = "What does Israel think of the USA"
result = chain({"question": query, "chat_history": chat_history})
result['answer'], result['source_documents'][0]


calling conversational retrieval chain
question What does Israel think of the USA
chat_history_str 
new_question What does Israel think of the USA
docs [Document(page_content='allies bound together by shared values stand together in times of need — not only to\nsupport each other but to reaffirm the inviolable obligations we have to defend those values. And that is\nwhy I am standing up again now.', metadata={'source': 'articleIsrael.txt'}), Document(page_content='allies bound together by shared values stand together in times of need — not only to\nsupport each other but to reaffirm the inviolable obligations we have to defend those values. And that is\nwhy I am standing up again now.', metadata={'source': 'articleIsrael.txt'}), Document(page_content='allies bound together by shared values stand together in times of need — not only to\nsupport each other but to reaffirm the inviolable obligations we have to defend those values. And that is\nwhy I am standing up again now.', metadata={'

('The given context does not provide a clear answer to what Israel thinks of the USA. It only mentions that the relationship between Israel and the USA is built on shared values such as freedom, equality, and democracy, and that Israel risks weakening its ties to the USA and other free nations if it moves towards authoritarianism.',
 Document(page_content='allies bound together by shared values stand together in times of need — not only to\nsupport each other but to reaffirm the inviolable obligations we have to defend those values. And that is\nwhy I am standing up again now.', metadata={'source': 'articleIsrael.txt'}))

### Creating Chain 2 


ConversationalRetrievalChain with Question Answering with sources
You can also use this chain with the question answering with sources chain.



In [10]:
import langchain 
import importlib
from langchain.chains import ConversationalRetrievalChain

from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains import LLMChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT

def createchain(vectorstore):
     # qa_chain = load_qa_with_sources_chain(OpenAI(), chain_type="stuff")
    # qa = RetrievalQAWithSourcesChain(combine_documents_chain=qa_chain, retriever=vectorstore.as_retriever())
    llm =ChatOpenAI(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    temperature=0,
    model_name='gpt-3.5-turbo'
    )

    question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT) #1. question + history -> question2
    doc_chain = load_qa_with_sources_chain(llm, chain_type="map_reduce") #1. q2 (+ history?) + sources -> [answers with sources] ->  one answer with sources
    doc_chain.return_intermediate_steps = True
    chain = ConversationalRetrievalChain(
    retriever=vectorstore.as_retriever(),
    question_generator=question_generator,
    combine_docs_chain=doc_chain, return_source_documents=True)
    return chain

chain = createchain(p)
from utils.logger import logger
logger.info(repr(chain))
# print(CONDENSE_QUESTION_PROMPT) #given the conv, and a question, rephrase the question 

NameError: name 'p' is not defined

In [26]:
res = p._index.query(vector=OpenAIEmbeddings().embed_query('How is israel judging the united states'), top_k=5, include_values=True, include_metadata = True)


In [27]:
res['matches']

[{'id': '33d6739d-2806-4f40-83f6-a58bbc9d4582',
  'metadata': {'source': 'articleIsrael.txt',
               'text': 'partly on a relationship with the United\n'
                       'States built on shared values — freedom, equality, '
                       'democracy — that can only be sustained by a\n'
                       'commitment to the rule of law, including an independent '
                       'judiciary capable of upholding it. If Israel\n'
                       'retreats from that long-term commitment and moves its '
                       'model of governance toward one that mirrors\n'
                       'those of authoritarian countries, it risks weakening '
                       'its ties to the United States and other free nations.\n'
                       'That would be a devastating loss for Israel’s security, '
                       'harm prospects'},
  'score': 0.843657732,
  'values': [0.00232430687,
             -0.008389703,
             0.0039108

In [28]:
res['matches']
[(obj['id'], obj['metadata']["text"]) for obj in res['matches']]

[('33d6739d-2806-4f40-83f6-a58bbc9d4582',
  'partly on a relationship with the United\nStates built on shared values — freedom, equality, democracy — that can only be sustained by a\ncommitment to the rule of law, including an independent judiciary capable of upholding it. If Israel\nretreats from that long-term commitment and moves its model of governance toward one that mirrors\nthose of authoritarian countries, it risks weakening its ties to the United States and other free nations.\nThat would be a devastating loss for Israel’s security, harm prospects'),
 ('9f755d3a-a8f6-4358-b8e1-5be2e1fe4e5e',
  'Netanyahu will convince his coalition of the need to heed President Isaac Herzog’s\nplea to pull back and slow down.\nIn 2014, when the Federal Aviation Administration prohibited U.S. airlines from flying to Israel after a\nHamas rocket landed near Ben-Gurion airport, I boarded an El Al flight, never fearing any danger. Israel\ntakes extraordinary measures to ensure the security of airl

In [29]:
p.similarity_search("How is Israel's response?", k=20) #does not work, only the same doc returned

[Document(page_content='Netanyahu will convince his coalition of the need to heed President Isaac Herzog’s\nplea to pull back and slow down.\nIn 2014, when the Federal Aviation Administration prohibited U.S. airlines from flying to Israel after a\nHamas rocket landed near Ben-Gurion airport, I boarded an El Al flight, never fearing any danger. Israel\ntakes extraordinary measures to ensure the security of airline passengers, and it correctly argued that\nbanning flights amounted to a capitulation to Hamas that would effectively close', metadata={'source': 'articleIsrael.txt'}),
 Document(page_content='the country’s economy,\ngiven air travel is the only practical way to get in and out for nearly all travelers. I wanted to stand with\nIsrael against Hamas, by highlighting the safety of travel to Israel and urging the Obama administration to\nreverse course — which it soon did, to its credit.\nGreeting me on the tarmac that day was Prime Minister Benjamin Netanyahu. He thanked me for my\

In [33]:

chain = runchain(p)
chat_history = []
query = "How has climate become an issue for the World Wide Bank?"
result = chain({"question": query, "chat_history": chat_history})


calling conversational retrieval chain
question How has climate become an issue for the World Wide Bank?
chat_history_str 
new_question How has climate become an issue for the World Wide Bank?
docs [Document(page_content='World Bank has\nplayed an active role in making progress in those areas. It has begun to help countries\nincorporate climate change into their overall economic development plans and should continue\nthis necessary work.\nClimate-related funding has already grown in importance at the bank; in fact, some of the\npoorest countries are already worried that it will cut into funding for basics like education and\nhealth care. That’s why additional funding is needed to assure them that taking global action on\nclimate', metadata={'source': 'articleWorldBank.txt'}), Document(page_content='Bank rules,\ndespite their acute vulnerability to climate change. Those rules should be revisited, in some\ncases, to make sure that climate financing is prioritizing the areas that will mak

In [36]:
result['answer']


"Climate has become an important issue for the World Bank as it has increased its funding for climate-related projects, which some of the poorest countries fear may cut into funding for basic needs like education and healthcare. The bank's rules should be revisited to prioritize areas that will make the biggest difference in climate financing, and it should provide more grants and below-market financing related to climate. The World Bank and multilateral development banks provided only a small percentage of their adaptation and mitigation finance through climate-related projects. \nSOURCES: articleWorldBank.txt"

In [37]:
result['source_documents'][3]

Document(page_content='of record-keeping, threatening millions with famine, even though the\nentire continent of Africa contributes less than 4 percent of global carbon emissions.\nThe World Bank and the donor countries that control it can do more to step up and tackle this\ngenerational challenge. To make the World Bank and other multilateral lending institutions fit for\npurpose in the 21st century, leaders need to figure out how to raise and leverage the massive\namounts of capital that are going to be necessary in the coming years', metadata={'source': 'articleWorldBank.txt'})

In [137]:
result['intermediate_steps']



KeyError: 'intermediate_steps'

In [114]:
p._index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [40]:
p.as_retriever().get_relevant_documents("Does Israel view USA as an ally")

[Document(page_content='allies bound together by shared values stand together in times of need — not only to\nsupport each other but to reaffirm the inviolable obligations we have to defend those values. And that is\nwhy I am standing up again now.', metadata={'source': 'articleIsrael.txt'}),
 Document(page_content='allies bound together by shared values stand together in times of need — not only to\nsupport each other but to reaffirm the inviolable obligations we have to defend those values. And that is\nwhy I am standing up again now.', metadata={'source': 'articleIsrael.txt'}),
 Document(page_content='allies bound together by shared values stand together in times of need — not only to\nsupport each other but to reaffirm the inviolable obligations we have to defend those values. And that is\nwhy I am standing up again now.', metadata={'source': 'articleIsrael.txt'}),
 Document(page_content='allies bound together by shared values stand together in times of need — not only to\nsupport 

In [48]:
p.as_retriever().__class__.mro()
from langchain.vectorstores.base import VectorStoreRetriever
p.similarity_search("Does Israel view USA as an ally", k=10)

"""Return pinecone documents most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Dictionary of argument(s) to filter on metadata
            namespace: Namespace to search in. Default will search in '' namespace.

        Returns:
            List of Documents most similar to the query and score for each
        """
query = "Does Israel view USA as an ally"
queryobj = p._embedding_function(query)
queryobj.__len__()
results = p._index.query(
            [queryobj],
            top_k=5,
            include_metadata=True,
        )

In [56]:
ids = [res['id'] for res in results['matches']]
ids
vectors = p._index.fetch(ids)['vectors']
for vect in vectors.values():
    print(vect['metadata']['text'])

allies bound together by shared values stand together in times of need — not only to
support each other but to reaffirm the inviolable obligations we have to defend those values. And that is
why I am standing up again now.
allies bound together by shared values stand together in times of need — not only to
support each other but to reaffirm the inviolable obligations we have to defend those values. And that is
why I am standing up again now.
allies bound together by shared values stand together in times of need — not only to
support each other but to reaffirm the inviolable obligations we have to defend those values. And that is
why I am standing up again now.
allies bound together by shared values stand together in times of need — not only to
support each other but to reaffirm the inviolable obligations we have to defend those values. And that is
why I am standing up again now.
allies bound together by shared values stand together in times of need — not only to
support each other but 

In [16]:
import pinecone

index_name = 'extractive-qa2'

# Retrieve the index object
index = pinecone.Index(index_name)

# Retrieve all the vectors from the index
vectors = index.query(p._embedding_function(" "), top_k=40, include_metadata=True)['matches']

# Check for duplicates in the vector list
print(vectors[10])
# if len(unique_vectors) != len(vectors):
#     print("There are duplicates in the index")
# else:
#     print("There are no duplicates in the index")


{'id': '4788dc3c-66a1-4a0d-bf26-42b046f5e07b',
 'metadata': {'source': 'articleIsrael.txt',
              'text': 'for a peaceful resolution of the\n'
                      'Palestinian conflict and could even imperil the future '
                      'of the Jewish homeland. It would also undermine\n'
                      'the deep attachment millions of people around the world '
                      'feel toward the country, often because of the\n'
                      'pride our parents instilled in us not only for its '
                      'Jewish character but also for its strong commitment to\n'
                      'freedom.\n'
                      'In the United States, our founding fathers’ insistence '
                      'on checks and balances to control the tyrannical\n'
                      'tendencies of majorities was part of their genius. Our'},
 'score': 0.744755268,
 'values': []}


### WTF is this?

In [17]:
print(len(vectors))
print(vectors[25])
print(vectors[26])
print(vectors[27])
print(vectors[28])
print(vectors[29])
print(vectors[30])



33
{'id': '34b32c55-046e-4d24-ad46-5e3462256f54',
 'metadata': {'source': 'articleIncome.txt',
              'text': 'students qualify for free lunch. Here, she said, she\n'
                      'was seeing the impact of “this club and school '
                      'divide.”\n'
                      'More affluent children are often highly trained in '
                      'sports — “a little bit ahead,” said Ms.\n'
                      'Paulls-Neal, who is also the executive director of the '
                      'New Mexico chapter of the Society of\n'
                      'Health and Physical Educators, or SHAPE America. “And '
                      'they are more comfortable moving,\n'
                      'where the students in low-income areas are not.”\n'
                      'A similar pattern is emerging in Unit District No. 5 in '
                      'McLean County, Ill. Faced'},
 'score': 0.734241962,
 'values': []}
{'id': 'ea60d620-cd49-4c01-90fd-4099d97947d3',
 '

### How is the metadata stored?

In [103]:
#make a new index with the same name

index = pinecone.Index(index_name)


# for i in tqdm(range(0, len(df), batch_size)):
#     # find end of batch
#     i_end = min(i+batch_size, len(df))
#     # extract batch
#     batch = df.iloc[i:i_end]
#     # generate embeddings for batch
#     emb = retriever.encode(batch["context"].tolist()).tolist()
#     # get metadata
#     meta = batch.to_dict(orient="records") # [{title: "title", text: "text"}, ...}]
#     # create unique IDs
#     ids = [f"{idx}" for idx in range(i, i_end)]
#     # add all to upsert list
#     to_upsert = list(zip(ids, emb, meta))
#     # upsert/insert these records to pinecone
#     _ = index.upsert(vectors=to_upsert)




# check that we have all vectors in index


In [104]:
index

In [99]:
emb = OpenAIEmbeddings()
embedding5vectors = emb.embed_documents([doc.page_content for doc in source_chunks[:5]])
embedding5vectors.__len__()
embedding5vectors[0].__len__()
#to_upsert = list(zip(ids, emb, meta))
import uuid
ids = [str(uuid.uuid4()) for i in range(5)]
metas = [doc.page_content for doc in source_chunks[:5]]
index.upsert(zip(ids, embedding5vectors, metas))

#instead of 

indexes = p.add_documents(source_chunks)

AttributeError: 'NoneType' object has no attribute 'upsert'

### Ask question function

In [32]:
chain.__class__

langchain.chains.conversational_retrieval.base.ConversationalRetrievalChain

In [15]:
def ask_question(question: str, vectorstore:Pinecone,chain :ConversationalRetrievalChain, chat_history: list[dict] = None,  ) -> dict:
    """Use a question and the chat history to return the answer and the source documents.
    Updates chat history with the question and answer.
    Returns: dict with keys "answer", "sources"
    - answer: str
    - sources: list of dicts
        - filename: str
        - text : str
        - page : str #not yet implemented
        - etc
    """
    result = chain({"question": question, "chat_history": chat_history})
    chat_history.append({"question": question, "answer": result["answer"]})
    answer = result['answer']
    sources = []

    for sourcedoc in result['source_documents']:
        sources.append({'filename' : sourcedoc.metadata['source'], 'text': sourcedoc.page_content})
    return {"answer": answer, "sources":sources}


In [16]:
chat_history = []
chain = runchain(p)

In [17]:
result2 = ask_question('how does the wage gap income inequality affect sports and kids health', p, chat_history=chat_history, chain=chain)

calling conversational retrieval chain
question how does the wage gap income inequality affect sports and kids health
chat_history_str 
new_question how does the wage gap income inequality affect sports and kids health
docs [Document(page_content='with widespread internet access and smartphone ownership, the gap has\nnarrowed sharply.\nBut with less fanfare a different division has appeared: Across the country, poor children and\nadolescents are participating far less in sports and fitness activities than more affluent\nyoungsters are. Call it the physical divide.\nData from multiple sources reveal a significant gap in sports participation by income level. A\nCenters for Disease Control and Prevention study found that 70 percent of children from\nfamilies', metadata={'source': 'articleIncome.txt'}), Document(page_content='The Income Gap Is Becoming a Physical-Activity Divide\nNationwide, poor children and adolescents are participating far less in sports and fitness\nactivities than the

In [18]:
result2

{'answer': 'The wage gap income inequality affects sports participation among children, with those from families with lower incomes being less likely to participate in team sports and physical activity outside of school. This disparity is concerning as physical activity is linked to better health and academic outcomes. The privatization of sports has contributed to this gap, with participation rates falling among children from families making less than $50,000 a year. The disparity in sports participation rates is higher among families at or below the poverty line. \nSOURCES: articleIncome.txt',
 'sources': [{'filename': 'articleIncome.txt',
   'text': 'with widespread internet access and smartphone ownership, the gap has\nnarrowed sharply.\nBut with less fanfare a different division has appeared: Across the country, poor children and\nadolescents are participating far less in sports and fitness activities than more affluent\nyoungsters are. Call it the physical divide.\nData from mult

In [None]:
def ask_question2(question: str, vectorstore:Pinecone,chain :ConversationalRetrievalChain, chat_history: list[dict] = None,  ) -> dict:
    """This version removes the chat history from the chain input on the second LLM
    Use a question and the chat history to return the answer and the source documents.
    Updates chat history with the question and answer.
    Returns: dict with keys "answer", "sources"
    - answer: str
    - sources: list of dicts
        - filename: str
        - text : str
        - page : str #not yet implemented
        - etc

    """
    #not yet implemented, i need to figure out how to get the chat history out of the chain
    result = chain({"question": question, "chat_history": chat_history})
    chat_history.append({"question": question, "answer": result["answer"]})
    answer = result['answer']
    sources = []

    for sourcedoc in result['source_documents']:
        sources.append({'filename' : sourcedoc.metadata['source'], 'text': sourcedoc.page_content})
    return {"answer": answer, "sources":sources}


### How to remove chat history ? The prompt is weird 

In [35]:
combinedocschain = chain.combine_docs_chain
penult_llm = combinedocschain.llm_chain
penult_llm.prompt
ult_llm = combinedocschain.combine_document_chain #creates summaries from the big vectordb output texts
ult_llm.llm_chain.prompt = "" #i can change it here 



''

### Testing Pinecone namespaces 


In [5]:
# Update - 
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter

import os
import importlib
from langchain.chat_models import ChatOpenAI
import config,os, pinecone

from langchain.vectorstores import Pinecone
#add_documents, add_texts
import re

from utils.redirect_stdout import redirect_stdout_to_logger
from utils.logger import logger
import PyPDF2



In [6]:
#https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html
#https://blog.langchain.dev/retrieval/

os.environ['OPENAI_API_KEY'] = config.Config.openai_api_key
os.environ['PINECONE_API_KEY'] = config.Config.pinecone_api_key

pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),  
    environment="us-east4-gcp",
)


index_name = 'extractive-qa2'
if not index_name in pinecone.list_indexes():
    # Create a new index with the given name and dimension
    pinecone.create_index(index_name, dimension=1536)
    print(f"The {index_name} index has been created")
index = pinecone.Index(index_name)


p = Pinecone.from_existing_index(index_name, embedding=OpenAIEmbeddings())

p._index.describe_index_stats()





{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 143}},
 'total_vector_count': 143}

In [10]:
def save_file_to_Pinecone(filepath:str, vectorstore:Pinecone, sid : str):
    """Reads one file from the temp directory (pdf and .txt files supported) then splits and saves to Pinecone"""

    filename, file_extension = os.path.splitext(filepath)
    filename.replace("\\temp", "")
    if file_extension.lower() == '.pdf':
        pdf_reader = PyPDF2.PdfReader(filepath)
        content = ""

        for page in pdf_reader.pages:
            content += page.extract_text()

    elif file_extension.lower() == '.txt':
        with open(filepath, 'r') as file:
            content = file.read()
    else:
        raise ValueError(f"Invalid file type: {filepath}. Only PDF and text files are supported.")

    #write from filepath, content to Pinecone
    chunksize = 512 #important parameter
    source = {"page_content":content, "metadata":{'source':filepath}}
    source_chunks = []
    splitter = CharacterTextSplitter(separator=" ", chunk_size=chunksize, chunk_overlap=0)
    for i,chunk in enumerate(splitter.split_text(source.get("page_content"))):
        embedded_chunk = vectorstore._embedding_function(chunk)
        newdoc = (str(i), embedded_chunk, source.get('metadata').copy() ) #(i,emb, metadata)
        source_chunks.append(newdoc)
    with redirect_stdout_to_logger(logger):
        indexes = vectorstore._index.upsert(vectors = source_chunks, namespace=sid)

    logger.info(f"added to vectorstore {len(source_chunks)} chunks from {filepath}")
    logger.info(f"vectorstore stats: {vectorstore._index.describe_index_stats()}")


In [8]:
sid = '0c2369f0-740c-4c77-a7c7-4427b5503405'
save_file_to_Pinecone("./temp/sample.txt", p, sid)

In [9]:
p._index.delete(delete_all=True, namespace=sid)
p._index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 143}},
 'total_vector_count': 143}

### Testing Pinecone metadata filtering 

Watchout, overwriting the data if giving same id


/Users/tanguyrenaudie/Documents/TanguyML/MineGPT


In [1]:
# Update - 
%cd ..

from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter

import os
import importlib
from langchain.chat_models import ChatOpenAI
import config,os, pinecone

from langchain.vectorstores import Pinecone
#add_documents, add_texts
import re

from utils.redirect_stdout import redirect_stdout_to_logger
from utils.logger import logger
import PyPDF2
import uuid
from utils.ingest import save_file_to_Pinecone_metadata
from utils.getchain import createchain_with_filter
from utils.ask_question import ask_question


#https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html
#https://blog.langchain.dev/retrieval/

os.environ['OPENAI_API_KEY'] = config.Config.openai_api_key
os.environ['PINECONE_API_KEY'] = config.Config.pinecone_api_key

pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),  
    environment="us-east4-gcp",
)


index_name = 'extractive-qa2'
if not index_name in pinecone.list_indexes():
    # Create a new index with the given name and dimension
    pinecone.create_index(index_name, dimension=1536)
    print(f"The {index_name} index has been created")
index = pinecone.Index(index_name)


p = Pinecone.from_existing_index(index_name, embedding=OpenAIEmbeddings())

p._index.describe_index_stats()





/Users/tanguyrenaudie/Documents/TanguyML/MineGPT


  from tqdm.autonotebook import tqdm


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 71},
                '0c2369f0-740c-4c77-a7c7-4427b5503405': {'vector_count': 11}},
 'total_vector_count': 82}

In [2]:
# chain.retriever.vectorstore.similarity_search(
#     query=[0.1]*1536,
#     top_k=20,
#     include_metadata=True
# )

### Testing delete

In [3]:
p._index.describe_index_stats()

chain = createchain_with_filter(p)

result = ask_question('should the x industry use the y industry', p,chain,[], 'hello')
with redirect_stdout_to_logger(logger):
    print("result", result)

calling conversational retrieval chain
question should the x industry use the y industry
chat_history_str []
answer There is no relevant text to answer the question.
SOURCES: ./temp/sample5.txt
extradict intermediate steps:  ['No relevant text found.', 'There is no relevant text to answer the question.', 'No relevant text found.', 'No relevant text found.']
setting return source documents to false


In [7]:
import os 
filepath = "backend/temp/article.Txt"
filename = os.path.basename(filepath)
filename

'article.Txt'

### changing prompts
Here, we are changing the prompt to allow chatgpt to give his own info. 

In [2]:
chain : ConversationalRetrievalChain = createchain_with_filter(p)

In [6]:
mapreducechain = chain.combine_docs_chain
mapreducechain.llm_chain #returns Relevant text, if any
mapreducechain.combine_document_chain
logger.info(mapreducechain.combine_document_chain)

In [44]:
#if the llm chain does not know, maybe 

In [19]:
chain.combine_docs_chain



In [None]:
chain.combine_docs_chain.return_intermediate_steps = True
result = ask_question('who is joe rogan', p,chain,[], 'hello')
with redirect_stdout_to_logger(logger):
    print("result", result)

In [3]:
result = ask_question('why is the lord of the flies a good shaman', p,chain,[], 'hello')
with redirect_stdout_to_logger(logger):
    print("result", result)

calling conversational retrieval chain
question why is the lord of the flies a good shaman
chat_history_str []
answer I don't know. 
SOURCES: ./temp/sample5.txt
['There is no relevant text to answer this question.', 'There is no relevant text to answer the question.', 'No relevant text found.', 'There is no relevant text in the given portion of the document to answer the question.']


In [37]:
mapreducechain



In [4]:
mapreducechain.combine_docs

NameError: name 'mapreducechain' is not defined

In [16]:
mapreducechain
from langchain.chains import ConversationalRetrievalChain

In [None]:
from langchain.prompts import 

### Fixing Delete 

- create new user id
- send a new article for that user id 
- delete that article 
- 

In [1]:
# Update - 
%cd ..

from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter

import os
import importlib
from langchain.chat_models import ChatOpenAI
import config,os, pinecone

from langchain.vectorstores import Pinecone
#add_documents, add_texts
import re

from utils.redirect_stdout import redirect_stdout_to_logger
from utils.logger import logger
import PyPDF2
import uuid
from utils.ingest import save_file_to_Pinecone_metadata
from utils.getchain import createchain_with_filter
from utils.ask_question import ask_question
import boto3
from config import Config
from io import BytesIO
import tempfile



#https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html
#https://blog.langchain.dev/retrieval/

os.environ['OPENAI_API_KEY'] = config.Config.openai_api_key
os.environ['PINECONE_API_KEY'] = config.Config.pinecone_api_key

pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),  
    environment="us-east4-gcp",
)


index_name = 'extractive-qa2'
if not index_name in pinecone.list_indexes():
    # Create a new index with the given name and dimension
    pinecone.create_index(index_name, dimension=1536)
    print(f"The {index_name} index has been created")
index = pinecone.Index(index_name)


p = Pinecone.from_existing_index(index_name, embedding=OpenAIEmbeddings())

p._index.describe_index_stats()





/Users/tanguyrenaudie/Documents/TanguyML/MineGPTDeploy


  from tqdm.autonotebook import tqdm


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [22]:
p._index.delete(delete_all=True, namespace='')

{}

/dir1
 /dir2
  file1
  notebook1

In [17]:
#make some base Mine file documents for user_id 0 (ADMIN USER)
%cd .. 
print('just like in app.py')
from sqlalchemy import create_engine
import sys

# Import the necessary modules and classes
from app import app, User, DocSource, db

with app.app_context():
    print('inside testing pinecone' , app.config['SQLALCHEMY_DATABASE_URI'])
    print("inside testingPinecone resolved database path:", db.engine.url.database)

def get_docsources_for_user(user_id):
    # Query the DocSource records for the specified user_id
    docsources = DocSource.query.filter_by(user_id=user_id).all()

    # Print out the DocSource records
    print(f"DocSource records for user_id {user_id}:")
    for docsource in docsources:
        print(docsource.to_dict())

def get_users():
    users = User.query.all()
    for user in users:
        print(user.to_dict())

if __name__ == '__main__':
    user_id = 1
    with app.app_context():
        db.create_all()
        users = db.session.execute(db.select(User).order_by(User.id)).scalars()
        print("inside testingPinecone resolved database path:", db.engine.url.database)

        for user in users:
            print(f"user {user}, id {user.id} email: {user.email}, pwd: {user.password}" )

        #create a base user
        # user = User(id = 1, email = 'email@etu.minesparis.psl.eu', password = 'p')
        # db.session.add(user)
        # db.session.commit()

/
just like in app.py
inside testing pinecone sqlite:////Users/tanguyrenaudie/Documents/TanguyML/MineGPTDeploy/users.db
inside testingPinecone resolved database path: /Users/tanguyrenaudie/Documents/TanguyML/MineGPTDeploy/users.db
inside testingPinecone resolved database path: /Users/tanguyrenaudie/Documents/TanguyML/MineGPTDeploy/users.db
user <User 0>, id 0 email: email, pwd: password
user <User 1>, id 1 email: email@etu.minesparis.psl.eu, pwd: p


### From AWS to Pinecone

In [2]:
! pwd

def save_file_AWS_to_Pinecone(metadata:str, vectorstore:Pinecone):
    # Create a new session using the access key and secret access key of the new user

    user_id = metadata.get('user_id')
    file_id = metadata.get('file_id')
    filename_only = metadata.get('filename_only')

    session = boto3.Session(aws_access_key_id=Config.AWS_ACCESS_KEY_ID,
                            aws_secret_access_key=Config.AWS_SECRET_ACCESS_KEY)
    bucket_name = 'minefiles'


    s3_client = session.client('s3')
    
    if not user_id:
        raise ValueError('Access token is missing or invalid')

    try: 
        temp_file_path = os.path.join('temp',filename_only)
        with open(temp_file_path, 'wb') as temp_file:
            s3_client.download_fileobj(bucket_name, filename_only, temp_file)
            temp_file_path = temp_file.name
            print(f"temp_file_path: {temp_file_path}")
            logger.info(
                f"uploading file {temp_file_path} with id {file_id} for user {user_id} ")

            with redirect_stdout_to_logger(logger):
                # add file to Pinecone
                filename_only = os.path.basename(temp_file_path)

                # Save file to Pinecone with metadata
                save_file_to_Pinecone_metadata(temp_file_path, metadata, vectorstore)
    finally:
        os.remove(temp_file_path)




/Users/tanguyrenaudie/Documents/TanguyML/MineGPTDeploy


In [6]:
p._index.delete(delete_all=True, namespace='')

{}

In [8]:
#uploading a couple of documents
metadata = {
    'user_id': 1,
    'file_id': 1,
    'filename_only': 'GE_COURS 5 Transferts thermiques - Rayonnement V2023.pdf'
}
    
save_file_AWS_to_Pinecone(metadata,p)

temp_file_path: temp/GE_COURS 5 Transferts thermiques - Rayonnement V2023.pdf


In [9]:

#uploading a couple of documents
metadata = {
    'user_id': 1,
    'file_id': 2,
    'filename_only': 'Math_S1_CalDiff.pdf'
}
    
save_file_AWS_to_Pinecone(metadata,p)

temp_file_path: temp/Math_S1_CalDiff.pdf


In [7]:
p._index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [16]:
#uploading a couple of documents
metadata = {
    'user_id': 1,
    'file_id': 3,
    'filename_only': 'MicroEco_microeconomie-3.pdf'
}
    
save_file_AWS_to_Pinecone(metadata,p)

temp_file_path: temp/MicroEco_microeconomie-3.pdf


In [17]:
p._index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 215}},
 'total_vector_count': 215}

In [18]:

#uploading a couple of documents
metadata = {
    'user_id': 1,
    'file_id': 4,
    'filename_only': 'PhyStat_cor_pcps1.pdf'
}
    
save_file_AWS_to_Pinecone(metadata,p)

temp_file_path: temp/PhyStat_cor_pcps1.pdf


In [20]:
p._index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 234}},
 'total_vector_count': 234}