In [20]:
# %pip install langchain-community
# %pip install langchain-google-vertexai langchain-core
# %pip install langchain_experimental
# %pip install unstructured
# %pip install nltk
# %pip install jq
# %pip install pypdf
# %pip install rapidocr-onnxruntime
# %pip install -qU langchain-text-splitters
# %pip install numpy
# %pip install wikipedia-api
# %pip install faiss-gpu
# %pip install faiss-cpu
# %pip install google-cloud-aiplatform
# %pip install google-auth
# %pip install chromadb

Collecting langchain_experimental
  Downloading langchain_experimental-0.0.64-py3-none-any.whl (204 kB)
[K     |████████████████████████████████| 204 kB 2.9 MB/s eta 0:00:01
Installing collected packages: langchain-experimental
Successfully installed langchain-experimental-0.0.64
You should consider upgrading via the '/Users/vancence.ho/.pyenv/versions/3.10.0/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import vertexai
from google.oauth2 import service_account

# Vertex AI project settings used by the cells below.
# NOTE(review): CREDS_PATH is an absolute path on one developer's machine;
# consider reading it from an environment variable before sharing the notebook.
PROJECT_ID = "cc-sa-sandbox-20200619"
REGION = "asia-southeast1"
CREDS_PATH = "/Users/vancence.ho/Downloads/cc-sa-sandbox-creds.json"

# Load the service-account key with the cloud-platform scope.
# On failure the message is printed and the original error is re-raised, so
# the cell stops here. A bare `except:` that only prints would let execution
# continue to `vertexai.init` with `credentials` undefined (NameError on a
# fresh kernel) or with a stale value left over from an earlier run.
try:
    credentials = service_account.Credentials.from_service_account_file(
        CREDS_PATH, scopes=["https://www.googleapis.com/auth/cloud-platform"],
    )
except Exception:
    print("Error while getting service account credentials!")
    raise

# Only reached when the credentials were loaded successfully.
vertexai.init(credentials=credentials, project=PROJECT_ID, location=REGION)
print("Vertex AI initialized!")

Vertex AI initialized!


In [21]:
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.vectorstores.chroma import Chroma
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_core.documents import Document
from langchain_experimental.text_splitter import  SemanticChunker

import wikipediaapi
import numpy as np

In [5]:
# Shared Wikipedia client for English-language pages.
wiki_wiki = wikipediaapi.Wikipedia(
    user_agent='sample-rag/1.0 (vancence.ho@ollion.com)',
    language='en',
)


def fetch_wiki_page(title):
    """Return the text of the Wikipedia page called ``title``.

    Looks the page up through the module-level ``wiki_wiki`` client and
    returns ``None`` when the page does not exist.
    """
    page = wiki_wiki.page(title)
    return page.text if page.exists() else None


print("Wikipedia API initialized as: wiki_wiki!")

Wikipedia API initialized as: wiki_wiki!


In [6]:
# Wikipedia article titles to pull into the corpus.
article_titles = ["Python (programming language)", "Artificial Intelligence", "Machine Learning", "Natural Language Processing", "Retrieval Augmented Generation", "OpenAI", "Deep Learning"]

documents = []
# Titles for which fetch_wiki_page returned nothing (None or empty text).
missing_titles = []

for title in article_titles:
    content = fetch_wiki_page(title)
    if content:
        documents.append(Document(page_content=content, metadata={"title": title, "source": "Wikipedia"}))
    else:
        # Remember the title instead of dropping it silently, so a typo or a
        # renamed article is visible in the cell output.
        missing_titles.append(title)

print(f"Number of documents fetched: {len(documents)}")
if missing_titles:
    print(f"Titles skipped (no page text returned): {missing_titles}")

# Show a compact summary per article rather than dumping the full article
# bodies into the notebook output.
for doc in documents:
    print(f"- {doc.metadata['title']}: {len(doc.page_content):,} characters")

Number of documents fetched: 6


In [4]:
# Start a small scratch list of documents with a single entry.
d = [Document(page_content="Hello")]

In [5]:
# Add a second entry to the scratch list (re-running this cell adds it again).
d.extend([Document(page_content="Bye")])

In [6]:
# Display the current contents of the scratch list.
d

[Document(page_content='Hello'), Document(page_content='Bye')]

In [8]:
# Embedding client used by the embedding and vector-store cells.
# NOTE(review): the location is passed as `region=` here, whereas
# `vertexai.init` above takes `location=`. Confirm that VertexAIEmbeddings
# honours `region`; its documented keyword appears to be `location` (TODO verify).
embed = VertexAIEmbeddings(
    model_name="text-embedding-004",
    credentials=credentials,
    project=PROJECT_ID,
    region=REGION,
)

In [8]:
# Smoke test: report the client's type, embed the scratch documents, and
# report the type of what comes back.
print(f"Type of embedding: {type(embed)}")

# Pull the raw text out of each scratch document before embedding.
texts = [sample.page_content for sample in d]
embeddings = embed.embed_documents(texts)
print(f"Type of embeddings: {type(embeddings)}")

Type of embedding: <class 'langchain_google_vertexai.embeddings.VertexAIEmbeddings'>
Type of embeddings: <class 'list'>


### Using SemanticChunker

In [25]:
# Split the fetched articles into semantically coherent chunks, using the
# "gradient" breakpoint strategy.
# The shared `embed` client is reused instead of constructing a second,
# identically configured VertexAIEmbeddings instance, so the model and project
# settings live in one place and cannot drift between cells.
text_splitter = SemanticChunker(embed, breakpoint_threshold_type="gradient")

# `documents` is the list of fetched Wikipedia articles; `docs` holds the chunks.
docs = text_splitter.split_documents(documents)

In [24]:
# Show the metadata attached to the first chunk.
print(docs[0].metadata)

{'title': 'Python (programming language)', 'source': 'Wikipedia'}


In [26]:
# Index the semantic chunks in Chroma.
# An explicit collection name keeps these chunk-level entries separate from
# the whole-article Chroma store built elsewhere in this notebook. Without it,
# both stores fall back to LangChain's default collection name, and entries
# created in the same kernel session can end up in one shared collection.
vector_store = Chroma.from_documents(
    docs,
    embedding=embed,
    collection_name="wiki_semantic_chunks",
)

In [30]:
# Retrieve the single closest chunk for a sample question.
a = vector_store.similarity_search(
    "when was python invented?",
    k=1,
)

In [35]:
# Print the text of the top hit; print() renders its newlines as line breaks.
print(a[0].page_content)

Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library. Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0. Python 2.0 was released in 2000. Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python 2.7.18, released in 2020, was the last release of Python 2. Python consistently ranks as one of the most popular programming languages, and has gained widespread use in the machine learning community. History
Python was invented in the late 1980s by Guido va

### Using ChromaDB

In [9]:
# Index the whole (unchunked) articles in Chroma.
# An explicit collection name keeps these article-level entries separate from
# the chunk-level Chroma store built elsewhere in this notebook. Without it,
# both stores fall back to LangChain's default collection name, and entries
# created in the same kernel session can end up in one shared collection.
chroma_store = Chroma.from_documents(
    documents,
    embedding=embed,
    collection_name="wiki_full_articles",
)


In [14]:
# Query the article-level Chroma store and display the best match.
chroma_store.similarity_search(
    "Python (programming language)",
    k=1,
)



### Using FAISS

In [15]:
# Build a FAISS index over the whole (unchunked) articles.
vectorstore = FAISS.from_documents(
    documents,
    embedding=embed,
)

In [18]:
# Query the FAISS index and display the best match.
vectorstore.similarity_search(
    "nlp",
    k=1,
)

[Document(metadata={'title': 'Natural Language Processing', 'source': 'Wikipedia'}, page_content='Natural language processing (NLP) is an interdisciplinary subfield of computer science and artificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics. Typically data is collected in text corpora, using either rule-based, statistical or neural-based approaches in machine learning and deep learning.\nMajor tasks in natural language processing are speech recognition, text classification, natural-language understanding, and natural-language generation.\n\nHistory\nNatural language processing has its roots in the 1940s. Already in 1940, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of int