In [None]:
# install necessary libraries and packages
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install scikit-learn
%pip install openai
%pip install wikipedia-api
%pip install -qU langchain-openai
%pip install langchain-core
%pip install langchain_community

# Getting the OpenAI API Key

In [None]:
# Read the OpenAI API key from the environment, prompting only when it is missing
import os
import getpass

# Prompting unconditionally would overwrite a key that is already set in the
# environment and would block non-interactive "Run All" executions.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OPEN_AI_API_KEY: ")

from langchain_openai import OpenAIEmbeddings

# Embedding client used by the rest of the notebook to vectorise text
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

# Embed a throwaway string once: the length of the returned vector is the
# dimensionality the flat L2 index has to be created with.
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

# Empty vector store backed by the index above and an in-memory docstore.
# The docstore class is `InMemoryDocstore` (lower-case "s"), matching the import.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

## Utilising Wikipedia-API to get Documents

In [None]:
# USING WIKIPEDIA DATA
import wikipediaapi
import numpy as np
from langchain_core.documents import Document

# Wikipedia API client for English-language pages, created with an explicit user agent
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='FASISS-test/1.0 (vancence@example.com)',
)

# Helper that retrieves the text content of a single Wikipedia article
def fetch_wiki_articles(title):
    """Return the text of the Wikipedia page named ``title``.

    The page is looked up through the module-level ``wiki_wiki`` client.

    Args:
        title: Title of the Wikipedia page to look up.

    Returns:
        The page's ``text`` when ``page.exists()`` is true, otherwise ``None``.
    """
    article = wiki_wiki.page(title)
    return article.text if article.exists() else None

### Adding documents to LangChain vector store

In [None]:
# Step 1: Titles of the Wikipedia articles to load
article_titles = [
    "Python (programming language)",
    "Artificial Intelligence",
    "Machine Learning",
    "Natural Language Processing",
    "Retrieval Augmented Generation",
    "OpenAI",
    "Deep Learning",
]

# Step 2: Fetch each article and wrap its text in a LangChain Document
documents = []

for title in article_titles:
    content = fetch_wiki_articles(title)
    # Titles that yield no content (None or empty text) are skipped silently
    if not content:
        continue
    documents.append(
        Document(
            page_content=content,
            metadata={"title": title, "source": "Wikipedia"},
        )
    )

# print(documents)

# Step 3: Embedding the documents
# Fail early with a clear message instead of an opaque IndexError on `[0]` below.
if not documents:
    raise ValueError("No Wikipedia articles were fetched; cannot build the FAISS index.")

embedded_documents = [embeddings.embed_query(document.page_content) for document in documents]
# Only the vector length is needed below, to size the index.
embedding_dim = len(embedded_documents[0])

# Step 4: Creating the FAISS index (flat L2 index sized to the embedding dimension)
index = faiss.IndexFlatL2(embedding_dim)

# Step 5: Initialising the FAISS vector store
# The constructor keyword is `embedding_function` and the docstore class is
# `InMemoryDocstore`, consistent with the earlier vector store cell.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Step 6: Generating UUIDs for the documents
from uuid import uuid4

# One random UUID string per document, used as its id in the vector store
uuids = [str(uuid4()) for _ in range(len(documents))]

# print(uuids)

# Step 7: Storing the documents in the vector store
# LangChain vector stores expose `add_documents`; there is no `store_documents` method.
vector_store.add_documents(documents=documents, ids=uuids)