In [34]:
import wikipedia
import os, getpass
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone
from urllib.parse import urlparse, unquote
from pinecone import Pinecone, ServerlessSpec
from langchain.text_splitter import RecursiveCharacterTextSplitter
from collections import namedtuple
from langchain_pinecone import PineconeVectorStore

In [3]:
def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

# Credentials read from os.environ further down; prompt for any that are
# missing, in the same order as before (OpenAI first, then Pinecone).
for _required_key in ("OPENAI_API_KEY", "PINECONE_API_KEY"):
    _set_env(_required_key)

In [5]:
# Step 1: Fetch Wikipedia Page Content
def _title_from_identifier(page_identifier):
    """Return the article title named by a Wikipedia URL or a plain title string."""
    identifier = page_identifier.strip()

    # Only a real http(s) URL is parsed as one; a plain title that merely
    # begins with "http" (e.g. "http/2") must be kept intact.
    if identifier.lower().startswith(("http://", "https://")):
        path = urlparse(identifier).path
        _, has_wiki_prefix, after_wiki = path.partition("/wiki/")
        if has_wiki_prefix:
            # Keep everything after /wiki/ so titles containing "/" survive.
            raw_title = after_wiki.rstrip("/")
        else:
            # Unknown URL layout: fall back to the last non-empty path segment.
            raw_title = path.rstrip("/").rsplit("/", 1)[-1]
        return unquote(raw_title)

    # Plain title: drop double quotes, then any whitespace they were hiding.
    return identifier.replace("\"", "").strip()


def fetch_wikipedia_page(page_identifier):
    """Fetch the full text of an English Wikipedia article.

    Args:
        page_identifier: Either an article title or an http(s) URL pointing
            at the article.

    Returns:
        The ``content`` attribute of the page returned by ``wikipedia.page``,
        or ``None`` when no title could be derived or the lookup failed. In
        the failure case a message describing the problem is printed.
    """
    try:
        # Set custom user-agent
        wikipedia.set_lang("en")  # Ensure we are using the English language
        wikipedia.set_user_agent("somevaluehere/1.0 (tes1223434@example.com)")  # Custom user agent

        title = _title_from_identifier(page_identifier)
        if not title:
            # e.g. a URL with nothing after /wiki/, or an identifier of only quotes/spaces
            print(f"Could not determine a page title from '{page_identifier}'.")
            return None

        # Attempt to fetch the page
        # NOTE(review): wikipedia.page is called with its default arguments;
        # confirm whether its title auto-suggestion is wanted for exact titles.
        page = wikipedia.page(title)
        return page.content  # Return the full text of the page
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error for identifier '{page_identifier}'. Options: {e.options}")
    except wikipedia.exceptions.PageError:
        print(f"Page '{page_identifier}' not found. Please check the title or URL.")
    except Exception as e:
        # Best-effort by design: report the problem and fall through to None.
        print(f"An unexpected error occurred: {e}")
    return None


In [32]:
# Initialize Pinecone and Store Vectors
def store_embeddings_in_pinecone(page_title, documents, index_name,
                                 dimension=1536, cloud="aws", region="us-east-1"):
    """Embed ``documents`` with OpenAI and store them in a Pinecone index.

    The index is created as a serverless index first when ``index_name`` is
    not among the names returned by ``pc.list_indexes()``.

    Args:
        page_title: Label used only in the final confirmation message.
        documents: Documents handed to ``PineconeVectorStore.from_documents``.
        index_name: Name of the Pinecone index to create and/or write to.
        dimension: Vector dimension used when the index has to be created.
            NOTE(review): presumably this must match the output size of the
            embedding model behind ``OpenAIEmbeddings`` — confirm before
            changing either.
        cloud: Cloud provider passed to ``ServerlessSpec`` on creation.
        region: Region passed to ``ServerlessSpec`` on creation.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        KeyError: if ``PINECONE_API_KEY`` or ``OPENAI_API_KEY`` is not set in
            the environment.
    """
    # Initialize Pinecone
    pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

    # Create a Pinecone index if it doesn't exist
    if index_name not in pc.list_indexes().names():
        # Report the index actually being created, not a fixed name.
        print(f"{index_name} Index does not exist, creating...")
        # Define the spec using ServerlessSpec
        spec = ServerlessSpec(cloud=cloud, region=region)
        pc.create_index(name=index_name, spec=spec, dimension=dimension)

    # Embedding client used by the vector store to embed each document
    embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])

    # Upsert the vectors into Pinecone
    PineconeVectorStore.from_documents(documents=documents, embedding=embeddings, index_name=index_name)
    print(f"Successfully stored the {page_title} page in Pinecone.")


In [None]:
# Example Usage
page_title = "Amazon_(company)"  # Replace with the Wikipedia page you want
page_text = fetch_wikipedia_page(page_title)

# fetch_wikipedia_page only prints a message and yields None when the lookup
# fails, so stop here rather than passing None on to the splitter and the store.
if not page_text:
    raise ValueError(f"No content retrieved for '{page_title}'; nothing to index.")

# Define the named tuple
# NOTE(review): this stands in for a LangChain document; presumably
# split_documents only reads .page_content and .metadata — confirm.
Document = namedtuple("Document", ["page_content", "metadata"])

# Create a list of named tuples (documents)
documents = [Document(page_content=page_text, metadata={"title": page_title})]

# Split the page into chunks before embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
document_chunks = text_splitter.split_documents(documents)

store_embeddings_in_pinecone(page_title, document_chunks, index_name = "wiki-index")

Successfully stored the Amazon_(company) page in Pinecone.
