In [80]:
import os
import streamlit as st
import pickle
import time
import langchain
import haystack
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import requests
from bs4 import BeautifulSoup
import numpy as np
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
import faiss
from haystack.retriever.dense import DensePassageRetriever
from haystack.database.faiss import FAISSDocumentStore


ModuleNotFoundError: No module named 'haystack.retriever'

In [68]:
# Load the OpenAI API key.
# A key that is already exported in the environment is kept untouched; the
# placeholder is only a fallback, so running this cell never overwrites a real
# key and the real key never has to be written into the notebook.
os.environ.setdefault('OPENAI_API_KEY', 'your openapi key here')
if os.environ['OPENAI_API_KEY'] == 'your openapi key here':
    print("WARNING: OPENAI_API_KEY is still the placeholder; export a real key before calling the LLM.")

In [69]:
# Completion LLM used by the question-answering chain further down:
# sampling temperature 0.9, at most 500 generated tokens per call.
llm = OpenAI(
    max_tokens=500,
    temperature=0.9,
)

### (1) Load data

In [70]:


def fetch_data_from_urls(urls, timeout=10):
    """Download each URL and return the plain text of every page that loaded.

    Args:
        urls: Iterable of URL strings to fetch.
        timeout: Seconds to wait on each request before giving up. Without a
            timeout a single unresponsive server would block the cell
            indefinitely.

    Returns:
        A list with one text string per successfully fetched page, in input
        order. URLs that fail (non-200 status or any exception) are reported
        on stdout and skipped, so the result can be shorter than ``urls``.
    """
    data = []
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                print(f"Error fetching {url}, status code: {response.status_code}")
                continue
            # Parse the HTML and keep only its text content.
            soup = BeautifulSoup(response.content, 'html.parser')
            data.append(soup.get_text())
        except Exception as e:
            # Deliberately best-effort: one bad URL must not abort the others.
            print(f"Error fetching or processing {url}, exception: {str(e)}")
    return data

# Articles to index: two moneycontrol.com news pages.
urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]

# Download the pages. Failed URLs are skipped by fetch_data_from_urls, so the
# printed count can be lower than len(urls).
data = fetch_data_from_urls(urls)
print(len(data))


2


### (2) Split data to create chunks

In [71]:
# Define the RecursiveCharacterTextSplitter class
class RecursiveCharacterTextSplitter:
    """Fixed-size sliding-window splitter over the characters of a string.

    Note: this local class shadows the ``RecursiveCharacterTextSplitter``
    imported from ``langchain.text_splitter`` at the top of the notebook. It
    does no recursive or separator-aware splitting; it only slices by position.

    Args:
        chunk_size: Maximum number of characters per chunk (must be > 0).
        chunk_overlap: Number of characters shared by consecutive chunks
            (must satisfy ``0 <= chunk_overlap < chunk_size``).

    Raises:
        ValueError: If the sizes would make the window stand still, move
            backwards, or skip over text.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        # overlap == size gives a zero step (range() error at split time),
        # overlap > size gives a negative step (silently no chunks), and a
        # negative overlap makes the step larger than the window (lost text).
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} "
                f"with chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text):
        """Return the list of chunks of ``text``; an empty text yields ``[]``."""
        step = self.chunk_size - self.chunk_overlap
        return [text[start:start + self.chunk_size] for start in range(0, len(text), step)]

# Splitter configuration: 1000-character windows, with consecutive windows
# sharing 200 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Chunk every fetched page and gather all chunks in one flat list
# (page order first, then chunk order within each page).
docs = [chunk for text in data for chunk in text_splitter.split_text(text)]


In [72]:
# Total number of chunks produced across all fetched pages.
len(docs)

31

In [73]:
# Peek at the first chunk to sanity-check the extracted text.
docs[0]

" Wall Street rises as Tesla soars on AI optimism           \n\n  \n\n      \n\n   \n\n  EnglishHindiGujaratiSpecialsSearch Quotes, News, Mutual Fund NAVsMoneycontrol Trending StockInfosys\xa0INE009A01021, INFY, 500209State Bank of India\xa0INE062A01020, SBIN, 500112Yes Bank\xa0INE528G01027, YESBANK, 532648Bank Nifty\xa0Nifty 500 \xa0QuotesMutual FundsCommoditiesFutures & OptionsCurrencyNewsCryptocurrencyForumNoticesVideosGlossaryAll Hello, LoginHello, LoginLog-inor Sign-UpMy AccountMy Profile My PortfolioMy WatchlistCredit Score₹100 Cash RewardMy FeedMy MessagesMy AlertsMy Profile My PROMy PortfolioMy WatchlistCredit Score₹100 Cash RewardMy FeedMy MessagesMy AlertsLogoutChat with UsDownload AppFollow us on:PremiumMy Feed->->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728Go PRO @₹99  PROAdvertisementRemove AdBudget 2        24The Election BudgetINDIA 2024HomepagePlay The FMTax CalculatorBudget Market TrendsTop Gainers & Top LosersBudget & Market Impact  Top 

### (3) Create embeddings for these chunks and save them to a FAISS index

In [74]:

class OpenAIEmbeddings:
    """Text embeddings from the Hugging Face ``openai-gpt`` checkpoint.

    Despite the name, this class does not call the OpenAI embeddings API; it
    loads ``OpenAIGPTTokenizer`` / ``OpenAIGPTModel`` from ``transformers``. It
    also shadows the ``OpenAIEmbeddings`` imported from ``langchain.embeddings``
    at the top of the notebook.
    """

    def __init__(self):
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
        self.model = OpenAIGPTModel.from_pretrained("openai-gpt")
        # Add a padding token so that padding=True in embed() can pad batches.
        num_added = self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        if num_added:
            # The new token id lies past the end of the pretrained embedding
            # table; grow the table so padded batches do not index out of range.
            self.model.resize_token_embeddings(len(self.tokenizer))

    def embed(self, text):
        """Return mean-pooled hidden states for ``text``.

        Args:
            text: A single string or a list of strings.

        Returns:
            A NumPy array with one row per input text and one column per
            hidden dimension. Padding positions are excluded from the mean.
        """
        import torch  # local import: torch is not imported at the top of the notebook

        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.model(**inputs)
        hidden = outputs.last_hidden_state
        # Weight every token by its attention mask so padded positions
        # contribute nothing to the average.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = (hidden * mask).sum(dim=1) / token_counts
        return embeddings.numpy()

# Define the RecursiveCharacterTextSplitter class
class RecursiveCharacterTextSplitter:
    """Fixed-size sliding-window splitter over the characters of a string.

    NOTE(review): a class with this name and the same logic is already defined
    in the earlier "Split data" cell; this re-declaration replaces it for all
    later cells. Keep the two in sync or remove one of them.

    Args:
        chunk_size: Maximum number of characters per chunk (must be > 0).
        chunk_overlap: Number of characters shared by consecutive chunks
            (must satisfy ``0 <= chunk_overlap < chunk_size``).

    Raises:
        ValueError: If the sizes would make the window stand still, move
            backwards, or skip over text.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        # overlap == size gives a zero step (range() error at split time),
        # overlap > size gives a negative step (silently no chunks), and a
        # negative overlap makes the step larger than the window (lost text).
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} "
                f"with chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text):
        """Return the list of chunks of ``text``; an empty text yields ``[]``."""
        step = self.chunk_size - self.chunk_overlap
        return [text[start:start + self.chunk_size] for start in range(0, len(text), step)]

# Reuse `docs` (the flat list of text chunks built in the "Split data" cell)
# instead of downloading and splitting the same pages a second time.
if not docs:
    # fetch_data_from_urls skips failed URLs, so `docs` can legitimately be
    # empty; fail with a clear message instead of an opaque np.concatenate error.
    raise ValueError("No text chunks to embed; check that the URLs were fetched successfully.")

# Initialize the local embedding model
embeddings = OpenAIEmbeddings()

# Embed each chunk (one call per chunk, one row per call)
chunk_embeddings = [embeddings.embed(chunk) for chunk in docs]

# Stack the rows into one matrix; FAISS requires C-contiguous float32 input
embedding_matrix = np.ascontiguousarray(
    np.concatenate(chunk_embeddings, axis=0), dtype=np.float32
)

# IndexFlatIP ranks by raw inner product, which favours long vectors.
# L2-normalise a copy so the inner product equals cosine similarity;
# `embedding_matrix` itself is left unnormalised. Query vectors must be
# normalised the same way before calling vector_index.search().
index_vectors = embedding_matrix.copy()
faiss.normalize_L2(index_vectors)

# Create FAISS vector index
vector_index = faiss.IndexFlatIP(index_vectors.shape[1])
vector_index.add(index_vectors)




ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [75]:
# Persist the FAISS index to a local pickle file so later cells can reload it.
file_path = "vector_index.pkl"
with open(file_path, "wb") as index_file:
    pickle.dump(vector_index, index_file)

In [76]:
# Reload the pickled index when the file is present. If it is missing,
# `vectorIndex` is simply left undefined.
# NOTE(review): pickle.load can execute arbitrary code; only load a file that
# this notebook wrote itself.
if os.path.exists(file_path):
    with open(file_path, "rb") as index_file:
        vectorIndex = pickle.load(index_file)

### (4) Retrieve the chunks most similar to a given question and call the LLM to generate the final answer

In [None]:


# Build a LangChain FAISS vector store over the chunks and plug its retriever
# into the QA-with-sources chain.
#
# The previous haystack-based version of this cell could not run: the
# `haystack.retriever` import fails in the import cell (ModuleNotFoundError),
# and RetrievalQAWithSourcesChain needs a LangChain retriever anyway.
#
# The notebook's local `OpenAIEmbeddings` class shadows the LangChain one, so
# the LangChain embeddings are imported here under an explicit alias.
from langchain.embeddings import OpenAIEmbeddings as LangChainOpenAIEmbeddings

# `data` only holds pages that were fetched successfully; pairing it with
# `urls` is valid only when nothing was skipped.
if len(data) != len(urls):
    raise ValueError(
        "Some URLs could not be fetched, so chunks cannot be matched to their source URLs."
    )

# Re-split per page so every chunk carries the URL it came from; the chain
# reads this from the `source` metadata field when it reports sources.
chunk_texts = []
chunk_metadatas = []
for url, page_text in zip(urls, data):
    for chunk in text_splitter.split_text(page_text):
        chunk_texts.append(chunk)
        chunk_metadatas.append({"source": url})

# Embed the chunks and index them in a FAISS-backed vector store
document_store = FAISS.from_texts(
    chunk_texts, LangChainOpenAIEmbeddings(), metadatas=chunk_metadatas
)

# Retriever returning the chunks most similar to a question
retriever = document_store.as_retriever()

# Initialize RetrievalQAWithSourcesChain
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)


In [None]:
# Question to ask. Swap in the commented alternative to try another one.
# query = "what are the main features of punch iCNG?"
query = "what is the price of Tiago iCNG?"

# Trace every chain / LLM step; this is what produces the long log output.
langchain.debug = True

chain({"question": query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is the price of Tiago iCNG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "The company also said it has also introduced the twin-cylinder technology on its Tiago and Tigor models.\n\nThe Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\n\nTata Motors Passenger Vehicles Ltd Head-Marketing, Vinay Pant said these introductions put together will make the company's CNG line up \"appealing, holistic, and stronger than ever\".\n\nPTI\n\nTags:\n\n#Business\n\n#Companies\n\nfi

[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain > 5:llm:OpenAI] [1.01s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh.",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        }
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "total_tokens": 1343,
      "prompt_tokens": 1269,
      "completion_tokens": 74
    },
    "model_name": "text-davinci-003"
  },
  "run": null
}
[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain > 6:llm:OpenAI] [1.01s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " Tata Motors on Friday launched the CNG variant of its micro SUV Punch priced between Rs 7.1 lakh and Rs 9.68 lakh (ex-showroom, Delhi).",
        "generation_info

[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 9:chain:LLMChain > 10:llm:OpenAI] [2.88s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh.\nSOURCES: https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.htmlhttps://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        }
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "total_tokens": 2093,
      "prompt_tokens": 1976,
      "completion_tokens": 117
    },
    "model_name": "text-davinci-003"
  },
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 9:chain:LLMChain] [2.88s] Exiting Chain run wi

{'answer': ' The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh.\n',
 'sources': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.htmlhttps://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html'}