In [19]:
import pandas as pd
import numpy as np
import pickle 
import os

In [20]:

# Step 1: Open the file in read-binary mode
with open('data.pkl', 'rb') as file:
    # Step 2: Load the data from the file
    data = pickle.load(file)

# Now, `data` contains the deserialized Python object
print(data)


0        JAN JAGRAN TIMES
1        JAGRAN CITY PLUS
2         SAMPURNA JAGRAN
3           DAINIK JAGRAN
4           VISHWA JAGRAN
               ...       
21394        KAIWART AWAZ
21395     SARBAHARAR AWAZ
21396      SHRAMIKER AWAZ
21397          SOBAR AWAZ
21398        AWAZ AAP TAK
Name: Title Name, Length: 10790, dtype: object


In [21]:
data = data.str.lower()
print(data)

0        jan jagran times
1        jagran city plus
2         sampurna jagran
3           dainik jagran
4           vishwa jagran
               ...       
21394        kaiwart awaz
21395     sarbaharar awaz
21396      shramiker awaz
21397          sobar awaz
21398        awaz aap tak
Name: Title Name, Length: 10790, dtype: object


In [22]:
data = pd.DataFrame(data)
data = data.drop_duplicates().dropna()

In [23]:
data.shape

(10790, 1)

In [24]:
data.rename(columns={'Title Name': 'title'}, inplace=True)


In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document  # Import Document class

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Assuming your DataFrame `data` has a column 'title' containing the titles or documents
# Create a list of Document objects
documents = [Document(page_content=row['title']) for index, row in data.iterrows()]

# Split the documents
split_docs = text_splitter.split_documents(documents)

In [26]:
from langchain_ollama.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model ="mxbai-embed-large")

In [27]:
from dotenv import load_dotenv
load_dotenv()
import os
api_key = os.getenv("PINECONE_API_KEY")

In [28]:
from pinecone import Pinecone, ServerlessSpec
index_name = "sliftexsearch"
pc = Pinecone(api_key=api_key)
index = pc.Index(index_name)

In [29]:
import nltk
from pinecone_text.sparse import BM25Encoder

# Download the necessary tokenizer data
nltk.download('punkt')
nltk.data.path.append(r'C:\Users\Debanjan\AppData\Roaming\nltk_data')

# Initialize the BM25 encoder
encoder = BM25Encoder().default()

# Your data and BM25 encoding process
# Ensure the data column is converted to a list of strings
corpus = data['title'].tolist()  # Convert the column to a list

# Fit the encoder
encoder.fit(corpus)

# Save the encoded data
encoder.dump("document.json")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Debanjan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 10790/10790 [00:00<00:00, 29637.96it/s]


In [30]:
encoded_docs = BM25Encoder().load("document.json")

In [53]:
alpha = 0.7


In [55]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
retriever = PineconeHybridSearchRetriever(index=index, sparse_encoder=encoded_docs, embeddings=embeddings , top_k=20 )

In [38]:
retriever

PineconeHybridSearchRetriever(embeddings=OllamaEmbeddings(model='mxbai-embed-large', base_url=None, client_kwargs={}), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x0000024E2F1AB6E0>, index=<pinecone.data.index.Index object at 0x0000024E7E792150>, top_k=20)

In [33]:
retriever.add_texts(
    corpus
)

100%|██████████| 338/338 [07:23<00:00,  1.31s/it]


In [49]:
retriever.invoke("dainic jagran")

[Document(metadata={'score': 0.528316438}, page_content='jagran'),
 Document(metadata={'score': 0.514177859}, page_content='dainik jagran'),
 Document(metadata={'score': 0.486823589}, page_content='prod jagran'),
 Document(metadata={'score': 0.475773036}, page_content='punjabi jagran'),
 Document(metadata={'score': 0.471637636}, page_content='youth jagran'),
 Document(metadata={'score': 0.471027642}, page_content='jat jagran'),
 Document(metadata={'score': 0.468507528}, page_content='brahman jagran'),
 Document(metadata={'score': 0.467536777}, page_content='adhyapak jagran'),
 Document(metadata={'score': 0.467157513}, page_content='ginvani jagran'),
 Document(metadata={'score': 0.462170929}, page_content='patrakar jagran'),
 Document(metadata={'score': 0.461282462}, page_content='balmik jagran'),
 Document(metadata={'score': 0.461194038}, page_content='rastra jagran'),
 Document(metadata={'score': 0.460967928}, page_content='jain jagran'),
 Document(metadata={'score': 0.457330912}, pag