In [13]:
import pandas as pd
import numpy as np
import pickle 
import os

In [14]:

# Deserialize the object stored in 'data.pkl' into `data`.
# NOTE(review): unpickling can run arbitrary code - only load files from a trusted source.
with open('data.pkl', mode='rb') as file:
    data = pickle.load(file)

# Echo the loaded object so its contents can be inspected.
print(data)


0        JAN JAGRAN TIMES
1        JAGRAN CITY PLUS
2         SAMPURNA JAGRAN
3           DAINIK JAGRAN
4           VISHWA JAGRAN
               ...       
21394        KAIWART AWAZ
21395     SARBAHARAR AWAZ
21396      SHRAMIKER AWAZ
21397          SOBAR AWAZ
21398        AWAZ AAP TAK
Name: Title Name, Length: 10790, dtype: object


In [15]:
# Lower-case every entry through the pandas `.str` accessor, then print the result.
data = data.str.lower()
print(data)

0        jan jagran times
1        jagran city plus
2         sampurna jagran
3           dainik jagran
4           vishwa jagran
               ...       
21394        kaiwart awaz
21395     sarbaharar awaz
21396      shramiker awaz
21397          sobar awaz
21398        awaz aap tak
Name: Title Name, Length: 10790, dtype: object


In [16]:
# Wrap the loaded object in a DataFrame, then discard repeated rows and rows
# that contain missing values - all in a single chain.
data = pd.DataFrame(data).drop_duplicates().dropna()

In [17]:
from metaphone import doublemetaphone

# Give the title column a short, code-friendly name. A new frame is bound to
# `data` instead of renaming in place, so re-running the cell is harmless.
data = data.rename(columns={'Title Name': 'title'})

# doublemetaphone yields a two-element result per title; the first element goes
# to `metaphoneA` and the second to `metaphoneB`.
# The pairs are collected first and then split column-wise so that an empty
# frame produces two empty columns instead of failing on tuple unpacking.
phonetic_pairs = [doublemetaphone(title) for title in data['title']]
data['metaphoneA'] = [pair[0] for pair in phonetic_pairs]
data['metaphoneB'] = [pair[1] for pair in phonetic_pairs]


In [18]:
# Display the frame, which now holds the title and both metaphone columns.
data

Unnamed: 0,title,metaphoneA,metaphoneB
0,jan jagran times,JNJKRNTMS,ANJKRNTMS
1,jagran city plus,JKRNSTPLS,AKRNSTPLS
2,sampurna jagran,SMPRNJKRN,
3,dainik jagran,TNKJKRN,
4,vishwa jagran,FXJKRN,
...,...,...,...
21394,kaiwart awaz,KRTS,KRTTS
21395,sarbaharar awaz,SRPHRRS,SRPHRRTS
21396,shramiker awaz,XRMKRS,XRMKRTS
21397,sobar awaz,SPRS,SPRTS


In [35]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document  # Import Document class

# Splitter configured with chunk_size=500 and chunk_overlap=50; it is applied
# to `documents` in a later cell.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Build one Document per row:
#   page_content -> "<metaphoneA> <metaphoneB>" (both codes, space-separated)
#   metadata     -> the original title
# Iterating the three columns in lockstep gives the same values as iterrows()
# without constructing a Series for every row.
documents = [
    Document(
        page_content=f"{primary} {secondary}",
        metadata={'title': title},
    )
    for title, primary, secondary in zip(
        data['title'], data['metaphoneA'], data['metaphoneB']
    )
]

# Print a sample of the documents (only the first few, not the whole list)
print(documents[:5])





In [21]:
# Pass `documents` through `text_splitter` and keep the resulting pieces.
split_docs = text_splitter.split_documents(documents)

In [22]:
# Print the split documents.
# NOTE(review): this prints every element of the list - consider slicing if it is large.
print(split_docs)



In [23]:
from langchain_ollama.embeddings import OllamaEmbeddings

# Embedding client bound to the "mxbai-embed-large" model.
embeddings = OllamaEmbeddings(model="mxbai-embed-large")

In [24]:
import os

from dotenv import load_dotenv

# Call load_dotenv() first, then read the Pinecone API key from the process
# environment. `api_key` is None when the variable is not set.
load_dotenv()
api_key = os.environ.get("PINECONE_API_KEY")

In [26]:
from pinecone import Pinecone, ServerlessSpec

# Create the Pinecone client with the API key read earlier ...
pc = Pinecone(api_key=api_key)

# ... then obtain a handle to the index used by this notebook.
index_name = "phonaticsearch"
index = pc.Index(index_name)

In [27]:
import nltk
from pinecone_text.sparse import BM25Encoder

# NOTE(review): nltk is imported but not used in this cell, and no tokenizer
# download is performed here - confirm whether one is still required.

# Initialize the BM25 encoder
# NOTE(review): default() presumably returns an encoder with pre-made parameters
# that the fit() call below then replaces - TODO confirm against pinecone_text.
encoder = BM25Encoder().default()

# Corpus for fitting: the title column as a plain Python list
corpus = data['title'].tolist()  # Convert the column to a list

# Fit the encoder on the titles
encoder.fit(corpus)

# Dump the encoder (not the encoded documents) to phon.json
encoder.dump("phon.json")


100%|██████████| 10790/10790 [00:00<00:00, 18949.65it/s]


In [28]:
# Reload the encoder that was fitted on `corpus` and dumped to "phon.json",
# so the sparse encoder handed to the retriever matches the indexed titles.
# (Previously a different file, "document.json", was loaded, leaving the
# encoder fitted in this notebook unused.)
encoded_docs = BM25Encoder().load("phon.json")

In [29]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

# Hybrid retriever wired to the Pinecone index, the loaded BM25 encoder
# (sparse side) and the Ollama embeddings (dense side).
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=encoded_docs,
    index=index,
)

In [30]:
# Add every title in `corpus` to the retriever (the recorded run took about 8 minutes).
retriever.add_texts(corpus)

100%|██████████| 338/338 [07:58<00:00,  1.42s/it]


In [34]:
# Query the retriever with a sample string and display the returned documents.
retriever.invoke("Chutiya")

[Document(metadata={'score': 0.344147861}, page_content='pudhiya india'),
 Document(metadata={'score': 0.312513143}, page_content='pratigya times'),
 Document(metadata={'score': 0.307463586}, page_content='india yug'),
 Document(metadata={'score': 0.3040573}, page_content='charaniya times')]