In [None]:
import pandas as pd
import numpy as np
import pickle 
import os

In [None]:

# Deserialize the pickled object stored in data.pkl.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load a data.pkl that comes from a trusted source.
with open('data.pkl', mode='rb') as pkl_handle:
    data = pickle.load(pkl_handle)

# Show the loaded object.
print(data)


0        JAN JAGRAN TIMES
1        JAGRAN CITY PLUS
2         SAMPURNA JAGRAN
3           DAINIK JAGRAN
4           VISHWA JAGRAN
               ...       
21394        KAIWART AWAZ
21395     SARBAHARAR AWAZ
21396      SHRAMIKER AWAZ
21397          SOBAR AWAZ
21398        AWAZ AAP TAK
Name: Title Name, Length: 10790, dtype: object


In [None]:
# Lower-case every value through the pandas string accessor, then print the result.
data = data.str.lower()
print(data)

0        jan jagran times
1        jagran city plus
2         sampurna jagran
3           dainik jagran
4           vishwa jagran
               ...       
21394        kaiwart awaz
21395     sarbaharar awaz
21396      shramiker awaz
21397          sobar awaz
21398        awaz aap tak
Name: Title Name, Length: 10790, dtype: object


In [None]:
# Wrap the titles in a DataFrame, then remove duplicate rows and, after that,
# any rows containing missing values.
data = (
    pd.DataFrame(data)
    .drop_duplicates()
    .dropna()
)

In [None]:
# Display the (rows, columns) shape of the frame.
data.shape

(10790, 1)

In [None]:
# Rename the 'Title Name' column to 'title'. The result is reassigned instead of
# using inplace=True, so the cell is a plain expression with no in-place mutation.
data = data.rename(columns={'Title Name': 'title'})


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document  # Document carries its text in `page_content`

# Splitter configured with chunk_size=500 and chunk_overlap=50.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Build one Document per title. The 'title' column is iterated directly rather
# than through DataFrame.iterrows(), which constructs a Series for every row
# just to read a single value from it.
documents = [Document(page_content=title) for title in data['title']]

# Run the splitter over the documents.
split_docs = text_splitter.split_documents(documents)

In [None]:
# Collect the page_content of every entry in split_docs.
texts = list(chunk.page_content for chunk in split_docs)


In [None]:
from langchain_ollama.embeddings import OllamaEmbeddings

# Embedding client for the "llama3.2" model.
embeddings = OllamaEmbeddings(model="llama3.2")

In [None]:
# Embed a sample query and print the length of the returned vector; the
# Pinecone vector DB has to be configured with this dimension.
res = embeddings.embed_query("The Jagran Times")
embedding_dim = len(res)
print(embedding_dim)

3072


In [None]:
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment.
load_dotenv()

# Read the Pinecone key from the environment rather than hard-coding it.
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    # os.getenv returns None when the variable is missing; stop here with a
    # clear message instead of failing later inside the Pinecone client.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment or in a .env file."
    )

In [None]:
from pinecone import Pinecone, ServerlessSpec

# Set up the Pinecone client and get a handle to the index; the index name in
# this case is "llama".
pc = Pinecone(api_key=api_key)
index_name = "llama"
index = pc.Index(index_name)

In [None]:
import nltk
from pinecone_text.sparse import BM25Encoder

# Start from a fresh, unfitted encoder. BM25Encoder.default() is deliberately
# not used: it loads pretrained parameters, and the fit() call below replaces
# the learned statistics with ones computed from this corpus anyway.
encoder = BM25Encoder()

# Convert the 'title' column to a plain list of strings for fitting.
corpus = data['title'].tolist()

# Fit the encoder on the titles.
encoder.fit(corpus)

# Save the fitted encoder parameters so they can be reloaded later.
encoder.dump("document.json")


100%|██████████| 10790/10790 [00:00<00:00, 27940.79it/s]


In [None]:
# Load the BM25 parameters previously dumped to document.json.
# NOTE(review): despite its name, `encoded_docs` is bound to whatever load()
# returns (presumably the encoder itself), not to encoded documents — confirm.
encoded_docs = BM25Encoder().load("document.json")

In [None]:
# NOTE(review): the retriever cell in this notebook passes alpha=0.8 as a literal
# and does not read this variable — confirm which value is intended.
alpha = 0.7


In [None]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

# Hybrid retriever over the Pinecone index: `embeddings` supplies the dense side,
# the loaded BM25 encoder the sparse side; top_k is 30 and alpha is fixed at 0.8.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=encoded_docs,
    index=index,
    top_k=30,
    alpha=0.8,
)

In [None]:
# Display the retriever's repr to inspect its configuration.
retriever

PineconeHybridSearchRetriever(embeddings=OllamaEmbeddings(model='llama3.2', base_url=None, client_kwargs={}), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x00000167E712A510>, index=<pinecone.data.index.Index object at 0x00000167E7197350>, top_k=30, alpha=0.8)

In [None]:
# Hand the full list of titles to the retriever's add_texts.
retriever.add_texts(corpus)

In [None]:
# Query title passed to the retriever and to the fuzzy-matching step.
input_text = "Denik Micheal jagran"

In [None]:
# Run the hybrid search for the query.
# NOTE(review): this rebinds `documents`, which an earlier cell used for the
# Document objects built from the titles.
documents = retriever.invoke(input_text)

In [None]:
# Display the retrieved documents.
documents

[Document(metadata={'score': 0.733466625}, page_content='krisak jagran'),
 Document(metadata={'score': 0.713007212}, page_content='jan jagran'),
 Document(metadata={'score': 0.692169785}, page_content='shosit jagran'),
 Document(metadata={'score': 0.676369071}, page_content='swadeep jagran'),
 Document(metadata={'score': 0.674885869}, page_content='pal jagran'),
 Document(metadata={'score': 0.672016263}, page_content='maruthar jagran'),
 Document(metadata={'score': 0.669668317}, page_content='chaitanya jagran'),
 Document(metadata={'score': 0.66942966}, page_content='satat jagran'),
 Document(metadata={'score': 0.667033792}, page_content='vasundhara jagran'),
 Document(metadata={'score': 0.662713408}, page_content='brahman jagran'),
 Document(metadata={'score': 0.661280036}, page_content='india samikaran'),
 Document(metadata={'score': 0.659001946}, page_content='kaurav jagran'),
 Document(metadata={'score': 0.658042908}, page_content='kishan jagran'),
 Document(metadata={'score': 0.65

In [None]:
from rapidfuzz import fuzz

# Score every retrieved title against the lower-cased query with fuzz.ratio.
query_lower = input_text.lower()
results = [
    {
        'page_content': doc.page_content,
        'similarity_score': fuzz.ratio(query_lower, doc.page_content.lower()),
    }
    for doc in documents
]

# Order the matches from highest to lowest similarity score.
sorted_results = sorted(
    results,
    key=lambda entry: entry['similarity_score'],
    reverse=True,
)

# Print each title with its similarity score, best match first.
for result in sorted_results:
    print(f"Title: {result['page_content']}, Similarity: {result['similarity_score']}%")


Title: kishan jagran, Similarity: 66.66666666666667%
Title: krisak jagran, Similarity: 60.60606060606061%
Title: khatik jagran, Similarity: 60.60606060606061%
Title: pal jagran, Similarity: 60.0%
Title: nandigiram jagran, Similarity: 59.45945945945945%
Title: swadeep jagran, Similarity: 58.82352941176471%
Title: lakshya jagran, Similarity: 58.82352941176471%
Title: manohar jagran, Similarity: 58.82352941176471%
Title: sandhya jagran, Similarity: 58.82352941176471%
Title: maruthar jagran, Similarity: 57.14285714285714%
Title: sewak jagran, Similarity: 56.25%
Title: chaitanya jagran, Similarity: 55.55555555555556%
Title: shrutlekh jagran, Similarity: 55.55555555555556%
Title: kaurav jagran, Similarity: 54.54545454545454%
Title: balmik jagran, Similarity: 54.54545454545454%
Title: vasundhara jagran, Similarity: 54.054054054054056%
Title: jan jagran, Similarity: 53.333333333333336%
Title: lok jagran, Similarity: 53.333333333333336%
Title: brahman jagran, Similarity: 52.94117647058824%
Titl