In [None]:
import pandas as pd
import numpy as np
import pickle 
import os

In [3]:
# Load the pickled titles from 'data.pkl'.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only run this cell on a pickle that comes from a trusted source.
with open('data.pkl', 'rb') as file:
    data = pickle.load(file)

# Fail fast with a readable message: the following cells call `data.str` and
# rely on the Series name, so anything other than a pandas Series is unusable.
if not isinstance(data, pd.Series):
    raise TypeError(
        f"expected 'data.pkl' to contain a pandas Series, got {type(data).__name__}"
    )

# Show the loaded object (pandas truncates long Series when printing).
print(data)


TypeError: Cannot convert numpy.ndarray to numpy.ndarray

In [3]:
# Lower-case every title via the pandas string accessor, then print the result.
# Note: `data` is rebound here, so the original-case titles are no longer available afterwards.
data = data.str.lower()
print(data)

0        jan jagran times
1        jagran city plus
2         sampurna jagran
3           dainik jagran
4           vishwa jagran
               ...       
21394        kaiwart awaz
21395     sarbaharar awaz
21396      shramiker awaz
21397          sobar awaz
21398        awaz aap tak
Name: Title Name, Length: 10790, dtype: object


In [4]:
# Wrap the titles in a DataFrame, then remove repeated rows and rows with
# missing values. No reset_index is applied, so the original row labels are kept.
data = pd.DataFrame(data).drop_duplicates().dropna()

In [5]:
# Display (rows, columns) of the cleaned frame as the cell output.
data.shape

(10790, 1)

In [6]:
# Rename the 'Title Name' column to 'title'. Reassigning the returned frame
# (instead of inplace=True) keeps the step chainable and avoids mutating an
# object that earlier cells have already displayed.
data = data.rename(columns={'Title Name': 'title'})


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document  # Import Document class

# Splitter producing chunks of at most 500 characters with a 50-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# One Document per title. Iterating the 'title' column directly yields the same
# values as data.iterrows() without building a Series object for every row.
documents = [Document(page_content=title) for title in data['title']]

# Split the documents into chunks.
split_docs = text_splitter.split_documents(documents)

In [8]:
# Extract content from split_docs
# Collect the plain text of every chunk produced by the splitter.
texts = [chunk.page_content for chunk in split_docs]


In [9]:
from langchain_ollama.embeddings import OllamaEmbeddings

# Embedding client for the "llama3.2" model; it is passed to the hybrid
# retriever further down as its `embeddings` argument.
embeddings = OllamaEmbeddings(model="llama3.2")

In [21]:
# Embed one sample title and print the vector length: the Pinecone vector DB
# has to be configured with this same dimension.
res = embeddings.embed_query("The Jagran Times")
print(len(res))

3072


In [11]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Read the Pinecone credential from the environment instead of hard-coding it.
api_key = os.getenv("PINECONE_API_KEY")

# os.getenv returns None when the variable is absent; stop here with a clear
# message rather than failing later inside the Pinecone client.
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to the environment or to a .env file"
    )

In [13]:
!pip install pinecone

Collecting pinecone
  Downloading pinecone-7.0.2-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.6.1-py3-none-any.whl.metadata (27 kB)
Downloading pinecone-7.0.2-py3-none-any.whl (516 kB)
Downloading pinecone_plugin_assistant-1.6.1-py3-none-any.whl (239 kB)
Installing collected packages: pinecone-plugin-assistant, pinecone

   ---------------------------------------- 0/2 [pinecone-plugin-assistant]
   ---------------------------------------- 0/2 [pinecone-plugin-assistant]
   ---------------------------------------- 0/2 [pinecone-plugin-assistant]
   ---------------------------------------- 0/2 [pinecone-plugin-assistant]
   ---------------------------------------- 0/2 [pinecone-plugin-assistant]
   ---------------------------------------- 0/2 [pinecone-plugin-assistant]
   -------------------- ------------------- 1/2 [pinecone]
   -------------------- ------------------- 1/2 [pinecone]
   -

In [14]:
from pinecone import Pinecone
# Get a handle to the Pinecone index named "sliftex" using the API key read
# from the environment; this cell does not create the index.
# NOTE(review): the index dimension presumably has to match the embedding
# length printed earlier (3072) — confirm against the index configuration.
index_name = "sliftex"
pc = Pinecone(api_key=api_key)
index = pc.Index(index_name)

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import nltk
from pinecone_text.sparse import BM25Encoder

# Start from an unfitted encoder. BM25Encoder.default() would first load
# pre-fitted parameters, which the fit() call below replaces with statistics
# from our own titles anyway.
encoder = BM25Encoder()

# The titles as a plain Python list of strings, the input handed to fit().
corpus = data['title'].tolist()

# Learn the BM25 term statistics from the title corpus.
encoder.fit(corpus)

# Persist the fitted encoder parameters (not encoded documents) so a later
# cell can reload them from "document.json".
encoder.dump("document.json")


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
100%|██████████| 10790/10790 [00:00<00:00, 21331.44it/s]


In [16]:
# Rebuild a BM25 encoder from the parameters saved in "document.json".
# Despite its name, `encoded_docs` is the encoder object itself: it is what the
# retriever cell passes as `sparse_encoder`.
encoded_docs = BM25Encoder().load("document.json")

In [17]:
# NOTE(review): the retriever cell passes alpha=0.8 as a literal and never
# reads this variable, so 0.7 is not the value in effect — confirm which one
# is intended.
alpha = 0.7


In [18]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
retriever = PineconeHybridSearchRetriever(index=index, sparse_encoder=encoded_docs, embeddings=embeddings , top_k=30 , alpha = 0.8)

In [19]:
# Display the retriever's repr to check its configuration (top_k, alpha, encoder, index).
retriever

PineconeHybridSearchRetriever(embeddings=OllamaEmbeddings(model='llama3.2', base_url=None, client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x00000212B4A7F8F0>, index=<pinecone.db_data.index.Index object at 0x000002129EDCF4D0>, top_k=30, alpha=0.8)

In [20]:
# Add every title in `corpus` to the Pinecone index through the retriever.
# The recorded run advanced at roughly 80 s per batch and was interrupted,
# so expect this cell to take a long time.
retriever.add_texts(corpus)

  1%|          | 3/338 [04:02<7:32:00, 80.96s/it]


KeyboardInterrupt: 

In [22]:
# Query title used for the retrieval and fuzzy-matching cells that follow.
input_text = "Denik Micheal jagran"

In [24]:
# Run the hybrid search for the query title.
# Note: this rebinds `documents`, which earlier held the Document list built
# from the title column.
documents = retriever.invoke(input_text)

In [25]:
# Display the retrieved Documents; each carries its hybrid-search score in `metadata`.
documents

[Document(metadata={'score': 0.73353374}, page_content='krisak jagran'),
 Document(metadata={'score': 0.711413}, page_content='jan jagran'),
 Document(metadata={'score': 0.692397237}, page_content='shosit jagran'),
 Document(metadata={'score': 0.676488638}, page_content='swadeep jagran'),
 Document(metadata={'score': 0.674240828}, page_content='pal jagran'),
 Document(metadata={'score': 0.671086073}, page_content='maruthar jagran'),
 Document(metadata={'score': 0.669752598}, page_content='satat jagran'),
 Document(metadata={'score': 0.669108272}, page_content='vasundhara jagran'),
 Document(metadata={'score': 0.666702628}, page_content='chaitanya jagran'),
 Document(metadata={'score': 0.663770795}, page_content='brahman jagran'),
 Document(metadata={'score': 0.661911488}, page_content='kaurav jagran'),
 Document(metadata={'score': 0.661866486}, page_content='india samikaran'),
 Document(metadata={'score': 0.660538912}, page_content='kishan jagran'),
 Document(metadata={'score': 0.65691

In [26]:
from rapidfuzz import fuzz

# Lower-case the query once; it does not change between loop iterations.
query_lower = input_text.lower()

# Score every retrieved title against the query with RapidFuzz's ratio
# (printed below as a percentage).
results = []
for doc in documents:
    similarity_score = fuzz.ratio(query_lower, doc.page_content.lower())
    results.append({'page_content': doc.page_content, 'similarity_score': similarity_score})

# Order the candidates from most to least similar.
sorted_results = sorted(results, key=lambda x: x['similarity_score'], reverse=True)

# Print every candidate with its similarity score, best match first.
for result in sorted_results:
    print(f"Title: {result['page_content']}, Similarity: {result['similarity_score']}%")


Title: audichya jagran, Similarity: 68.57142857142857%
Title: kishan jagran, Similarity: 66.66666666666667%
Title: krisak jagran, Similarity: 60.60606060606061%
Title: khatik jagran, Similarity: 60.60606060606061%
Title: pal jagran, Similarity: 60.0%
Title: nandigiram jagran, Similarity: 59.45945945945945%
Title: swadeep jagran, Similarity: 58.82352941176471%
Title: lakshya jagran, Similarity: 58.82352941176471%
Title: manohar jagran, Similarity: 58.82352941176471%
Title: sandhya jagran, Similarity: 58.82352941176471%
Title: maruthar jagran, Similarity: 57.14285714285714%
Title: sewak jagran, Similarity: 56.25%
Title: chaitanya jagran, Similarity: 55.55555555555556%
Title: shrutlekh jagran, Similarity: 55.55555555555556%
Title: kaurav jagran, Similarity: 54.54545454545454%
Title: balmik jagran, Similarity: 54.54545454545454%
Title: vasundhara jagran, Similarity: 54.054054054054056%
Title: jan jagran, Similarity: 53.333333333333336%
Title: lok jagran, Similarity: 53.333333333333336%
Tit