In [1]:
import pandas as pd
import numpy as np
import pickle 
import os

KeyboardInterrupt: 

In [None]:
prefixes = [
    "maharaj", "sardar", "rao", "thakur", "khan", "jadhav", "chaudhary", "zaildar", 
    "bhai", "maulana", "pandit", "singh", "babu", "guru", "rani", "shri", "dharma", 
    "raj", "bhagat"
]


In [None]:
suffixes = [
    "nagar", "sikhar", "mahiti", "darshan", "sathi", "awaaz", "parikrama", "yojana", 
    "rashtriya", "manthan", "vichar", "darpan", "vishwakosh", "patrika", "times", 
    "sutra", "chronicler", "khabar", "bharati", "sadhna", "varta", "parmeshwar","jagran","awaz"
]


In [None]:
import pandas as pd

# Lists of prefixes and suffixes


def strip_prefix_suffix(df, title_column):
    """
    Function to strip prefixes and suffixes from titles in the dataframe.
    Strips only the first and last words if they match the given prefix or suffix lists.

    Parameters:
    df (pd.DataFrame): The dataframe containing the titles.
    title_column (str): The name of the column in the dataframe containing the titles.

    Returns:
    pd.DataFrame: The dataframe with prefixes and suffixes stripped from the titles.
    """
    
    def remove_prefix_suffix(title):
        # Split the title into words
        words = title.split()
        
        # Check and remove prefix (first word)
        if words and words[0] in prefixes:
            words.pop(0)
        
        # Check and remove suffix (last word)
        if words and words[-1] in suffixes:
            words.pop()
        
        # Join the words back into a single string
        return ' '.join(words)
    
    # Apply the function to the title column
    df[title_column] = df[title_column].apply(remove_prefix_suffix)
    
    return df

# Example usage
data = {
    'title': ['Shri Times of India', 'Maharaj Vichar Dhara', 'Khabar Times', 'Raj Darshan News']
}



In [None]:

# Step 1: Open the file in read-binary mode
with open('data.pkl', 'rb') as file:
    # Step 2: Load the data from the file
    data = pickle.load(file)

# Now, `data` contains the deserialized Python object
print(data)


0        JAN JAGRAN TIMES
1        JAGRAN CITY PLUS
2         SAMPURNA JAGRAN
3           DAINIK JAGRAN
4           VISHWA JAGRAN
               ...       
21394        KAIWART AWAZ
21395     SARBAHARAR AWAZ
21396      SHRAMIKER AWAZ
21397          SOBAR AWAZ
21398        AWAZ AAP TAK
Name: Title Name, Length: 10790, dtype: object


In [None]:
data = data.str.lower()
print(data)

0        jan jagran times
1        jagran city plus
2         sampurna jagran
3           dainik jagran
4           vishwa jagran
               ...       
21394        kaiwart awaz
21395     sarbaharar awaz
21396      shramiker awaz
21397          sobar awaz
21398        awaz aap tak
Name: Title Name, Length: 10790, dtype: object


In [None]:
data = pd.DataFrame(data)
data = data.drop_duplicates().dropna()



In [None]:
print(data)

             Title Name
0      jan jagran times
1      jagran city plus
2       sampurna jagran
3         dainik jagran
4         vishwa jagran
...                 ...
21394      kaiwart awaz
21395   sarbaharar awaz
21396    shramiker awaz
21397        sobar awaz
21398      awaz aap tak

[10790 rows x 1 columns]


In [None]:
data.rename(columns={'Title Name': 'title'}, inplace=True)

from metaphone import doublemetaphone
data['metaphone'] = data['title'].apply(lambda x: doublemetaphone(x)[0])
print(data)

                  title  metaphone
0      jan jagran times  JNJKRNTMS
1      jagran city plus  JKRNSTPLS
2       sampurna jagran  SMPRNJKRN
3         dainik jagran    TNKJKRN
4         vishwa jagran     FXJKRN
...                 ...        ...
21394      kaiwart awaz       KRTS
21395   sarbaharar awaz    SRPHRRS
21396    shramiker awaz     XRMKRS
21397        sobar awaz       SPRS
21398      awaz aap tak      ASPTK

[10790 rows x 2 columns]


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document  # Import Document class

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

# Assuming your DataFrame `data` has columns 'title' and 'metaphone'
documents = []
for index, row in data.iterrows():
    # Create a document for the title column with metadata
    title_doc = Document(
        page_content=row['title'], 
        metadata={"type": "title", "original_index": index, "original_word": row['title']}
    )
    documents.append(title_doc)
    
    # Create a document for the metaphone column with metadata
    metaphone_doc = Document(
        page_content=row['metaphone'], 
        metadata={"type": "metaphone", "original_index": index, "original_word": row['metaphone']}
    )
    documents.append(metaphone_doc)

# Split the documents
split_docs = text_splitter.split_documents(documents)




In [None]:
from langchain_ollama.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model ="mxbai-embed-large")

In [None]:
from dotenv import load_dotenv
load_dotenv()
import os
api_key = os.getenv("PINECONE_API_KEY")

In [None]:
from pinecone import Pinecone, ServerlessSpec
index_name = "sliftexphonatics"
pc = Pinecone(api_key=api_key)
index = pc.Index(index_name)

In [None]:
import nltk
from pinecone_text.sparse import BM25Encoder

# Download the necessary tokenizer data
nltk.download('punkt')
nltk.data.path.append(r'C:\Users\Debanjan\AppData\Roaming\nltk_data')

# Initialize the BM25 encoder
encoder = BM25Encoder().default()

# Your data and BM25 encoding process
# Ensure the data column is converted to a list of strings
corpus = data['title'].tolist()  # Convert the column to a list

# Fit the encoder
encoder.fit(corpus)

# Save the encoded data
encoder.dump("phonatics.json")




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Debanjan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 10790/10790 [00:00<00:00, 28732.03it/s]


In [None]:
encoded_docs = BM25Encoder().load("phonatics.json")

In [None]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
retriever = PineconeHybridSearchRetriever(index=index, sparse_encoder=encoded_docs, embeddings=embeddings)

In [None]:
retriever.add_texts(
    corpus
)

100%|██████████| 338/338 [06:35<00:00,  1.17s/it]


In [None]:
res = index.query(
    vector= "shri",
    top_k =10 ,
    include_metadata=True
)
print(res)

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Sat, 07 Dec 2024 04:50:04 GMT', 'Content-Type': 'text/plain', 'Content-Length': '48', 'Connection': 'keep-alive', 'server': 'envoy'})
HTTP response body: vector: invalid value "shri" for type TYPE_FLOAT
