In [1]:
import numpy as np 
import pandas as pd

In [2]:
import pandas as pd

# Read the titles from the CSV file, keep only the second column (position 1),
# discard the first data row, and renumber the remaining rows from zero.
df = (
    pd.read_csv('filtered_titles.csv')
    .iloc[1:, [1]]
    .reset_index(drop=True)
)

print(df.head())


              TITLE
0      SHIKSHNETTAR
1   SHRAMIK MIMANSA
2              .GUT
3             A & D
4        A &S INDIA


In [3]:
# Give the single remaining column a lowercase name for the rest of the notebook.
df = df.set_axis(['title'], axis=1)
print(df.head())

              title
0      SHIKSHNETTAR
1   SHRAMIK MIMANSA
2              .GUT
3             A & D
4        A &S INDIA


In [4]:
# Name of the single title column.
title_col = df.columns[0]

# Drop missing titles BEFORE the string cast: astype(str) can turn NaN into the
# literal text 'nan', which a later dropna() is no longer able to detect.
# After that, cast to str so mixed-type cells compare consistently, sort
# ascending, and renumber the rows.
df = (
    df.dropna(subset=[title_col])
    .astype({title_col: str})
    .sort_values(by=title_col, ascending=True)
    .reset_index(drop=True)
)

print(df.head())


              title
0      SHIKSHNETTAR
1   SHRAMIK MIMANSA
2          'M'TIMES
3              .GUT
4             A & D


In [5]:
# Cap the working set at the first 50000 rows of the sorted frame.
df = df.head(50000).reset_index(drop=True)


In [6]:
# Report (rows, columns) after truncation.
row_count, col_count = df.shape
print((row_count, col_count))

(50000, 1)


In [9]:
# Remove rows containing missing values; copy() keeps the result an independent
# frame so later column assignments behave as they did with the in-place call.
# NOTE(review): if the title column was already cast with astype(str) upstream,
# missing values are the text 'nan' by now and are not removed here.
df = df.dropna().copy()

print(df.shape)

(50000, 1)


In [10]:
# Assumes df is already sorted and has a single (title) column.
column_name = df.columns[0]

# Normalise every title to lowercase text.
df[column_name] = df[column_name].astype(str).str.lower()

# Maps each leading letter to the row position at which it first appears.
alphabet_index_map: dict[str, int] = {}

for idx, value in enumerate(df[column_name]):
    # Slice rather than index: an empty title yields '' instead of raising
    # IndexError, and ''.isalpha() is False so the row is simply skipped.
    first_char = value[:1]
    # Titles whose first character is not a letter (whitespace, punctuation,
    # digits) are not recorded.
    if first_char.isalpha() and first_char not in alphabet_index_map:
        alphabet_index_map[first_char] = idx

print(alphabet_index_map)


{'a': 4, 'b': 13856, 'c': 22345, 'd': 28777, 'e': 35583, 'f': 37649, 'g': 39387, 'h': 44621, 'i': 49178}


In [11]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources used by the cleaning helper below.
import nltk
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# English stopwords, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Function to clean text by removing stopwords and punctuation
def clean_text(text, sep=""):
    """Tokenize ``text`` and drop English stopwords and punctuation tokens.

    Args:
        text: Raw string to clean. Missing values (``None``/NaN) yield "".
        sep: String placed between the surviving tokens. Defaults to "" (tokens
            are concatenated with no separator, matching the previous
            behaviour); pass " " to preserve word boundaries.

    Returns:
        The lowercased surviving tokens joined by ``sep``.
    """
    if pd.isnull(text):
        return ""
    punctuation_chars = set(string.punctuation)
    cleaned = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        # Drop tokens made up solely of punctuation characters. A substring
        # test against string.punctuation misses multi-character tokens such
        # as "...", "--" or the `` / '' quote tokens a tokenizer may emit.
        if all(ch in punctuation_chars for ch in token):
            continue
        cleaned.append(lowered)
    return sep.join(cleaned)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Debanjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Debanjan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document  # Import Document class

# Alias the prepared titles frame (same object, not a copy).
data = df

# Splitter configured with chunk_size=500 and chunk_overlap=50.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Wrap every entry of the 'title' column in a Document.
documents = [Document(page_content=title) for title in data['title']]

# Run the documents through the splitter.
split_docs = text_splitter.split_documents(documents)

In [13]:
# Collect the text payload of every chunk in split_docs.
texts = []
for chunk in split_docs:
    texts.append(chunk.page_content)


In [14]:
from langchain_ollama.embeddings import OllamaEmbeddings
# Embedding client for the "llama3.2" model; handed to the hybrid retriever later on.
embeddings = OllamaEmbeddings(model="llama3.2")

In [15]:
# Embed a sample title and report the vector length, so the Pinecone vector DB
# can be configured with the matching dimension.
res = embeddings.embed_query("The Jagran Times")
embedding_dim = len(res)
print(embedding_dim)

3072


In [16]:
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv, then read the Pinecone key.
# api_key is None when PINECONE_API_KEY is not set.
load_dotenv()
api_key = os.environ.get("PINECONE_API_KEY")

In [17]:
from pinecone import Pinecone
# Create the Pinecone client and get a handle to the index named "titles".
pc = Pinecone(api_key=api_key)
index_name = "titles"
index = pc.Index(index_name)

In [18]:
# Ensure the NLTK 'stopwords' and 'punkt' resources are available.
# NOTE(review): the same two downloads already run in an earlier cell, so this
# cell looks redundant — confirm before removing.
import nltk
nltk.download('stopwords' )
nltk.download('punkt' )

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Debanjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Debanjan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
# Print the titles frame; pandas abbreviates the middle rows of a long frame.
print(data)

                         title
0                 shikshnettar
1              shramik mimansa
2                     'm'times
3                         .gut
4                        a & d
...                        ...
49995  india cultures quarters
49996             india darpan
49997             india darpan
49998      india day and night
49999            india debates

[50000 rows x 1 columns]


In [24]:
from pinecone_text.sparse import BM25Encoder


import nltk

# Make sure the tokenizer data and the stopword list are present.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Titles as a plain Python list, one entry per row.
corpus = data['title'].tolist()

# Obtain an encoder via default() and fit it on the title corpus.
encoder = BM25Encoder().default()
encoder.fit(corpus)

# Write the fitted encoder (not encoded documents) out to document.json.
encoder.dump("document.json")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Debanjan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Debanjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  0%|          | 0/50000 [00:00<?, ?it/s]

In [25]:
# Load BM25 encoder state from document.json.
# NOTE(review): despite its name, `encoded_docs` is later passed to the
# retriever as `sparse_encoder`; it is presumably an encoder object rather than
# a collection of encoded documents — confirm what `load` returns.
encoded_docs = BM25Encoder().load("document.json")

In [27]:
# Encode every title in `corpus` with the BM25 encoder.
# NOTE(review): `sparse_vectors` is not referenced again in the visible cells —
# confirm it is still needed.
sparse_vectors = encoder.encode_documents(corpus)


In [28]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
# Hybrid retriever over the Pinecone index, using the encoder loaded from disk
# for the sparse side and the Ollama embeddings for the dense side.
retriever = PineconeHybridSearchRetriever(
    index=index,
    sparse_encoder=encoded_docs,
    embeddings=embeddings,
    top_k=30,
    alpha=0.8,
)

In [35]:

# Normalise the titles: cast to str, trim surrounding whitespace, lowercase,
# and materialise them as a list.
corpus = (
    data['title']
    .astype(str)
    .str.strip()
    .str.lower()
    .tolist()
)

# Fresh encoder obtained via default(); it is fitted further down in this cell.
encoder = BM25Encoder().default()

# Filter out texts with no valid BM25 sparse tokens
def is_valid_bm25(text):
    """Return True when ``text`` has a letter and encodes to at least one sparse index.

    Uses the module-level ``encoder``. Texts that are empty or contain no
    alphabetic character are rejected without calling the encoder.
    """
    has_letter = bool(text) and any(ch.isalpha() for ch in text)
    if not has_letter:
        return False
    sparse_query = encoder.encode_queries([text])[0]
    return len(sparse_query['indices']) != 0

# Keep only the titles that pass the BM25 validity check.
filtered_corpus = list(filter(is_valid_bm25, corpus))

# Fit the encoder on the surviving titles and write it to document.json.
encoder.fit(filtered_corpus)
encoder.dump("document.json")

# Rebuild the hybrid retriever around the freshly fitted encoder.
retriever = PineconeHybridSearchRetriever(
    index=index,
    sparse_encoder=encoder,
    embeddings=embeddings,
    top_k=30,
    alpha=0.8,
)

# Add the filtered titles through the retriever.
retriever.add_texts(texts=filtered_corpus)


  0%|          | 0/49989 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]