In [59]:
import pandas as pd
import numpy as np
import pickle 
import os

In [60]:
# Step 1: Open the file in read-binary mode
with open('data.pkl', 'rb') as file:
    # Step 2: Load the data from the file
    data = pickle.load(file)

# Now, `data` contains the deserialized Python object
print(data)


0        JAN JAGRAN TIMES
1        JAGRAN CITY PLUS
2         SAMPURNA JAGRAN
3           DAINIK JAGRAN
4           VISHWA JAGRAN
               ...       
21394        KAIWART AWAZ
21395     SARBAHARAR AWAZ
21396      SHRAMIKER AWAZ
21397          SOBAR AWAZ
21398        AWAZ AAP TAK
Name: Title Name, Length: 10790, dtype: object


In [61]:
data = pd.DataFrame(data)
data = data.drop_duplicates().dropna()

In [62]:
data.rename(columns={'Title Name': 'original'}, inplace=True)

data['title'] = data['original'].str.lower().apply(lambda x: ' '.join(sorted(x.split())))


In [63]:

data['title'] = data['title'].str.lower()


In [64]:
data

Unnamed: 0,original,title
0,JAN JAGRAN TIMES,jagran jan times
1,JAGRAN CITY PLUS,city jagran plus
2,SAMPURNA JAGRAN,jagran sampurna
3,DAINIK JAGRAN,dainik jagran
4,VISHWA JAGRAN,jagran vishwa
...,...,...
21394,KAIWART AWAZ,awaz kaiwart
21395,SARBAHARAR AWAZ,awaz sarbaharar
21396,SHRAMIKER AWAZ,awaz shramiker
21397,SOBAR AWAZ,awaz sobar


In [65]:
from dotenv import load_dotenv
load_dotenv()
import os
api_key = os.getenv("PINECONE_API_KEY2")

In [66]:
import nltk
nltk.download('stopwords' )
nltk.download('punkt' )

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Debanjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Debanjan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['title'])


In [68]:
def to_sparse_dict(row):
    return {
        "indices": row.indices.tolist(),
        "values": row.data.tolist()
    }

records = []
for i in range(X.shape[0]):
    sparse_vec = to_sparse_dict(X[i])
    metadata = {
        "title": data.iloc[i]["title"],
        "original": data.iloc[i]["original"]
    }
    record = {
        "id": f"id-{i}",
        "sparse_values": sparse_vec,
        "metadata": metadata
    }
    records.append(record)


In [69]:
records

[{'id': 'id-0',
  'sparse_values': {'indices': [3204, 3268, 7687],
   'values': [0.6341746948410293, 0.7536105134968789, 0.17289780325457538]},
  'metadata': {'title': 'jagran jan times', 'original': 'JAN JAGRAN TIMES'}},
 {'id': 'id-1',
  'sparse_values': {'indices': [3204, 1477, 5587],
   'values': [0.46269645209364696, 0.6010823487385641, 0.6516225926522825]},
  'metadata': {'title': 'city jagran plus', 'original': 'JAGRAN CITY PLUS'}},
 {'id': 'id-2',
  'sparse_values': {'indices': [3204, 6519],
   'values': [0.4892215228632858, 0.8721595619881304]},
  'metadata': {'title': 'jagran sampurna', 'original': 'SAMPURNA JAGRAN'}},
 {'id': 'id-3',
  'sparse_values': {'indices': [3204, 1683],
   'values': [0.5589070513561661, 0.829230310555732]},
  'metadata': {'title': 'dainik jagran', 'original': 'DAINIK JAGRAN'}},
 {'id': 'id-4',
  'sparse_values': {'indices': [3204, 8196],
   'values': [0.5717589523626054, 0.8204216601194876]},
  'metadata': {'title': 'jagran vishwa', 'original': 'VISH

In [70]:
embedding_size = len(vectorizer.vocabulary_)
print("TF-IDF Embedding Size:", embedding_size)


TF-IDF Embedding Size: 8468


In [71]:
from pinecone import Pinecone
pc = Pinecone(api_key=api_key)
index_name = "tfidf"
index = pc.Index(index_name)

In [72]:
def batch_upsert(index, records, batch_size=1000):
    for i in range(0, len(records), batch_size):
        batch = records[i:i + batch_size]
        index.upsert(batch)

# Example usage
batch_upsert(index, records)




In [73]:
import joblib

# Save to a file
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [74]:
query = "Mahila"
sorted_query = ' '.join(sorted(query.lower().split()))

query_vec = vectorizer.transform([sorted_query])

sparse_query = {
    "indices": query_vec.indices.tolist(),
    "values": query_vec.data.tolist()
}

results = index.query(top_k=15, sparse_vector=sparse_query, include_metadata=True)

for match in results["matches"]:
    print(f"Score: {match['score']:.4f}")
    print("Match:", match["metadata"]["original"])


Score: 0.9844
Match: MAHILA TIMES
Score: 0.8363
Match: MAHILA JAGRAN
Score: 0.8201
Match: MAHILA KI AWAZ
Score: 0.7634
Match: MAHILA KI AAWAZ
Score: 0.6959
Match: MAHILA PRAGATI TIMES
Score: 0.6820
Match: MAHILA MORCHA TIMES
Score: 0.6457
Match: MAHILA ABADH TIMES
Score: 0.0000
Match: HOSPET TIMES
Score: 0.0000
Match: ABHI TAK CRIME TIMES
Score: 0.0000
Match: THE JOURNAL OF THE INDIAN ACADEMY OF PHILOSOPHY
Score: 0.0000
Match: AMALNER TIMES
Score: 0.0000
Match: STATE NEWS TIMES
Score: 0.0000
Match: BHADRA TIMES
Score: 0.0000
Match: LOK PRIYA DILLI TIMES
Score: 0.0000
Match: THE MISSILE TIMES
