In [3]:
import pandas as pd
import numpy as np

In [None]:
# Read the forts CSV straight into a DataFrame and preview three random rows.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that directory exists; consider a configurable data directory.
df = pd.read_csv('/home/pamya/Python/ML_Projects/maharashtra-forts/data/maharashtra-forts.csv')
df.sample(3)

In [None]:
# Print the frame's column summary (dtypes and non-null counts).
df.info()

In [44]:
# Put the project root on sys.path so the `src` package imports below resolve.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
import sys
sys.path.append("/home/pamya/Python/ML_Projects/maharashtra-forts")
from src.core.data_loader import load_forts
from src.api.routers.forts import list_forts, get_fort

In [45]:
# Test data loading through the project's own loader.
# NOTE(review): this rebinds `df`, replacing the frame read with pd.read_csv
# in the earlier cell; later cells use whichever assignment ran last.
df = load_forts()
df.head()


Unnamed: 0,fort_id,name,alternate_names,district,taluka,latitude,longitude,base_village,type,elevation_m,...,built_by,year_of_construction,key_events,trek_difficulty,trek_time_hours,best_season,water_availability,accommodation,asi_protected,notes
0,1,Aad Fort,,Ratnagiri,Dapoli,17.7125,73.2391,Aad,Giri Durg (Hill Fort),250,...,Unknown,Unknown,Minor watchtower fort,Easy,1.0,Winter,,,False,A small lesser-known fort covered in dense ve...
1,2,Achala Fort,,Nashik,Satana,20.4578,74.0536,Pimpalgaon,Giri Durg (Hill Fort),1372,...,Shivaji Maharaj,c. 17th Century,Part of Satmala hill range forts; Captured by ...,Medium,2.0,Winter,Seasonal,,False,"Twin fort to Ahivantgad, offers great views of..."
2,3,Achalpur Fort,Ellichpur Fort,Amravati,Achalpur,21.2612,77.5143,Achalpur,Bhuikot (Land Fort),358,...,Ahmad Shah Bahmani,c. 1425,Capital of Berar Sultanate,Easy,0.5,Winter,,,True,"Located inside the city, known for its massive..."
3,4,Adas Fort,,Satara,Khatav,17.6534,74.6543,Adas,Giri Durg (Hill Fort),937,...,Unknown,c. 17th Century,Minor watchtower fort,Easy,1.0,Post-Monsoon,,,False,A small fort primarily used as a watch post.
4,5,Ahiwantgad,,Nashik,Satana,20.4533,74.0494,Pimpalgaon,Giri Durg (Hill Fort),1385,...,Shivaji Maharaj,c. 17th Century,Treaty of Purandar; Captured by British,Medium,2.0,Winter,Year-round in cisterns,Caves,False,"Twin fort to Achala fort, features large caves..."


In [None]:
# Call the router function directly as plain Python, asking for 5 items with
# q and district left as None.
result = list_forts(limit=5, q=None, district=None)
result

In [None]:
# Same call with q="shivaji" and a limit of 10.
# NOTE(review): presumably q is a free-text search term — confirm in list_forts.
result = list_forts(limit=10, q="shivaji", district=None)
result

In [None]:
# Import the clustering router functions and call get_clusters() directly
# with no arguments; the return value is the cell's output.
from src.api.routers.clustering import get_clusters, predict_cluster

get_clusters()

In [None]:
# Ask for the cluster of a single coordinate (lat=18.52, lon=73.85).
predict_cluster(lat=18.52, lon=73.85)

In [5]:
from sentence_transformers import SentenceTransformer
from typing import List, Tuple

# Shared sentence-embedding model used by the ad-hoc similarity cells below.
# NOTE(review): List and Tuple are not used in any cell visible here.
encoder = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
# Embed the free-text `notes` column; missing notes become '' so every row
# still yields one string to encode.
corpus = df['notes'].fillna('').astype(str).tolist()
emb = encoder.encode(corpus, show_progress_bar=True)
# Stack the encoder output into a single NumPy array; the similarity cell
# below treats it as one row per note (norm over axis=1).
embeddings = np.vstack(emb)

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [21]:
# Encode a single query and score it against every note embedding.
query = "Yashwantgad fort"
v = encoder.encode([query])[0]
# Cosine similarity: dot product divided by the product of the norms; the
# 1e-12 term keeps the denominator non-zero for zero-norm vectors.
sims = (embeddings @ v) / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(v) + 1e-12)
top_k = 3
# argsort is ascending, so take the last top_k indices and reverse them to
# get the best match first.
idx = np.argsort(sims)[-top_k:][::-1]

In [22]:
# Show the note text for the top-ranked indices computed in the previous cell.
[corpus[i] for i in idx]

['A different fort from Yashwantgad/Redi Fort in Sindhudurg district.',
 'Sister fort to Bhagwantgad.',
 'A twin fort to Sudhagad.']

In [15]:
# Inspect the raw `notes` entry at index label 0.
df.notes[0]

'A small  lesser-known fort covered in dense vegetation.'

In [25]:
def build_corpus(df):
    """Build one labelled text document per DataFrame row.

    Each document is a sequence of ``"Label: value\\n"`` lines in a fixed
    order. Missing cell values (None / NaN / NaT / pd.NA) are rendered as an
    empty string, the same way a missing optional column is, so the literal
    text "nan" never ends up in the documents that get embedded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``name``, ``district`` and ``type``. The
        remaining columns listed below are optional.

    Returns
    -------
    list[str]
        One document per row, in row order. Empty list for an empty frame.

    Raises
    ------
    KeyError
        If one of the required columns is absent and the frame has rows.
    """
    # (label shown in the document, source column, column is required)
    fields = (
        ("Name", "name", True),
        ("District", "district", True),
        ("Type", "type", True),
        ("Built By", "built_by", False),
        ("Era", "era", False),
        ("Key Events", "key_events", False),
        ("Historical Notes", "notes", False),
        ("Water Availability", "water_availability", False),
        ("Trek Difficulty", "trek_difficulty", False),
        ("Description", "description", False),
    )

    def _as_text(value):
        # Only scalars are checked: pd.isna on a list-like returns an array,
        # which has no single truth value.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value)

    corpus = []
    for _, row in df.iterrows():
        lines = []
        for label, column, required in fields:
            # Required columns keep the original KeyError on absence;
            # optional ones fall back to an empty value.
            value = row[column] if required else row.get(column, "")
            lines.append(f"{label}: {_as_text(value)}\n")
        corpus.append("".join(lines))
    return corpus
# Build one labelled document per row and embed them all in one call.
# NOTE(review): this rebinds `embeddings`, which an earlier cell built with
# np.vstack; with convert_to_tensor=True the result is presumably a torch
# tensor, so re-running the NumPy similarity cell afterwards would operate on
# a different object type — confirm before relying on execution order.
docs = build_corpus(df)
embeddings = encoder.encode(docs, convert_to_tensor=True)

In [37]:
from sentence_transformers import SentenceTransformer, util
import torch


class RAGEngine:
    """Semantic search over a DataFrame using sentence embeddings.

    Usage: ``RAGEngine().load_data(df).build_index().query("...", k=3)``.
    ``load_data`` and ``build_index`` return the engine so calls can be
    chained; ``query`` returns the original rows as dictionaries.
    """

    # Columns fused into one text document per row. A column that is absent
    # from the DataFrame contributes an empty field.
    TEXT_COLUMNS = (
        "name", "district", "type", "built_by", "era",
        "key_events", "notes", "water_availability",
        "trek_difficulty", "description",
    )

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Create an empty engine and load the sentence-embedding model.

        Parameters
        ----------
        model_name : str
            Name passed to ``SentenceTransformer``; defaults to the model the
            engine has always used.
        """
        self.df = None
        self.corpus = []
        self.embeddings = None
        self.model = SentenceTransformer(model_name)

    @staticmethod
    def _cell_text(value):
        """Render one cell as text, mapping missing values to ''."""
        # Only scalars are checked: pd.isna on a list-like returns an array,
        # which has no single truth value.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value)

    # -------------------------------------------------------
    # 1. LOAD DATA
    # -------------------------------------------------------
    def load_data(self, df):
        """
        Loads DataFrame and builds concatenated corpus for embeddings.
        Output of RAG = original df rows (no processing).

        Missing cell values are treated like absent columns (empty field)
        so the literal text "nan" is not embedded. Any previously built index
        is discarded because it no longer corresponds to the new rows.

        Returns the engine itself.
        """
        self.df = df
        self.corpus = [
            " | ".join(
                self._cell_text(row.get(col, "")) for col in self.TEXT_COLUMNS
            )
            for _, row in df.iterrows()
        ]
        # Force build_index() to be called again: old embeddings would map
        # similarity scores onto the wrong rows of the new frame.
        self.embeddings = None

        print(f"RAGEngine: Loaded corpus with {len(self.corpus)} items.")
        return self

    # -------------------------------------------------------
    # 2. BUILD INDEX
    # -------------------------------------------------------
    def build_index(self):
        """Embed the loaded corpus and store the result on the engine.

        Raises ValueError if no corpus has been loaded. Returns the engine
        itself.
        """
        if not self.corpus:
            raise ValueError("Data not loaded. Call load_data() first.")

        print("RAGEngine: Generating embeddings...")
        self.embeddings = self.model.encode(self.corpus, convert_to_tensor=True)
        print("RAGEngine: Index built.")
        return self

    # -------------------------------------------------------
    # 3. QUERY (NO POST PROCESSING)
    # -------------------------------------------------------
    def query(self, user_query, k=5):
        """
        RAG search:
        - Uses semantic similarity
        - Returns ORIGINAL dataframe rows (as dictionaries)
        - NO transformations, NO natural language formatting

        ``k`` is clamped to the number of indexed rows; a non-positive ``k``
        yields an empty list. Raises ValueError if the index is not built.
        """

        if self.embeddings is None:
            raise ValueError("Index not built. Call build_index().")

        # torch.topk raises when k exceeds the number of scored items, so
        # never ask for more results than there are indexed rows.
        k = min(int(k), len(self.embeddings))
        if k <= 0:
            return []

        # Encode query
        q_emb = self.model.encode(user_query, convert_to_tensor=True)

        # Compute similarity
        scores = util.pytorch_cos_sim(q_emb, self.embeddings)[0]
        top_idx = torch.topk(scores, k).indices.tolist()

        # Return original DataFrame rows EXACTLY as stored
        return [self.df.iloc[i].to_dict() for i in top_idx]


In [38]:
# Build the engine end to end. load_data and build_index both return the
# engine, so the last call's return value is what this cell displays.
rag = RAGEngine()
rag.load_data(df)
rag.build_index()

RAGEngine: Loaded corpus with 346 items.
RAGEngine: Generating embeddings...
RAGEngine: Index built.


<__main__.RAGEngine at 0x70df34833590>

In [39]:
# Retrieve the three rows most similar to a free-text query; each result is
# a plain dict of the original DataFrame row.
result = rag.query("sea fort with strong historical importance", k=3)
result

[{'fort_id': 42,
  'name': 'Bhagwantgad',
  'alternate_names': '',
  'district': 'Sindhudurg',
  'taluka': 'Vengurla',
  'latitude': 15.9083,
  'longitude': 73.5242,
  'base_village': 'Bhonus',
  'type': 'Jal Durg (Sea Fort)',
  'elevation_m': 20,
  'current_condition': 'In Ruins',
  'era': 'Sawantwadi State',
  'built_by': 'Khem Sawant III',
  'year_of_construction': 'c. 1701',
  'key_events': 'Built to counter the Portuguese at Terekhol',
  'trek_difficulty': 'Easy',
  'trek_time_hours': 0.5,
  'best_season': 'Winter',
  'water_availability': nan,
  'accommodation': nan,
  'asi_protected': 'FALSE',
  'notes': 'A small coastal fort, mostly in ruins but offers great sea views.'},
 {'fort_id': 340,
  'name': 'Vijaydurg',
  'alternate_names': 'Gheria',
  'district': 'Sindhudurg',
  'taluka': 'Devgad',
  'latitude': 16.5667,
  'longitude': 73.3333,
  'base_village': 'Vijaydurg',
  'type': 'Jal Durg (Sea Fort)',
  'elevation_m': 10,
  'current_condition': 'Well-Preserved',
  'era': 'Shilah

In [41]:
from src.core.recommender import recommend_by_proximity, recommend_similar

In [48]:
# Ask for 10 recommendations by proximity to (lat=18.52, lon=73.85); the
# displayed result carries an extra distance_km column.
df0 = recommend_by_proximity(df, lat=18.52, lon=73.85, k=10)
df0

Unnamed: 0,fort_id,name,alternate_names,district,taluka,latitude,longitude,base_village,type,elevation_m,...,year_of_construction,key_events,trek_difficulty,trek_time_hours,best_season,water_availability,accommodation,asi_protected,notes,distance_km
248,249,Pisol Fort,,Pune,Haveli,18.4231,73.8312,Pisol,Giri Durg (Hill Fort),850,...,Unknown,A very small watchtower,Easy,0.5,Winter,,,False,Only a small plinth and a water cistern remain.,10.907715
100,101,Ghotavade Fort,,Pune,Mulshi,18.5412,73.6815,Ghotavade,Giri Durg (Hill Fort),700,...,Unknown,A small hillock fort,Easy,0.5,Winter,,,False,Almost no structure remains just a small hill.,17.944942
301,302,Sinhagad,Kondhana,Pune,Haveli,18.3663,73.7558,Donje,Giri Durg (Hill Fort),1312,...,c. 13th Century,"Battle of Sinhagad, 1670, led by Tanaji Malusare",Easy,0.5,Monsoon; Winter,Available seasonally; Bottled water sold,,True,Famous for pithla-bhakri stalls; Accessible by...,19.708968
204,205,Mohangad,Jaslodgad,Pune,Shirur,18.3,73.8167,Mohari,Giri Durg (Hill Fort),1100,...,c. 17th Century,A subsidiary fort to Rohida,Medium,2.0,Winter,Seasonal,,False,A small fort with remnants of a gate and ciste...,24.603595
187,188,Malhargad,Sonori Fort,Pune,Purandar,18.4111,74.0622,Sonori,Giri Durg (Hill Fort),925,...,c. 1760,Last fort built by the Marathas; named after L...,Easy,1.0,Monsoon; Winter,Seasonal,Temple,False,A small fort with two temples in good condition.,25.448933
285,286,Sangram Durg,Chakan Fort,Pune,Khed,18.75,73.85,Chakan,Bhuikot (Land Fort),650,...,c. 13th Century,Famous for the valiant defence by Firangoji Na...,Easy,0.5,All Seasons,Not applicable,,True,A land fort in the middle of Chakan town; has ...,25.458181
252,253,Purandar Fort,,Pune,Purandar,18.2831,73.8553,Narayanpur,Giri Durg (Hill Fort),1387,...,c. 11th Century,Birthplace of Sambhaji Maharaj; site of the Tr...,Easy,1.5,All Seasons,Available,,True,Consists of two forts: Purandar and Vajragad; ...,26.22726
328,329,Vajragad,Rudramal,Pune,Purandar,18.2833,73.8667,Narayanpur,Giri Durg (Hill Fort),1318,...,c. 14th Century,Twin fort of Purandar; captured by Shivaji's f...,Easy,0.5,All Seasons,,,True,"Often visited along with Purandar fort, featur...",26.2585
118,119,Induri Fort,Talegaon Fort,Pune,Maval,18.7301,73.6558,Induri,Bhuikot (Land Fort),580,...,c. 1720-21,Built on the banks of the Indrayani river,Easy,0.2,Winter,,,False,Known for the samadhi of Sarsenapati Khanderao...,30.996487
172,173,Lingana,,Raigad,Mahad,18.2989,73.6194,Mohari,Giri Durg (Hill Fort),884,...,c. 1648,Used as a penal colony by the Marathas,Hard,4.0,Winter,,Caves at base,False,A massive pinnacle near Raigad and Torna; requ...,34.533172


In [46]:
# Ask for forts similar to fort_id=1; the displayed result adds type_score,
# elev_diff and score columns, and its first row is fort 1 itself.
df2 = recommend_similar(df, fort_id=1, k=5)
df2.head(5)

Unnamed: 0,fort_id,name,alternate_names,district,taluka,latitude,longitude,base_village,type,elevation_m,...,trek_difficulty,trek_time_hours,best_season,water_availability,accommodation,asi_protected,notes,type_score,elev_diff,score
0,1,Aad Fort,,Ratnagiri,Dapoli,17.7125,73.2391,Aad,Giri Durg (Hill Fort),250,...,Easy,1.0,Winter,,,FALSE,A small lesser-known fort covered in dense ve...,1,0,1.0
190,191,Mangad Fort,,Raigad,Mangaon,18.3333,73.2667,Borwadi,Giri Durg (Hill Fort),235,...,Easy,1.0,Monsoon; Winter,Seasonal,Caves; Temple,FALSE,Features a unique cave called 'Chor-darwaza' o...,1,15,0.985
24,25,Avchitgad,,Raigad,Roha,18.435,73.1897,Medha,Giri Durg (Hill Fort),296,...,Easy,1.5,Monsoon; Winter,Year-round in cisterns,,FALSE,Known for its two main gates and bastions stil...,1,46,0.954
315,316,Talagad,,Raigad,Roha,18.35,73.1833,Talewadi,Giri Durg (Hill Fort),300,...,guarding the Mandad river,,1,Monsoon; Winter,Year-round,Caves; Temple,Features a large plateau and several water cis...,1,50,0.95
33,34,Balwantgad,,Ratnagiri,Khed,17.6538,73.4011,Asgani,Giri Durg (Hill Fort),300,...,Easy,1.0,Winter,,,FALSE,A small fort with very little historical record.,1,50,0.95
