<a href="https://colab.research.google.com/github/tarakantaacharya/NLPinternal/blob/main/processing_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install chromadb



In [None]:
import pandas as pd
# Load the college list into `df`; the CSV path is relative to the current working directory.
df = pd.read_csv('college_list.csv')

In [None]:
import pandas as pd

# Step 1: Mirror the 'CSE_Best_Boy_Rank' values into a new 'Best_Rank' column
df["Best_Rank"] = df["CSE_Best_Boy_Rank"]

# Step 2: Gather the names of all columns whose label contains "_Branch"
branch_columns = list(filter(lambda name: "_Branch" in name, df.columns))

def get_available_branches(row, columns=None):
    """Return the branch codes offered by one college as a comma-separated string.

    Parameters
    ----------
    row : mapping-like
        One record (e.g. a DataFrame row) indexable by column name.
    columns : iterable of str, optional
        Branch column names to inspect. Defaults to the module-level
        ``branch_columns`` list, which keeps ``df.apply(get_available_branches, axis=1)``
        working unchanged.

    Returns
    -------
    str
        Column names with "_Branch" removed, joined by ", ". Empty string when
        no branch is available.
    """
    if columns is None:
        columns = branch_columns

    available = []
    for col in columns:
        value = row[col]
        # A missing cell (NaN / None / pd.NA) is not evidence that the branch is
        # offered, and comparing pd.NA with a string would raise in a boolean test.
        if pd.isna(value):
            continue
        if isinstance(value, str) and value.strip() == "Not Available":
            continue
        available.append(col.replace("_Branch", ""))
    return ", ".join(available)

# Summarise, per college, which branches are offered
df["Branch_Available"] = df.apply(get_available_branches, axis=1)

# Step 3: Remove the per-branch columns and every rank column other than 'Best_Rank'.
# NOTE: this rebinds `df`; 'CSE_Best_Boy_Rank' is among the dropped columns, so
# Step 1 cannot be re-run on the result without reloading the CSV first.
rank_columns = [name for name in df.columns if "Rank" in name and name != "Best_Rank"]
columns_to_remove = branch_columns + rank_columns
df = df.drop(columns_to_remove, axis="columns")

# Show the first few rows of the trimmed frame
df.head()

Unnamed: 0,S.No,Institute Name,State,Institute Code,Location,Institution Registered,Established Year,Affiliated To,Co-Ed Status,District Code,College Type,Tuition Fee,Best_Rank,Branch_Available
0,1,ADARSH COLLEGE OF ENGINEERING,Andhra Pradesh,ACEE,GOLLAPROLU,AU,2008,JNTUK,COED,EG,PVT,43000,98862,"CIV, CSE, ECE, MEC"
1,2,ADITYA COLLEGE OF ENGINEERING,Andhra Pradesh,ACES,PEDDAPURAM,AU,2008,JNTUK,COED,EG,PVT,43000,61096,"CIV, CSE, ECE, MEC, CSD, CSM, EEE, INF"
2,3,ADITYA COLLEGE OF ENGINEERING AND TECHNOLOGY,Andhra Pradesh,ACET,PEDDAPURAM,AU,2004,JNTUK,COED,EG,PVT,43000,22060,"CIV, CSE, ECE, MEC, CSD, CSM, EEE, INF"
3,4,ADITYA COLLEGE OF PHARMACY,Andhra Pradesh,ACPS,PEDDAPURAM,AU,2006,JNTUK,COED,EG,PVT,62400,Not Available,
4,5,ADARSHA COLL OF PHARMACY,Andhra Pradesh,ADCP,G.KOTHAPALLI,AU,2008,JNTUK,COED,EG,PVT,38000,Not Available,


In [None]:
# List the column labels currently present on `df`
df.columns

Index(['S.No', 'Institute Name', 'State', 'Institute Code', 'Location',
       'Institution Registered', 'Established Year', 'Affiliated To',
       'Co-Ed Status', 'District Code', 'College Type', 'Tuition Fee',
       'Best_Rank', 'Branch_Available'],
      dtype='object')

In [None]:
import pandas as pd
import json

def _is_missing(value):
    """Return True when a cell carries no usable value (None, NaN, blank or "Not Available")."""
    if value is None:
        return True
    if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
        return True
    return str(value).strip() in ("", "Not Available")


def convert_to_rag_format(row):
    """Convert one college record into a short English paragraph for retrieval.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys 'Institute Name', 'College Type', 'Location',
        'State', 'Affiliated To', 'Established Year', 'Co-Ed Status',
        'Institute Code', 'Tuition Fee', 'Branch_Available' and 'Best_Rank'.

    Returns
    -------
    str
        The description. When no branch is listed, the text says so explicitly
        instead of emitting the broken sentence "Available branches include .";
        a missing rank is reported as "Not Available" rather than e.g. "nan".
    """
    if _is_missing(row["Branch_Available"]):
        branches_sentence = "No branch information is available. "
    else:
        branches_sentence = f"Available branches include {row['Branch_Available']}. "

    best_rank = "Not Available" if _is_missing(row["Best_Rank"]) else row["Best_Rank"]

    return (
        f"{row['Institute Name']} is a {row['College Type']} college located in {row['Location']}, {row['State']}. "
        f"It is affiliated to {row['Affiliated To']} and was established in {row['Established Year']}. "
        f"The college is {row['Co-Ed Status']} and has an institution code of {row['Institute Code']}. "
        f"The tuition fee is {row['Tuition Fee']} INR. "
        f"{branches_sentence}"
        f"The best CSE rank admitted in this college is {best_rank}."
    )

# Build the retrieval paragraph for every college
df["RAG_Text"] = df.apply(convert_to_rag_format, axis=1)

# Keep only the college name (exposed under the key "metadata") and its paragraph
rag_data = df.loc[:, ["Institute Name", "RAG_Text"]].rename(
    {"Institute Name": "metadata"}, axis="columns"
)

# One dict per college, ready to be loaded into the vector DB
rag_json = rag_data.to_dict("records")

# Persist the records as pretty-printed JSON
with open("rag_data.json", "w") as f:
    f.write(json.dumps(rag_json, indent=4))

# Preview the first 3 entries
print(json.dumps(rag_json[0:3], indent=4))

[
    {
        "metadata": "ADARSH COLLEGE OF ENGINEERING",
        "RAG_Text": "ADARSH COLLEGE OF ENGINEERING is a PVT college located in GOLLAPROLU, Andhra Pradesh. It is affiliated to JNTUK and was established in 2008. The college is COED and has an institution code of ACEE. The tuition fee is 43000 INR. Available branches include CIV, CSE, ECE, MEC. The best CSE rank admitted in this college is 98862."
    },
    {
        "metadata": "ADITYA COLLEGE OF ENGINEERING",
        "RAG_Text": "ADITYA COLLEGE OF ENGINEERING is a PVT college located in PEDDAPURAM, Andhra Pradesh. It is affiliated to JNTUK and was established in 2008. The college is COED and has an institution code of ACES. The tuition fee is 43000 INR. Available branches include CIV, CSE, ECE, MEC, CSD, CSM, EEE, INF. The best CSE rank admitted in this college is 61096."
    },
    {
        "metadata": "ADITYA COLLEGE OF ENGINEERING AND TECHNOLOGY",
        "RAG_Text": "ADITYA COLLEGE OF ENGINEERING AND TECHNOLOGY is a P

In [None]:
from sentence_transformers import SentenceTransformer
import chromadb

# Load pre-trained embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight model for embeddings

# Initialize ChromaDB with on-disk persistence under "college_rag_db"
chroma_client = chromadb.PersistentClient(path="college_rag_db")

# Create the collection, or reopen it if a previous run already created it
collection = chroma_client.get_or_create_collection(name="college_info")

# The college name is used as the record ID, so it has to be unique within a batch.
# Keep the first record seen for each name and report how many were skipped
# instead of dropping them silently.
records_by_id = {}
for record in rag_json:
    records_by_id.setdefault(record["metadata"], record)

skipped_count = len(rag_json) - len(records_by_id)
if skipped_count:
    print(f"Skipped {skipped_count} record(s) with a duplicate college name.")

college_ids = list(records_by_id)
college_texts = [records_by_id[college_id]["RAG_Text"] for college_id in college_ids]

# Encode all texts in one call (the model batches internally) rather than one call per record
college_embeddings = model.encode(college_texts).tolist() if college_texts else []

# `upsert` (instead of `add`) makes this cell safe to re-run against the persistent
# collection: existing IDs are updated with the current text, new IDs are inserted.
# Writing in fixed-size slices keeps each request small.
UPSERT_BATCH_SIZE = 256
for start in range(0, len(college_ids), UPSERT_BATCH_SIZE):
    stop = start + UPSERT_BATCH_SIZE
    batch_ids = college_ids[start:stop]
    collection.upsert(
        ids=batch_ids,
        embeddings=college_embeddings[start:stop],
        metadatas=[{"college_name": college_id} for college_id in batch_ids],
        documents=college_texts[start:stop],  # Store text for retrieval
    )

print("Embeddings stored successfully in ChromaDB!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings stored successfully in ChromaDB!


In [None]:
# Sanity check: how many records does the collection hold?
print("Total documents in ChromaDB:", collection.count())

# `docs` still holds the full result, but only a short preview is printed:
# dumping every stored record floods the cell output and bloats the notebook file.
docs = collection.get()
PREVIEW_SIZE = 3
docs_preview = {
    field: values[:PREVIEW_SIZE]
    for field, values in docs.items()
    if isinstance(values, list)  # fields that were not requested come back as None
}
print(f"Stored Documents (first {PREVIEW_SIZE}):", docs_preview)

# Smoke-test retrieval with a free-text query
query_embedding = model.encode("Best private colleges in Andhra Pradesh").tolist()
results = collection.query(query_embeddings=[query_embedding], n_results=3)
print("Query Results:", results)

Total documents in ChromaDB: 450
Stored Documents: {'ids': ['ADARSH COLLEGE OF ENGINEERING', 'ADITYA COLLEGE OF ENGINEERING', 'ADITYA COLLEGE OF ENGINEERING AND TECHNOLOGY', 'ADITYA COLLEGE OF PHARMACY', 'ADARSHA COLL OF PHARMACY', 'ADITYA ENGINEERING COLLEGE', 'ADI KAVI NANNAYA UNIVERSITY COLLEGE OF ENGG.-SELF FINANCE', 'B V CHALAMAIAH ENGINEERING COLLEGE', 'BVC COLLEGE OF ENGINEERING', 'BONAM VENKATA CHALAMAIAH INST. OF TECH AND SCI.', 'CHAITANYA INST. OF SCI. AND TECHNOLOGY', 'GODAVARI INSTITUTE OF ENGG. AND TECHNOLOGY', 'GIET ENGINEERING COLLEGE', 'IDEAL INSTITUTE OF TECHNOLOGY', 'INTERNATIONAL SCHOOL OF TECH AND SCI FOR WOMEN', 'SCHOOL OF FOOD TECHNOLOGY JNTUK KAKINADA-SELF FINANCE', 'SCHOOL OF PHARMACEUTICAL SCI. AND TECH.-SELF FINANCE', 'JNTUK COLLEGE OF ENGG. KAKINADA', 'JNTUK COLLEGE OF ENGG. KAKINADA- SELF FINANCE', 'JNTUK COLLEGE OF ENGG. KAKINADA- SELF SUPPORTING', 'KAKINADA INSTITUTE OF ENGG. AND TECHNOLOGY', 'KAKINADA INST OF ENGG AND TECHNOLOGY FOR WOMEN', 'KAKINADA INST

In [None]:
def search_college_info(query, top_k=3):
    """Print the stored documents that best match a free-text query.

    The query is embedded with the module-level ``model`` and looked up in the
    module-level ``collection``; up to ``top_k`` documents are requested.
    Nothing is returned; each retrieved document is printed on its own line.
    """
    matches = collection.query(
        query_embeddings=[model.encode(query).tolist()],
        n_results=top_k,
    )

    # Only one query was sent, so its hits are the first (and only) entry
    top_documents = matches["documents"][0]
    for document in top_documents:
        print(document)

# Example query: prints the retrieved documents, one per line (top_k defaults to 3)
search_college_info("Where is Adarsh College of Engineering located")

ADARSH COLLEGE OF ENGINEERING is a PVT college located in GOLLAPROLU, Andhra Pradesh. It is affiliated to JNTUK and was established in 2008. The college is COED and has an institution code of ACEE. The tuition fee is 43000 INR. Available branches include CIV, CSE, ECE, MEC. The best CSE rank admitted in this college is 98862.
ADARSHA COLL OF PHARMACY is a PVT college located in G.KOTHAPALLI, Andhra Pradesh. It is affiliated to JNTUK and was established in 2008. The college is COED and has an institution code of ADCP. The tuition fee is 38000 INR. Available branches include . The best CSE rank admitted in this college is Not Available.
AAR MAHAVEER ENGINEERING COLLEGE is a PVT college located in BANDLAGUDA, Telangana. It is affiliated to JNTUH and was established in 2010. The college is COED and has an institution code of AARM. The tuition fee is 60000 INR. Available branches include CSE, ECE, MEC, CSM, EEE. The best CSE rank admitted in this college is 44341.


---