In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
import joblib

# === LOAD DATA ===
# Read the JoSAA CSV from the Colab /content directory. The code below reads the
# columns institute, academic_program_name, quota, seat_type, gender, year, round
# and closing_rank from this frame.
df = pd.read_csv("/content/JoSAA_2019_2024_CLEAN_FINAL.csv")

# === FEATURE ENGINEERING ===
def engineer(df):
    """Return a copy of ``df`` with the derived model features added.

    Reads the columns ``institute``, ``academic_program_name``, ``quota``,
    ``year`` and ``round`` and adds:

    * ``institute_tier``  - 1/2/3 when the institute name contains the IIT, NIT
      or IIIT phrase respectively, otherwise 4.
    * ``branch_demand``   - 1/2/3 by keyword match on the lowercased program
      name, otherwise 4.
    * ``home_advantage``  - 1 when ``quota == "HS"``, else 0.
    * ``year_norm``       - ``year`` min-max scaled over the frame.
    * ``round_norm``      - ``round`` divided by the largest round in the frame.

    The input frame is not modified.
    """
    df = df.copy()

    # Institute tier: first matching phrase wins, in this order.
    institute_tiers = (
        ("Indian Institute of Technology", 1),
        ("National Institute of Technology", 2),
        ("Indian Institute of Information Technology", 3),
    )

    def inst_tier(x):
        for phrase, tier in institute_tiers:
            if phrase in x:
                return tier
        return 4

    df["institute_tier"] = df["institute"].apply(inst_tier)

    # Branch demand tier: plain substring tests on the lowercased name.
    # NOTE(review): "ai" is matched as a raw substring, so it also fires on words
    # such as "sustainable" and does not fire on "artificial intelligence".
    # Left unchanged here because the feature must stay identical between
    # training and inference; change every copy and retrain together.
    branch_tiers = (
        (("computer", "ai", "data"), 1),
        (("elect",), 2),
        (("mechanical", "civil", "chemical"), 3),
    )

    def branch_tier(x):
        x = x.lower()
        for keywords, tier in branch_tiers:
            if any(keyword in x for keyword in keywords):
                return tier
        return 4

    df["branch_demand"] = df["academic_program_name"].apply(branch_tier)

    # Home advantage
    df["home_advantage"] = (df["quota"] == "HS").astype(int)

    # Normalized year + round.
    # A frame holding a single year has a zero span; dividing by 1 instead yields
    # 0.0 for every row rather than the NaN produced by 0/0.
    year_min = df["year"].min()
    year_span = df["year"].max() - year_min
    df["year_norm"] = (df["year"] - year_min) / (year_span if year_span != 0 else 1)

    # Same guard for a (degenerate) maximum round of 0.
    round_max = df["round"].max()
    df["round_norm"] = df["round"] / (round_max if round_max != 0 else 1)

    return df

df = engineer(df)

# === SELECT FEATURES ===
# Categorical and numeric features are listed by name; the positional indices
# CatBoost needs are derived from these lists below, so reordering or adding a
# feature cannot silently mark the wrong column as categorical.
categorical_cols = [
    "institute",
    "academic_program_name",
    "quota",
    "seat_type",
    "gender",
]
numeric_cols = [
    "year_norm",
    "round_norm",
    "institute_tier",
    "branch_demand",
    "home_advantage",
]
feature_cols = categorical_cols + numeric_cols

X = df[feature_cols]
y = df["closing_rank"]

# === TRAIN-TEST SPLIT ===
# Random 80/20 row split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# === CATEGORICAL INDICES ===
# Positions of the categorical columns inside feature_cols.
categorical_idx = [feature_cols.index(col) for col in categorical_cols]

# === TRAIN CATBOOST ===
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.07,
    depth=8,
    loss_function="RMSE",
    random_seed=42,
    verbose=False
)

model.fit(
    X_train, y_train,
    cat_features=categorical_idx
)

# === EVALUATE ===
# Metrics are computed on the held-out 20% split.
pred = model.predict(X_test)
print("\n===== MODEL PERFORMANCE =====")
print("MAE :", mean_absolute_error(y_test, pred))
print("RMSE:", mean_squared_error(y_test, pred)**0.5)
print("R2  :", r2_score(y_test, pred))

# === SAVE ===
# The model is stored together with its feature list and categorical indices so
# a later session can rebuild the exact input layout.
SAVE_PATH = "/content/COLLEGE_MODEL.pkl"
joblib.dump((model, feature_cols, categorical_idx), SAVE_PATH)

print("\nModel saved to:", SAVE_PATH)



===== MODEL PERFORMANCE =====
MAE : 2557.6394924237534
RMSE: 13120.551039165102
R2  : 0.8883651714648854

Model saved to: /content/COLLEGE_MODEL.pkl


In [None]:
import joblib
import pandas as pd

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
import joblib

In [None]:
def engineer(df):
    """Return a copy of ``df`` with the derived model features added.

    Reads the columns ``institute``, ``academic_program_name``, ``quota``,
    ``year`` and ``round`` and adds:

    * ``institute_tier``  - 1/2/3 when the institute name contains the IIT, NIT
      or IIIT phrase respectively, otherwise 4.
    * ``branch_demand``   - 1/2/3 by keyword match on the lowercased program
      name, otherwise 4.
    * ``home_advantage``  - 1 when ``quota == "HS"``, else 0.
    * ``year_norm``       - ``year`` min-max scaled over the frame.
    * ``round_norm``      - ``round`` divided by the largest round in the frame.

    The input frame is not modified.
    """
    df = df.copy()

    # First matching phrase wins, in this order.
    institute_tiers = (
        ("Indian Institute of Technology", 1),
        ("National Institute of Technology", 2),
        ("Indian Institute of Information Technology", 3),
    )

    def inst_tier(x):
        for phrase, tier in institute_tiers:
            if phrase in x:
                return tier
        return 4

    df["institute_tier"] = df["institute"].apply(inst_tier)

    # Plain substring tests on the lowercased program name.
    # NOTE(review): "ai" is matched as a raw substring, so it also fires on words
    # such as "sustainable" and does not fire on "artificial intelligence".
    # Left unchanged here because the feature must stay identical between
    # training and inference; change every copy and retrain together.
    branch_tiers = (
        (("computer", "ai", "data"), 1),
        (("elect",), 2),
        (("mechanical", "civil", "chemical"), 3),
    )

    def branch_tier(x):
        x = x.lower()
        for keywords, tier in branch_tiers:
            if any(keyword in x for keyword in keywords):
                return tier
        return 4

    df["branch_demand"] = df["academic_program_name"].apply(branch_tier)

    df["home_advantage"] = (df["quota"] == "HS").astype(int)

    # A frame holding a single year has a zero span; dividing by 1 instead yields
    # 0.0 for every row rather than the NaN produced by 0/0.
    year_min = df["year"].min()
    year_span = df["year"].max() - year_min
    df["year_norm"] = (df["year"] - year_min) / (year_span if year_span != 0 else 1)

    # Same guard for a (degenerate) maximum round of 0.
    round_max = df["round"].max()
    df["round_norm"] = df["round"] / (round_max if round_max != 0 else 1)

    return df


In [None]:
# Load the (model, feature_cols, categorical_idx) tuple from the path the
# training cell writes to, so a fresh Restart & Run All finds the file.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced.
model, feature_cols, categorical_idx = joblib.load("/content/COLLEGE_MODEL.pkl")
df = pd.read_csv("/content/JoSAA_2019_2024_CLEAN_FINAL.csv")
df = engineer(df)  # same feature engineering as used for training


In [None]:
def safe_filter(df, col, value):
    """Return the rows of ``df`` whose ``col`` text contains ``value``.

    Both the column values and ``value`` are normalized the same way before
    comparison: converted to ``str``, stripped of leading/trailing whitespace,
    lowercased, and with hyphens and underscores replaced by spaces. The match
    is a literal substring test (not a regular expression), so queries such as
    ``"OPEN (PwD)"`` match themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    col : str
        Name of the column to search.
    value : object
        Text to look for; converted with ``str``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` with their original index.
    """
    needle = str(value).strip().lower().replace("-", " ").replace("_", " ")

    haystack = (
        df[col]
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace("-", " ", regex=False)
        .str.replace("_", " ", regex=False)
    )

    # regex=False: parentheses and other metacharacters in the query are literal.
    return df[haystack.str.contains(needle, regex=False)]


In [None]:
# Show the distinct gender labels; the recommender compares this column by exact match.
print(df["gender"].unique())


['Gender-Neutral' 'Female-only']


In [None]:
# Show the distinct seat_type labels; the recommender compares this column by exact match.
print(df["seat_type"].unique())


['OPEN' 'EWS' 'OBC-NCL' 'SC' 'ST' 'OPEN (PwD)' 'OBC-NCL (PwD)' 'SC (PwD)'
 'EWS (PwD)' 'ST (PwD)']


In [None]:
# Show the distinct quota codes; the recommender compares this column by exact match.
print(df["quota"].unique())


['AI' 'HS' 'OS' 'GO' 'JK' 'LA']


In [None]:
# List the distinct round numbers present for each year in the data.
print(df.groupby("year")["round"].unique())


year
2021    [1, 6]
2022    [1, 6]
2023    [1, 6]
2024    [1, 5]
Name: round, dtype: object


In [None]:
# Preview the first ten rows of the engineered frame.
print(df.head(10))


                                    institute  \
0  Indian Institute of Technology Bhubaneswar   
1  Indian Institute of Technology Bhubaneswar   
2  Indian Institute of Technology Bhubaneswar   
3  Indian Institute of Technology Bhubaneswar   
4  Indian Institute of Technology Bhubaneswar   
5  Indian Institute of Technology Bhubaneswar   
6  Indian Institute of Technology Bhubaneswar   
7  Indian Institute of Technology Bhubaneswar   
8  Indian Institute of Technology Bhubaneswar   
9  Indian Institute of Technology Bhubaneswar   

                               academic_program_name quota seat_type  \
0  Civil Engineering (4 Years, Bachelor of Techno...    AI      OPEN   
1  Civil Engineering (4 Years, Bachelor of Techno...    AI      OPEN   
2  Civil Engineering (4 Years, Bachelor of Techno...    AI       EWS   
3  Civil Engineering (4 Years, Bachelor of Techno...    AI       EWS   
4  Civil Engineering (4 Years, Bachelor of Techno...    AI   OBC-NCL   
5  Civil Engineering (4 Year

In [None]:
def recommend_colleges_with_ML(
    user_rank,
    seat_type="OPEN",
    gender="Gender-Neutral",
    quota="AI",
    year=2024,
    round_no=5     # LAST POSSIBLE ROUND FOR 2024
):
    """Recommend programs whose predicted closing rank is at or above ``user_rank``.

    Uses the module-level ``df`` (engineered data), ``model`` and
    ``feature_cols``. Rows are selected by exact match on ``year``, ``round``,
    ``gender``, ``seat_type`` and ``quota``; the model then predicts a closing
    rank for each selected row.

    Parameters
    ----------
    user_rank : number
        The candidate's rank.
    seat_type, gender, quota : str
        Exact values to match in the corresponding columns.
    year : int
        Year to select.
    round_no : int
        Round to select. If that round is absent for ``year``, the largest
        available round for that year is used and a warning is printed.

    Returns
    -------
    pandas.DataFrame
        Columns ``institute``, ``academic_program_name``,
        ``predicted_closing_rank``, ``opening_rank`` and ``closing_rank``.
        Rows with ``predicted_closing_rank >= user_rank`` sorted ascending by
        predicted rank; if there are none, the (up to) 20 rows whose prediction
        is closest to ``user_rank``.

    Raises
    ------
    ValueError
        If ``df`` contains no rows for ``year``.
    """
    output_cols = [
        "institute",
        "academic_program_name",
        "predicted_closing_rank",
        "opening_rank",
        "closing_rank"
    ]

    # ------------ YEAR + ROUND FILTER ------------
    available_rounds = df.loc[df["year"] == year, "round"].unique()
    if len(available_rounds) == 0:
        # Previously this surfaced as an opaque "max() arg is an empty sequence".
        raise ValueError(f"No data available for year {year}.")
    if round_no not in available_rounds:
        print(f"⚠ Round {round_no} not available for year {year}. Using round {max(available_rounds)} instead.")
        round_no = max(available_rounds)

    # ------------ STRING FILTERS (exact match) ------------
    selected = (
        (df["year"] == year)
        & (df["round"] == round_no)
        & (df["gender"] == gender)
        & (df["seat_type"] == seat_type)
        & (df["quota"] == quota)
    )
    # Copy only the selected rows so the column added below never touches df.
    data = df[selected].copy()

    # ------------ PREDICT CLOSING RANK ------------
    if data.empty:
        # Nothing matched the filters: skip the model call on a zero-row frame.
        data["predicted_closing_rank"] = np.empty(0, dtype=float)
    else:
        data["predicted_closing_rank"] = model.predict(data[feature_cols])

    # ------------ FILTER BASED ON USER RANK ------------
    eligible = data[data["predicted_closing_rank"] >= user_rank]

    # If empty, show closest matches
    if eligible.empty:
        print("⚠ No eligible colleges found for this rank. Showing nearest possible matches...")
        data["rank_diff"] = abs(data["predicted_closing_rank"] - user_rank)
        return data.sort_values("rank_diff").head(20)[output_cols]

    # Ascending predicted closing rank: most competitive programs first.
    eligible = eligible.sort_values("predicted_closing_rank")

    return eligible[output_cols]


In [None]:
# Example query: rank 10000, OPEN seat, Gender-Neutral, AI quota, 2024 round 5.
result = recommend_colleges_with_ML(
    user_rank=10000,
    seat_type="OPEN",
    gender="Gender-Neutral",
    quota="AI",
    year=2024,
    round_no=5
)

# Print at most the first 20 recommended rows.
print(result.head(20))


                                           institute  \
73053     Indian Institute of Technology Gandhinagar   
73093           Indian Institute of Technology Patna   
74214             Indian Institute of Technology Goa   
73445         Indian Institute of Technology Roorkee   
73111           Indian Institute of Technology Patna   
72120          Indian Institute of Technology Indore   
73815  Indian Institute of Technology (BHU) Varanasi   
72159       Indian Institute of Technology Kharagpur   
72285       Indian Institute of Technology Kharagpur   
72628         Indian Institute of Technology Jodhpur   
72992     Indian Institute of Technology Gandhinagar   
73588   Indian Institute of Technology (ISM) Dhanbad   
71770           Indian Institute of Technology Mandi   
72210       Indian Institute of Technology Kharagpur   
73389         Indian Institute of Technology Roorkee   
73938  Indian Institute of Technology (BHU) Varanasi   
73967  Indian Institute of Technology (BHU) Vara

In [None]:
# Load the (model, feature_cols, categorical_idx) tuple once, before the function
# that uses it, from the path the training cell writes to.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced.
model, feature_cols, categorical_idx = joblib.load("/content/COLLEGE_MODEL.pkl")

def recommend_colleges_with_ML(
    user_rank,
    seat_type="OPEN",
    gender="Gender-Neutral",
    quota="AI",
    year=2024,
    round_no=5
):
    """Recommend non-IIT programs whose predicted closing rank is at or above ``user_rank``.

    Uses the module-level ``df`` (engineered data), ``model`` and
    ``feature_cols``. Rows are selected by exact match on ``year``, ``round``,
    ``gender``, ``seat_type`` and ``quota``; institutes whose name contains
    "Indian Institute of Technology" are dropped before prediction.

    Parameters
    ----------
    user_rank : number
        The candidate's rank.
    seat_type, gender, quota : str
        Exact values to match in the corresponding columns.
    year, round_no : int
        Year and round to select (no fallback is applied here).

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` plus ``predicted_closing_rank``. Rows with
        ``predicted_closing_rank >= user_rank`` sorted ascending by predicted
        rank; if there are none, the (up to) 20 rows whose prediction is
        closest to ``user_rank``, with an extra ``rank_diff`` column.
    """
    selected = (
        (df["year"] == year)
        & (df["round"] == round_no)
        & (df["gender"] == gender)
        & (df["seat_type"] == seat_type)
        & (df["quota"] == quota)
        # Remove IITs (literal substring match on the institute name).
        & ~df["institute"].str.contains("Indian Institute of Technology", regex=False)
    )
    # Copy only the selected rows so the columns added below never touch df.
    data = df[selected].copy()

    # ML Prediction
    if data.empty:
        # Nothing matched the filters: skip the model call on a zero-row frame.
        data["predicted_closing_rank"] = np.empty(0, dtype=float)
    else:
        data["predicted_closing_rank"] = model.predict(data[feature_cols])

    eligible = data[data["predicted_closing_rank"] >= user_rank]

    if eligible.empty:
        # No program is predicted to close at or after the user's rank:
        # fall back to the nearest predictions.
        data["rank_diff"] = abs(data["predicted_closing_rank"] - user_rank)
        return data.sort_values("rank_diff").head(20)

    return eligible.sort_values("predicted_closing_rank")


In [None]:
# Same example query, now answered by the redefined recommender that drops IITs.
result = recommend_colleges_with_ML(
    user_rank=10000,
    seat_type="OPEN",
    gender="Gender-Neutral",
    quota="AI",
    year=2024,
    round_no=5
)

# Print at most the first 20 recommended rows.
print(result.head(20))

                                               institute  \
80534  Atal Bihari Vajpayee Indian Institute of Infor...   
80825  Indian Institute of Information Technology, Al...   
81928                         Mizoram University, Aizawl   
81084  Indian Institute of Information Technology Luc...   
81047  Indian Institute of Information Technology Luc...   
82020  Shri Mata Vaishno Devi University, Katra, Jamm...   
81058  Indian Institute of Information Technology Luc...   
81071  Indian Institute of Information Technology Luc...   
80794  Indian Institute of Information Technology, Al...   
80559  Atal Bihari Vajpayee Indian Institute of Infor...   
80862  Indian Institute of Information Technology, De...   
81323  Indian Institute of Information Technology (II...   
80585  Atal Bihari Vajpayee Indian Institute of Infor...   
80940  Pt. Dwarka Prasad Mishra Indian Institute of I...   
80572  Atal Bihari Vajpayee Indian Institute of Infor...   
80889  Indian Institute of Information T