## MODEL

In [19]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

# ========================== 1. LOAD DATA ================================

# Career dataset for skill matching. Missing values in its text columns are
# replaced by empty strings and the columns are cast to str.
career_df = pd.read_csv("career_dataset.csv")
for text_column in ("Career", "Skill"):
    career_df[text_column] = career_df[text_column].fillna("").astype(str)

# Private universities: the same clean-up is applied to "Relevant_Field".
private_unis = pd.read_csv("private_unis.csv")
for text_column in ("Relevant_Field",):
    private_unis[text_column] = private_unis[text_column].fillna("").astype(str)

# Government universities are read from the Excel workbook without clean-up.
gov_unis_df = pd.read_excel("government_unis.xlsx")

In [20]:
# Pre-load the Sentence-BERT encoder used for the corpus embeddings below.
model = SentenceTransformer("paraphrase-mpnet-base-v2")

# ========================== 2. SEMANTIC SEARCH FOR THE USER'S SKILL INPUT ================================

# Encode every entry of the career dataset's "Skill" column once, as a tensor;
# the user's query is later compared against these embeddings to find the
# best matching career.
skill_texts = career_df["Skill"].tolist()
skill_embeddings = model.encode(skill_texts, convert_to_tensor=True)

In [21]:
# Reuse the already-loaded encoder instead of instantiating a second copy of
# the same "paraphrase-mpnet-base-v2" checkpoint (which doubles load time and
# memory). `model2` stays available as an alias for code that refers to it.
model2 = model

In [25]:
# Ask for the skill area, re-prompting on blank input: an empty string would
# still be embedded and "matched" against some arbitrary dataset row.
user_skill_input = ""
while not user_skill_input:
    user_skill_input = input("Enter the skill area you are interested in (e.g., 'data analytics'): ").strip()
query_embedding = model2.encode(user_skill_input, convert_to_tensor=True)

# Rank the pre-computed skill embeddings against the query. semantic_search
# returns one hit list per query, so [0] is the list for our single query.
top_k = 5
search_results = util.semantic_search(query_embedding, skill_embeddings, top_k=top_k)[0]

if search_results:
    # Best hit. corpus_id is a position in the list that skill_embeddings was
    # built from (career_df["Skill"]), hence the positional .iloc lookup.
    top_match = search_results[0]
    matched_row = career_df.iloc[top_match["corpus_id"]]
    matched_career = matched_row["Career"]
    matched_skill_text = matched_row["Skill"]
    matched_score = top_match["score"]
    print(f"\nTop matched skill from dataset: '{matched_skill_text}' (Score: {matched_score:.2f})")
    print(f"Corresponding Career: '{matched_career}'")
else:
    # No hits at all. Define matched_score on this path too so later code can
    # rely on the name existing.
    matched_score = None
    if career_df.empty:
        # Nothing to fall back on; avoid the IndexError from .iloc[0].
        matched_career = ""
        matched_skill_text = ""
        print("No matching skills found in the dataset, and the dataset has no rows to fall back on.\n")
    else:
        print("No matching skills found in the dataset. Showing fallback career from the entire dataset.\n")
        fallback_career = career_df.iloc[0]  # Or randomly pick any row
        matched_career = fallback_career["Career"]
        matched_skill_text = fallback_career["Skill"]
        print(f"Fallback Career: '{matched_career}' / Skill: '{matched_skill_text}'")


# ========================== 3. PRIVATE UNIVERSITY RECOMMENDATIONS (WITH FALLBACK) ================================

print("\n----- PRIVATE UNIVERSITY RECOMMENDATIONS -----")

# 3.1 Encode the "Relevant_Field" text of every private university row.
private_fields = private_unis["Relevant_Field"].tolist()
private_field_embeddings = model.encode(private_fields, convert_to_tensor=True)

# 3.2 Rank all rows against the query embedding ([0] = hits for our one query).
private_results = util.semantic_search(
    query_embedding,
    private_field_embeddings,
    top_k=len(private_unis),
)[0]

if private_results:
    print("Top Private University Recommendations:")
    # Only the five best-scoring rows are shown.
    top_private = private_results[:5]
    for rank, hit in enumerate(top_private, start=1):
        similarity = hit["score"]
        uni_row = private_unis.iloc[hit["corpus_id"]]
        print(f"\nRecommendation {rank}:")
        print(f"  University: {uni_row['University']}")
        print(f"  Degree: {uni_row['Degree']}")
        print(f"  Relevant Field: {uni_row['Relevant_Field']}")
        print(f"  Link: {uni_row['Link']}")
        print(f"  Similarity Score: {similarity:.2f}")
else:
    print(f"No semantic matches found in private universities for skill '{user_skill_input}'.")
    print("Providing fallback suggestion from entire private dataset:\n")
    # Fallback: one randomly sampled row rather than always the first one.
    fallback_row = private_unis.sample(1).iloc[0]
    print(f"  University: {fallback_row['University']}")
    print(f"  Degree: {fallback_row['Degree']}")
    print(f"  Relevant Field: {fallback_row['Relevant_Field']}")
    print(f"  Link: {fallback_row['Link']}")


# ========================== 4. GOVERNMENT UNIVERSITY RECOMMENDATIONS ================================

def _print_gov_program(row, rank=None, score=None):
    """Print one government programme row.

    Args:
        row: A row of ``gov_unis_df`` (pandas Series).
        rank: 1-based position shown in the "Recommendation N:" heading;
            the heading is omitted when None.
        score: Similarity score printed on a final line; omitted when None.
    """
    if rank is not None:
        print(f"\nRecommendation {rank}:")
    print(f"  University: {row['Selected_University']}")
    print(f"  Course: {row['Course']}")
    print(f"  Z-Cutoff Required: {row['Z_score']}")
    print(f"  District: {row.get('District', 'N/A')}")
    # str() guards against a missing (NaN) stream, which has no .lower().
    if str(row['Stream']).lower() == 'all':
        print(f"  Stream: {row['Stream']} (Open to all streams)")
    else:
        print(f"  Stream: {row['Stream']}")
    if score is not None:
        print(f"  Similarity Score: {score:.2f}")


gov_interest = input("\nAre you interested in government universities? (yes/no): ").lower().strip()
if gov_interest in ["yes", "y"]:
    # Parse the Z-score. "nan"/"inf" parse as floats but cannot be compared
    # meaningfully against cut-offs, so they are rejected as well.
    try:
        user_z = float(input("Enter your Z-score: ").strip())
    except ValueError:
        user_z = None
    if user_z is not None and not np.isfinite(user_z):
        user_z = None

    if user_z is None:
        # Skip this section instead of calling exit(): in a notebook exit()
        # shuts the kernel down and discards the loaded model and data.
        print("Invalid Z-score. Exiting government university search.")
    else:
        user_district = input("Enter your District (e.g., 'Kandy'): ").strip().title()
        al_stream = input("Enter your A-Level stream (e.g., 'Physical Science', 'Commerce', etc.): ").strip().title()

        print("\n----- GOVERNMENT UNIVERSITY RECOMMENDATIONS -----")

        # 4.1 Keep programs whose cut-off the user meets, then narrow them by
        # the first of these tiers that yields any rows:
        #   1. same district AND same stream
        #   2. same stream, any district
        #   3. programs whose stream is 'all'
        #   4. Z-score only
        eligible_gov = gov_unis_df[gov_unis_df["Z_score"] <= user_z]
        stream_match = eligible_gov["Stream"].str.title() == al_stream
        open_stream = eligible_gov["Stream"].str.lower() == "all"
        if "District" in eligible_gov.columns:
            exact_match = stream_match & (eligible_gov["District"].str.title() == user_district)
        else:
            # No district data: tier 1 can never match, fall through to tier 2.
            exact_match = stream_match & False

        if exact_match.any():
            filtered_gov = eligible_gov[exact_match]
        elif stream_match.any():
            filtered_gov = eligible_gov[stream_match]
        else:
            # stream_match is all-False here, so only 'all'-stream rows remain.
            filtered_gov = eligible_gov[open_stream]

        if filtered_gov.empty:
            # The stream/'all' search has already been done above on the same
            # rows, so repeating it cannot find anything; go straight to the
            # Z-score-only tier.
            print(f"No government programs match your exact criteria (Z-score: {user_z}, District: {user_district}).")
            print("Trying Z-score only...")
            filtered_gov = eligible_gov.copy()
            if filtered_gov.empty:
                print("No programs match even your Z-score. Showing one fallback suggestion:")
            else:
                print("Showing programs that at least match your Z-score:")

        # Remove duplicate university/course combinations (keep the one with lowest Z-score)
        filtered_gov = filtered_gov.sort_values('Z_score').drop_duplicates(
            subset=['Selected_University', 'Course'],
            keep='first'
        )

        if filtered_gov.empty:
            # Final fallback if every tier returned nothing.
            if gov_unis_df.empty:
                print("\nNo government university data is available.")
            else:
                fallback_row = gov_unis_df.iloc[0]
                print("\nFallback recommendation:")
                _print_gov_program(fallback_row)
        else:
            # 4.2 Semantic search on "Stream" + "Course" for better matching.
            # Missing values become "" so every entry handed to the encoder is a string.
            combined_fields = (
                filtered_gov["Stream"].fillna("").astype(str)
                + " "
                + filtered_gov["Course"].fillna("").astype(str)
            )
            gov_field_embeddings = model.encode(combined_fields.tolist(), convert_to_tensor=True)
            gov_results = util.semantic_search(query_embedding, gov_field_embeddings, top_k=len(filtered_gov))[0]

            if not gov_results:
                print(f"\nNo semantic matches found for your skill '{user_skill_input}'.")
                # Show filtered results without semantic matching
                print("\nShowing filtered universities without skill matching:")
                # enumerate() supplies the rank; iterrows() yields the frame's
                # index label, which is not sequential after sorting/filtering.
                for rank, (_, row) in enumerate(filtered_gov.head(5).iterrows(), start=1):
                    _print_gov_program(row, rank=rank)
            else:
                # Show the best-scoring program of up to 5 distinct universities.
                seen_universities = set()
                recommendations_shown = 0
                print(f"\nTop Government University Recommendations for skill '{user_skill_input}' with:")
                # The filter keeps programs whose cut-off is at or below the user's Z-score.
                print(f"Z-Cutoff <= {user_z}, District: {user_district}, Stream: {al_stream}")

                for result in gov_results:
                    if recommendations_shown >= 5:
                        break
                    # corpus_id is a position in combined_fields, i.e. in filtered_gov.
                    row = filtered_gov.iloc[result["corpus_id"]]
                    university = row['Selected_University']
                    if university in seen_universities:
                        continue
                    seen_universities.add(university)
                    recommendations_shown += 1
                    _print_gov_program(row, rank=recommendations_shown, score=result["score"])


Top matched skill from dataset: 'Cybersecurity' (Score: 0.66)
Corresponding Career: 'Security'

----- PRIVATE UNIVERSITY RECOMMENDATIONS -----
Top Private University Recommendations:

Recommendation 1:
  University: Horizon Campus
  Degree: BSc (Hons) Cyber Security (Top-up)
  Relevant Field: Cyber Security, Pen Testing
  Link: https://horizoncampus.edu.lk/bcu_cyber_security_top_up
  Similarity Score: 0.57

Recommendation 2:
  University: NIBM
  Degree: BSc (Hons) in Ethical Hacking and Network Security
  Relevant Field: Cyber Security, Ethical Hacking
  Link: https://www.nibm.lk/course/bsc-hons-ethical-hacking-and-network-security
  Similarity Score: 0.54

Recommendation 3:
  University: SLIIT
  Degree: BSc (Hons) in Computer Science
  Relevant Field: Software Engineering, AI, Cyber Security
  Link: https://www.sliit.lk/computing/programmes/computer-science-degree/
  Similarity Score: 0.53

Recommendation 4:
  University: ICBT Campus
  Degree: BSc (Hons) Network Systems Engineering
 

In [23]:
# ========================== 5. MODEL SAVING ================================

import joblib

# Bundle the encoder, the three datasets and the pre-computed embeddings into
# a single artifact. `compress` has to be an argument of dump(); previously a
# stray "," closed the call early and `compress=3` became a separate
# assignment, so the file was written uncompressed.
# NOTE: the artifact is a pickle -- only load it from a trusted source.
joblib.dump(
    {
        'model': model,
        'career_df': career_df,
        'private_unis': private_unis,
        'gov_unis_df': gov_unis_df,
        'skill_embeddings': skill_embeddings,
        'private_field_embeddings': private_field_embeddings,
    },
    'university_recommender.pkl',
    compress=3,
)

print("Model and data saved successfully!")

Model and data saved successfully!


# Server
