In [1]:
!pip install sentence-transformers faiss-cpu --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import glob
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import requests

# ----------- GPT-4.1 API Submit Function -----------
# Connection settings for the chat-completions endpoint.
# The key and base URL are read from the environment when available, so real
# credentials never have to be saved inside the notebook. The original
# placeholders remain as fallbacks and can still be edited by hand.
api_key = os.environ.get("LLM_API_KEY", "<your api key>")
base_url = os.environ.get("LLM_BASE_URL", "<your url>")
model_name = "gpt-4.1"
api_version = "2024-12-01-preview"

def submit(user_message: str, system_message: str = "", messages: list = None,
           temperature=0.7, max_tokens=300, timeout=120):
    """Send one chat-completions request and return the assistant's reply text.

    Args:
        user_message: Content of the user turn. Only used when ``messages`` is None.
        system_message: Optional system prompt, prepended when non-empty.
            Only used when ``messages`` is None.
        messages: Pre-built list of chat message dicts. When given it is sent
            as-is and ``user_message`` / ``system_message`` are ignored.
        temperature: Sampling temperature forwarded in the request payload.
        max_tokens: Completion token limit forwarded in the request payload.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The ``content`` of the first choice on HTTP 200; otherwise a string of
        the form ``"ERROR: <status>: <body>"`` (no exception is raised for
        non-200 responses, so callers should check for that prefix).

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            ``timeout`` is exceeded.
    """
    if messages is None:
        messages = []
        if system_message:
            messages.append({"role": "system", "content": system_message})
        messages.append({"role": "user", "content": user_message})

    # Tolerate a trailing slash in the configured base URL to avoid "//deployments".
    endpoint = base_url.rstrip("/")
    url = f"{endpoint}/deployments/{model_name}/chat/completions?api-version={api_version}"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json",
        "api-key": api_key,
    }
    payload = {
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "top_p": 1,
        "stream": False,
    }
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)

    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    return f"ERROR: {response.status_code}: {response.text}"

In [3]:
def load_documents(base_folder="/content/drive/MyDrive/docs"):
    """Read every ``*.md`` file found one level below ``base_folder``.

    Each immediate sub-directory of ``base_folder`` is treated as one level;
    its name is recorded as the document's ``"level"``. Files that sit
    directly in ``base_folder`` are ignored.

    Args:
        base_folder: Directory whose sub-directories contain the markdown files.

    Returns:
        A ``(doc_texts, doc_meta)`` tuple of parallel lists: the file contents,
        and dicts of the form ``{"level": <sub-directory name>, "path": <file path>}``.
        Documents are ordered by sub-directory name, then by file path.

    Raises:
        OSError: If ``base_folder`` cannot be listed or a file cannot be read.
    """
    doc_texts = []
    doc_meta = []
    for level_folder in sorted(os.listdir(base_folder)):
        level_path = os.path.join(base_folder, level_folder)
        if not os.path.isdir(level_path):
            continue
        # glob.glob returns matches in arbitrary order; sort them so document
        # indices (and everything built on them) are reproducible across runs.
        for md_path in sorted(glob.glob(os.path.join(level_path, "*.md"))):
            with open(md_path, "r", encoding="utf-8") as f:
                doc_texts.append(f.read())
            doc_meta.append({"level": level_folder, "path": md_path})
    return doc_texts, doc_meta

# Modify DOCS_DIR if your documents are not on a Colab-mounted Google Drive.
DOCS_DIR = "/content/drive/MyDrive/docs"
documents, doc_meta = load_documents(DOCS_DIR)
print(f"Loaded {len(documents)} program documents.")

# Fail fast with a clear message: an empty corpus cannot be embedded or indexed
# meaningfully and would otherwise surface later as a confusing downstream error.
if not documents:
    raise ValueError(f"No .md documents found under {DOCS_DIR!r}; check the path.")

# Embed docs (each .md is one unit)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeds = embedding_model.encode(documents, show_progress_bar=True)
# The embeddings are cast to float32 before being handed to the FAISS index.
doc_embeds = np.array(doc_embeds).astype('float32')

Loaded 78 program documents.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Build an L2 index over every document embedding; the index width is the
# second dimension of the embedding matrix.
_, embedding_dim = doc_embeds.shape
index = faiss.IndexFlatL2(embedding_dim)
index.add(doc_embeds)

In [29]:
# Replace these values with the real applicant's details.
# The whole dict is interpolated verbatim into the LLM prompts and the
# retrieval query further down.
student_profile = dict(
    bachelor_school="Hong Kong Baptist University, CS major",
    GPA="3.63/4.0(WES)",
    language_test="IELTS 7.0",
    GRE="325",
    experience="1 research internship, 1 industry internship about LLM.",
    preference="Interested more in PhD opportunities than employment.",
)
print(student_profile)

{'bachelor_school': 'Hong Kong Baptist University, CS major', 'GPA': '3.63/4.0(WES)', 'language_test': 'IELTS 7.0', 'GRE': '325', 'experience': '1 research internship, 1 industry internship about LLM.', 'preference': 'Interested more in PhD opportunities than employment.'}


In [16]:
# Admission tiers and the rough bar for each one.
# - Keys are matched against the sub-folder names recorded as "level" when the
#   documents are loaded, so they must stay identical to those folder names.
# - Insertion order runs from least to most selective; the level parser walks
#   this dict in reverse, so keep the order intact when editing.
# - The descriptions are sent verbatim to the LLM as the level criteria.
level_bar = {
    "C+": "GPA >= 80/100(3.2/4.0), basic language test, accepts most backgrounds (friendly for non-STEM majors).",
    "B-": "GPA >= 3.2/4.0, TOEFL > 95, internships help, moderate selectivity, mostly teaching-focused.",
    "B":  "GPA 3.2-3.4/4.0, TOEFL > 95, mainly depends on bachelor school level and GPA.",
    "B+": "GPA 3.4-3.6, TOEFL > 95, relevant coursework, research experience beneficial, prefer good bachelor school.",
    "A-": "GPA >= 3.6, TOEFL > 100, GRE > 320, normal school + strong projects/some research/high GPA or good bachelor school + moderate GPA.",
    "A":  "GPA >= 3.6, TOEFL > 103, GRE > 325, strong research project or work/internship experience.",
    "A+": "GPA >= 3.8, TOEFL > 105, GRE > 325, strong research and top coursework, some US/EU/Asia top schools.",
    "S":  "GPA >= 3.8, TOEFL > 105, GRE > 328, strong international recognition, excellent research or industry experience, top schools.",
    "SS": "GPA >= 3.8, TOEFL > 105, GRE > 330, top schools, first-authored papers, competitive fellowship awards, mainly for those whose GPA ranks top 5% in the major.",
    "SSS": "This level is the top schools of the top schools, like MIT. Requires almost full score GPA + top schools worldwide + strong research experience + strong recommendation letters"
}

In [30]:
#@title Step 1: Ask LLM for 4 Suitable Admissions Levels

# One "<level>: <criteria>" line per tier, in level_bar order.
levels_prompt = "\n".join([f"{level}: {desc}" for level, desc in level_bar.items()])

# NOTE: adjacent string literals are joined with no separator, so each piece
# must carry its own trailing space (the original produced "levels,started").
system_message = (
    "You are an expert US CS master's admissions advisor. Given a student's profile and the requirements for each admission level (C+, B-, B, ..., SSS), "
    "select the 4 most suitable levels for this student and briefly justify the picks. After all this, output at the very end of the output the 4 levels, "
    "started and ended with {}, punctuated with commas, without any other characters. This makes the output easy to use for automated programs."
)

user_prompt = f"""Student Profile:
{student_profile}

Level Criteria:
{levels_prompt}
"""

# The machine-readable level list comes last in the reply, so a truncated
# completion would silently lose it. Allow more room than submit()'s default.
selected_levels_text = submit(user_prompt, system_message, max_tokens=800)
print("LLM-selected levels and justification:\n\n", selected_levels_text)

LLM-selected levels and justification:

 Let's analyze the student's profile against the level criteria:

**GPA:** 3.63/4.0 (WES)  
**Bachelor School:** Hong Kong Baptist University, CS major  
**Language Test:** IELTS 7.0  
**GRE:** 325  
**Experience:** 1 research internship, 1 industry internship about LLM  
**Preference:** Interested more in PhD opportunities than employment

**Language Test Conversion:** IELTS 7.0 is roughly equivalent to TOEFL 94-95, so it does not meet the TOEFL > 95 or > 100 requirements for higher levels.

**GPA:** Slightly above 3.6, but not in the 3.8+ threshold.

**GRE:** 325, meets A- and A requirements.

**Bachelor School:** Hong Kong Baptist University is a decent Asian university, though not considered top-tier globally.

**Research Experience:** Has one research internship, which is beneficial for higher tiers, especially those interested in PhD.

### Level Suitability

#### C+
- Criteria met. GPA above minimum, language test sufficient, and relevant b

In [31]:
#@title Step 2: Parse Levels (Simple Extraction)


import re

# Parse the (up to four) level codes the LLM listed at the end of its reply
def extract_levels(reply, valid_levels=None, max_levels=4):
    """Extract the level codes listed at the end of an LLM reply.

    The reply is expected to end with a brace-delimited, comma-separated list
    such as ``{B, B+, A-, A}``. Tokens are compared *exactly* against the known
    level codes; plain substring matching would report "A" for "A-", "B" for
    "B+" and "S" for "SS".

    Args:
        reply: Full text returned by the LLM.
        valid_levels: Iterable of known level codes ordered from lowest to
            highest tier. Defaults to the keys of the module-level ``level_bar``.
        max_levels: Maximum number of codes to return.

    Returns:
        The recognised codes ordered from highest to lowest tier, at most
        ``max_levels`` of them. An empty list when nothing could be parsed.
    """
    if valid_levels is None:
        valid_levels = level_bar
    # Highest tier first, i.e. the reverse of the definition order.
    ordered = list(reversed(list(valid_levels)))
    known = set(ordered)
    if not reply or not ordered:
        return []

    picked = set()
    # Preferred path: the last "{...}" group that holds at least one known code.
    for group in reversed(re.findall(r"\{([^{}]*)\}", reply)):
        tokens = (tok.strip(" \t\r\n`'\"*") for tok in group.split(","))
        picked = {tok for tok in tokens if tok in known}
        if picked:
            break

    if not picked:
        # Fallback for replies without braces: look for stand-alone codes on
        # the last non-empty line only, to avoid picking up letters in prose.
        lines = [ln for ln in reply.splitlines() if ln.strip()]
        if lines:
            alternation = "|".join(
                re.escape(code) for code in sorted(ordered, key=len, reverse=True)
            )
            token_re = re.compile(
                rf"(?<![A-Za-z0-9+\-])(?:{alternation})(?![A-Za-z0-9+\-])"
            )
            picked = set(token_re.findall(lines[-1]))

    return [code for code in ordered if code in picked][:max_levels]
candidate_levels = extract_levels(selected_levels_text)
print("Parsed candidate levels:", candidate_levels)

# Stop here with a clear message rather than letting an empty selection reach
# the retrieval step, where the failure would be much harder to interpret.
if not candidate_levels:
    raise ValueError(
        "Could not parse any admission level from the LLM reply; "
        "inspect `selected_levels_text` (it may be an error message or truncated)."
    )

Parsed candidate levels: ['A', 'A-', 'B+', 'B']


In [32]:
#@title Step 3: RAG — Retrieve Top 40 Relevant Programs from Candidate Levels

TOP_K = 40  # maximum number of programs to retrieve for the final ranking step

# Subset docs/embeds to the documents whose level was selected in Step 1/2
candidate_indices = [i for i, meta in enumerate(doc_meta) if meta["level"] in candidate_levels]
if not candidate_indices:
    raise ValueError(f"No documents found for candidate levels {candidate_levels!r}.")
candidate_embeds = doc_embeds[candidate_indices]
candidate_texts = [documents[i] for i in candidate_indices]
candidate_paths = [doc_meta[i]["path"] for i in candidate_indices]

cand_index = faiss.IndexFlatL2(doc_embeds.shape[1])
cand_index.add(candidate_embeds)

# Compose query.
# Adjacent string literals are joined with no separator, so each piece carries
# its own trailing space (the original produced "majors.Which" / "ofgetting").
profile_query = (
    f"Student's background:\n{student_profile}\n"
    "This bachelor student is looking for a master program in computer science or related majors. "
    "Which programs described below best match this profile, in consideration of both the chance of "
    "getting admitted and the suitability of the programs?"
)

q_embed = embedding_model.encode([profile_query]).astype('float32')

# FAISS pads the result with -1 labels when asked for more neighbours than the
# index contains; used as a Python index, -1 would silently return the LAST
# candidate over and over. Cap k at the index size and drop any padding.
top_k = min(TOP_K, len(candidate_indices))
D, I = cand_index.search(q_embed, top_k)
hit_positions = [int(idx) for idx in I[0] if idx >= 0]

retrieved_programs = [candidate_texts[idx] for idx in hit_positions]
retrieved_paths = [candidate_paths[idx] for idx in hit_positions]
print("Retrieved top programs for matching:\n")
for i, path in enumerate(retrieved_paths):
    print(f"{i+1}. {os.path.basename(path)}")

Retrieved top programs for matching:

1. NYU DS.md
2. UMD MSCS.md
3. cmu mits.md
4. utah state university MSCS.md
5. Umich MSCS.md
6. CMU MSESS.md
7. UCI MCS.md
8. tamu cse.md
9. Yale MSCS.md
10. Upenn MCIT.md
11. Cornell MPS-IS.md
12. CMU ECE.md
13. cmu sesv.md
14. NWU MSCS.md
15. Columbia ce.md
16. USC CS37.md
17. UWT MSCS.md
18. CMU MSMITE.md
19. Gatech ECE.md
20. UMass MSCS.md
21. Rics MCS.md
22. Columbia DS.md
23. NEU MSCS.md
24. Upenn EE.md
25. Emory MSCS.md
26. jhu msecs.md
27. Ucsd EC79.md
28. cmu mism.md
29. Columbia ee.md
30. UCSB MSCS.md
31. uci mswe.md
32. duke ece.md
33. UIUC ECE MENG.md
34. cornell tech CM.md
35. UCLA MENG.md
36. uchicago mpcs.md
37. NCSU MCS.md
38. uiuc mcs.md
39. NYU Tandon.md
40. uw ee pmp.md


In [33]:
#@title Step 4: Final Recommendation — Ask LLM to Pick the Best 20 and Explain

# All retrieved program descriptions, separated by blank lines.
program_info = "\n\n".join(retrieved_programs)

# Adjacent string literals are joined with no separator, so each piece carries
# its own trailing space (the original produced "[reason].The" / "Ifless").
system_message = (
    "You are a highly knowledgeable CS admission advisor. Based on the student's profile and the following program descriptions, "
    "select the 20 BEST FIT programs and give a brief reason for each. Respond with a numbered list: [School/Program name]: [reason]. "
    "The program descriptions are in Chinese, but you should answer in English."
)
user_message = (
    f"Student Profile:\n{student_profile}\n\n"
    f"Candidate program descriptions:\n{program_info}\n\n"
    "Choose the 20 most suitable programs and rank them for the student and justify each selection. If "
    "less than 20 programs' information is given, choose all of them and rank. "
)

# A ranked list with reasons is long, so request a larger completion budget
# than submit()'s default.
final_recommendation = submit(user_message, system_message, max_tokens=2000)
print(final_recommendation)

Based on the student's profile:

- **Undergrad:** Hong Kong Baptist University, CS major  
- **GPA:** 3.63/4.0 (WES)  
- **IELTS:** 7.0  
- **GRE:** 325  
- **Experience:** 1 research internship, 1 industry internship (LLM-related)  
- **Preference:** Favors PhD transition opportunities over immediate employment

**Selection Criteria:**
- Prioritizing programs with strong research/PhD pathways, research-friendly environments, or known for successful MS-to-PhD transitions.
- Considering fit with student’s GPA, background, and realistic admissions chances.
- Deprioritizing programs that are strictly employment/industry-oriented, very low research exposure, or have high barriers for international/“non-elite” undergrad backgrounds.

---

## 1. **UMD MSCS**:  
**Reason:** Highly research-oriented, shares application pool with PhD, strong AI/NLP/Vision research ranking, excellent for those seeking PhD transition; values research experience over pure GPA.  
---

## 2. **CMU ECE**:  
**Reason: