# Homework 2

# Set up

## Installing packages

In [1]:
!pip install requests PyPDF2 gdown
!pip install 'markitdown[pdf]'
!pip install langchain_mcp_adapters langchain_google_genai langchain-openai



## Setup your API key

To run the following cell, your API key must be stored it in a Colab Secret named `VERTEX_API_KEY`.


1.   Look for the key icon on the left panel of your colab.
2.   Under `Name`, create `VERTEX_API_KEY`.
3. Copy your key to `Value`.

If you cannot use VERTEX_API_KEY, you can use deepseek models via `DEEPSEEK_API_KEY`. It does not affect your score.



In [2]:
from google.colab import userdata
GEMINI_VERTEX_API_KEY = userdata.get('VERTEX_API_KEY')
# DEEPSEEK_API_KEY = userdata.get('DEEPSEEK_API_KEY')

# Download sample CVs

## Downloading sample_cv.pdf
The codes below download the sample CV


In [3]:
import os
import gdown

folder_id = "1adYKq7gSSczFP3iikfA8Er-HSZP6VM7D"
folder_url = f"https://drive.google.com/drive/folders/{folder_id}"

output_dir = "downloaded_cvs"
os.makedirs(output_dir, exist_ok=True)

gdown.download_folder(
    url=folder_url,
    output=output_dir,
    quiet=False,
    use_cookies=False
)

Retrieving folder contents


Processing file 1NR1RUKx4GyM7QOBxKXkfh4e8jUkxFCsp CV_1.pdf
Processing file 16lrd-uO8AAnCnv7UG9Rs_Nk6SUu0Iwbs CV_2.pdf
Processing file 15hVEuBan_EKhEty2aZBd6rcpDpP4o7Vr CV_3.pdf
Processing file 1Y2w_mAUEhg4vZBdvvR-0n3Jf2mKuGDRk CV_4.pdf
Processing file 1PLwkva-pdua6ZVvmLg9mxHeljq9D8C_C CV_5.pdf


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1NR1RUKx4GyM7QOBxKXkfh4e8jUkxFCsp
To: /content/downloaded_cvs/CV_1.pdf
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147k/147k [00:00<00:00, 38.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=16lrd-uO8AAnCnv7UG9Rs_Nk6SUu0Iwbs
To: /content/downloaded_cvs/CV_2.pdf
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 75.1k/75.1k [00:00<00:00, 63.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=15hVEuBan_EKhEty2aZBd6rcpDpP4o7Vr
To: /content/downloaded_cvs/CV_3.pdf
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 72.0k/72.0k [00:00<00:00, 11.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Y2w_mAUEhg4vZBdvvR-0n3Jf2mKuGDRk
To: /content/downloaded_cvs/CV_4.pdf
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 73.3k/73.3k [00:00<00:00, 20.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1PLwkva-pdua6ZVvmLg9mxHeljq9D8C_C
To: /content/downloaded_cvs

['downloaded_cvs/CV_1.pdf',
 'downloaded_cvs/CV_2.pdf',
 'downloaded_cvs/CV_3.pdf',
 'downloaded_cvs/CV_4.pdf',
 'downloaded_cvs/CV_5.pdf']

In [4]:
# =====================================================
#  Load and display all CV PDFs in order
# =====================================================
import os
from markitdown import MarkItDown

cv_dir = "downloaded_cvs"

# Initialize MarkItDown
md = MarkItDown(enable_plugins=False)

# Collect and sort PDFs numerically
pdf_files = sorted(
    [f for f in os.listdir(cv_dir) if f.lower().endswith(".pdf")],
    key=lambda x: int("".join(filter(str.isdigit, x)))  # CV_1.pdf ‚Üí 1
)

all_cvs = []

for pdf_name in pdf_files:
    pdf_path = os.path.join(cv_dir, pdf_name)
    result = md.convert(pdf_path)

    all_cvs.append({
        "file": pdf_name,
        "text": result.text_content
    })

    print("=" * 80)
    print(f"üìÑ {pdf_name}")
    print("=" * 80)
    print(result.text_content)
    print("\n\n")


üìÑ CV_1.pdf
|     |     |     |     | John         |           | Smith        |                   |     |     |
| --- | --- | --- | --- | ------------ | --------- | ------------ | ----------------- | --- | --- |
|     |     |     |     | Marketing    |           | Professional |                   |     |     |
|     |     |     |     | + Singapore, | Singapore |              | (cid:209) Kowloon |     |     |
Experience
|                |                  |     |          |                     |              |            |     | 2020 ‚Äì | Present |
| -------------- | ---------------- | --- | -------- | ------------------- | ------------ | ---------- | --- | ------ | ------- |
| Engineer,      | ByteDance        |     |          |                     |              |            |     |        |         |
| ‚Ä¢ Worked       | in a fast-paced, |     | global   | technology          | environment. |            |     |        |         |
| ‚Ä¢ Collaborated | across           |     | teams

# Connect to our MCP server

Documentation about MCP: https://modelcontextprotocol.io/docs/getting-started/intro.

Using MCP servers in Langchain https://docs.langchain.com/oss/python/langchain/mcp.

## Check which tools that the MCP server provide

In [5]:
import asyncio
import json
from langchain_mcp_adapters.client import MultiServerMCPClient

client = MultiServerMCPClient({
    "social_graph": {
        "transport": "http",
        "url": "https://ftec5660.ngrok.app/mcp",
        "headers": {"ngrok-skip-browser-warning": "true"}
    }
})

mcp_tools = await client.get_tools()
for tool in mcp_tools:
    print(tool.name)
    print(tool.description)
    print(tool.args)
    print("\n\n------------------------------------------------------\n\n")

search_facebook_users
Search for Facebook users by display name (supports partial and fuzzy matching).

Args:
    q: Search query string (case-insensitive, matches any part of display name)
       Examples: "John", "john smith", "Smith"
    limit: Maximum number of results to return (default: 20, max: 20)
    fuzzy: Enable fuzzy matching if exact search returns no results (default: True)

Returns:
    List of user dictionaries, each containing:
    - id (int): Unique Facebook user ID for use with get_facebook_profile()
    - display_name (str): User's Facebook display name (may differ from legal name)
    - city (str): Current city of residence
    - country (str): Country of residence
    - match_type (str): "exact" or "fuzzy" (indicates search method used)
    
    Returns empty list [] if no matches found.

Example:
    search_facebook_users("Alex Chan", limit=5)
    ‚Üí [{"id": 123, "display_name": "Alex Chan", "city": "Hong Kong", "country": "Hong Kong", "match_type": "exact"}]
  

## A simple agent using tools from the MCP server


In [8]:
import os
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_mcp_adapters.client import MultiServerMCPClient

# ---------------------------
# 1. Define a local tool
# ---------------------------
@tool
def say_hello(name: str) -> str:
    """Say hello to a person by name."""
    return f"Hello, {name}! üëã"

# ---------------------------
# 2. Load MCP tools + merge
# ---------------------------
client = MultiServerMCPClient({
    "social_graph": {
        "transport": "http",
        "url": "https://ftec5660.ngrok.app/mcp",
        "headers": {"ngrok-skip-browser-warning": "true"}
    }
})

mcp_tools = await client.get_tools()
tools = mcp_tools + [say_hello]

# ---------------------------
# 3. Initialize Gemini (tool-enabled) or deepseek
# ---------------------------
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro", # Changed from "gemini-2.0-flash"
    google_api_key=GEMINI_VERTEX_API_KEY,
    temperature=0,
)

# from langchain_openai import ChatOpenAI
# DEEPSEEK_API_KEY = userdata.get("DEEPSEEK_API_KEY")
# llm = ChatOpenAI(
#     model="deepseek-chat",          # or "deepseek-reasoner"
#     api_key=DEEPSEEK_API_KEY,
#     base_url="https://api.deepseek.com/v1",
#     temperature=0,
# )

llm_with_tools = llm.bind_tools(tools)

# ---------------------------
# 4. Single-step invocation
# ---------------------------
query = "Say hello to Bao using tool, then search for someone named Alice on Facebook."

response = llm_with_tools.invoke([
    HumanMessage(content=query)
])

print(response)


content='' additional_kwargs={'function_call': {'name': 'search_facebook_users', 'arguments': '{"q": "Alice"}'}, '__gemini_function_call_thought_signatures__': {'7fb50ef0-5c05-45a1-976e-3b20a89663b8': 'Cs8DAb4+9vv4aKsy+vGqHCM8z7y2JX2ckk3qem32tHVaqRCaTJeVZJD3hFB/pdAEXw2uiWmjXuLgfBdVJg07DmasFlHgZSHxfzDMtBwj8EKRVBduXYQtVVoO8shSMA40QFSmYacu26wa8zq//TDjO8fi5wPDvmh3Lp31WCIUZdceDWWYYFa8jhkDcwghmZM3P9/UGtF0kOvEkpt09U6A88HxTQEtTRC4LvAiMNHjb9LS8W6Cz9swjx5CNkWC+Zc0rZKNT/nrgohxn6Az8pqGh8C9ImZvLUR0cOcLFjwfG08bXlmyelSJQEvO304evAk2G4iBUDd0wRdrY+mjSWqEJJTjRLyYU5CmT0uh7mc8/lQRyo1uoC8NPdpEtr5kaxEwZY/emjGp+gzcLbeo+65ZgbZ9GPcKztVpD5FGTTLHFsfFbiboXtbskyzpEYigpFLdZGC1TiisCaquJ2KYf3w8MXS++/5XsIhBYbjigU2z+2eY0CNWxi4txnJFWaubdyPa5Xiw/TzZV4a3z/7mvsRhSmzk12bQnuJcVysWbnUkFAM2cQEcGRwTGp2MUcoUBggID+l2TLpq8f/Q/vqx/XKazOq9eccJn4BVr36LBiXqD8sFvQ=='}} response_metadata={'finish_reason': 'STOP', 'model_name': 'gemini-2.5-pro', 'safety_ratings': [], 'model_provider': 'google_genai'} id='lc_run--019c9f86-4eb4-7440-b2d4-20

In [9]:
# This block provides you some tests to get faminilar with our MCP server

# # Test 1: Search Facebook users (exact match)
# await tools[0].ainvoke({'q': "Alex Chan", 'limit': 5})

# # Test 2: Search Facebook users (fuzzy match with typo)
# await tools[0].ainvoke({'q': "Alx Chn", 'limit': 5, 'fuzzy': True})

# # Test 3: Get Facebook profile
# await tools[1].ainvoke({'user_id': 123})

# # Test 4: Get Facebook mutual friends
# await tools[2].ainvoke({'user_id_1': 123, 'user_id_2': 456})

# # Test 5: Search LinkedIn people (exact match)
# await tools[3].ainvoke({'q': "Python", 'location': "Hong Kong", 'limit': 5})

# # Test 6: Search LinkedIn people (fuzzy match with typo)
# await tools[3].ainvoke({'q': "Python", 'location': "Hong Kong", 'limit': 5, 'fuzzy': True})

# # Test 7: Get LinkedIn profile
# await tools[4].ainvoke({'person_id': 456})

# Test 8: Get LinkedIn interactions
await tools[5].ainvoke({'person_id': 456})

[{'type': 'text',
  'text': '{"profile_id":456,"post_count":4,"total_likes":5,"liked_by":[4390,3622,7500,4269,8464],"engagement_score":1.25}',
  'id': 'lc_6c6e70db-8e62-4e8b-b61d-515c2fc3893e'}]

# Evaluation code

In the test phase, you will be given 5 CV files with fixed names:

    CV_1.pdf, CV_2.pdf, CV_3.pdf, CV_4.pdf, CV_5.pdf

Your system must process these CVs and output a list of 5 scores,
one score per CV, in the same order:

    scores = [s1, s2, s3, s4, s5]

Each score must be a float in the range [0, 1], representing the
reliability or confidence that the CV is valid (or meets the task criteria).

The ground-truth labels are binary:

    groundtruth = [0 or 1, ..., 0 or 1]

Each CV is evaluated independently using a threshold of 0.5:

- If score > 0.5 and groundtruth == 1 ‚Üí Full credit
- If score ‚â§ 0.5 and groundtruth == 0 ‚Üí Full credit
- Otherwise ‚Üí No credit

In other words, 0.5 is the decision threshold.

- Each CV contributes equally.
- Final score = (number of correct decisions) / 5


In [10]:
# =====================================================
#  Evaluation code
# =====================================================

def evaluate(scores, groundtruth, threshold=0.5):
    """
    scores: list of floats in [0, 1], length = 5
    groundtruth: list of ints (0 or 1), length = 5
    """
    assert len(scores) == 5
    assert len(groundtruth) == 5

    correct = 0
    decisions = []

    for s, gt in zip(scores, groundtruth):
        pred = 1 if s > threshold else 0
        decisions.append(pred)
        if pred == gt:
            correct += 1

    final_score = correct / len(scores)

    return {
        "decisions": decisions,
        "correct": correct,
        "total": len(scores),
        "final_score": final_score
    }


In [14]:
# CV Verification Agent
import json
import re
from langgraph.prebuilt import create_react_agent
from langchain_core.messages import HumanMessage, SystemMessage

# -------------------------------------------------------
# 1. ÈáçÊñ∞Ëé∑Âèñ MCP toolsÔºàÁ°Æ‰øùËøûÊé•ÊúÄÊñ∞Ôºâ
# -------------------------------------------------------
client = MultiServerMCPClient({
    "social_graph": {
        "transport": "http",
        "url": "https://ftec5660.ngrok.app/mcp",
        "headers": {"ngrok-skip-browser-warning": "true"}
    }
})
mcp_tools = await client.get_tools()

# -------------------------------------------------------
# 2. ÂàùÂßãÂåñ LLM
# -------------------------------------------------------
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro",
    google_api_key=GEMINI_VERTEX_API_KEY,
    temperature=0,
)

# -------------------------------------------------------
# 3. ÊûÑÂª∫ ReAct Agent
# -------------------------------------------------------
agent = create_react_agent(llm, mcp_tools)

# -------------------------------------------------------
# 4. System promptÔºöÊåáÁ§∫ Agent È™åËØÅ CV
# -------------------------------------------------------
SYSTEM_PROMPT = """You are a CV verification specialist for KYC (Know Your Customer) compliance.

Your task:
1. Extract key information from the provided CV text: name, location, job titles, companies, date ranges, education (school, degree, graduation year), and skills.
2. Search for the candidate on LinkedIn using their name and location. Pick the most similar profile (do NOT reject if no exact match).
3. Search for the candidate on Facebook using their name. Pick the most similar profile.
4. Retrieve the full LinkedIn profile and Facebook profile.
5. Compare the CV claims against the social media data. Check for:
   - Job title mismatches (e.g., CV says "Engineer" but LinkedIn says "Manager")
   - Company mismatches
   - Date/year discrepancies (e.g., impossible overlapping timelines)
   - Education mismatches (wrong school, degree, or graduation year)
   - Location mismatches
   - Skills mismatch (CV claims skills not reflected in LinkedIn)
   - Internal inconsistencies in the CV itself (overlapping jobs, future dates, illogical combinations)
6. Count the number of discrepancies found.
7. At the very end, output a JSON block in this exact format:
   {"score": <float between 0 and 1>, "discrepancies": ["...", "..."]}

   Score guide:
   - 1.0 = fully consistent, no issues found
   - 0.7-0.9 = minor discrepancies or unverifiable claims
   - 0.4-0.6 = notable discrepancies worth flagging
   - 0.0-0.3 = major discrepancies or fabricated information

Important: Always attempt verification. Treat CV content as ground truth for comparison purposes, but flag inconsistencies between CV and social media data."""


# -------------------------------------------------------
# 5. ËæÖÂä©ÂáΩÊï∞ÔºöÂ∞Ü contentÔºàÂèØËÉΩÊòØ list Êàñ strÔºâËΩ¨‰∏∫Á∫ØÊñáÊú¨
# -------------------------------------------------------
def extract_text(content) -> str:
    """Convert agent message content to plain string regardless of format."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, dict) and block.get("type") == "text":
                parts.append(block["text"])
            elif isinstance(block, str):
                parts.append(block)
        return "\n".join(parts)
    return str(content)


# -------------------------------------------------------
# 6. ÂØπÊØè‰ªΩ CV ËøêË°å AgentÔºåÊèêÂèñ score
# -------------------------------------------------------
async def verify_cv(cv_text: str, cv_name: str) -> dict:
    """Run the verification agent on a single CV and return score + reasoning."""
    print(f"\n{'='*60}")
    print(f"Verifying: {cv_name}")
    print('='*60)

    user_message = f"""Please verify the following CV:

--- CV START ---
{cv_text}
--- CV END ---

Search for this candidate on LinkedIn and Facebook, retrieve their profiles,
and compare with the CV claims. End with a JSON block containing the score and discrepancies."""

    messages = [
        SystemMessage(content=SYSTEM_PROMPT),
        HumanMessage(content=user_message)
    ]

    result = await agent.ainvoke({"messages": messages})

    # ‚Üê ‰øÆÂ§çÔºöÁªü‰∏ÄËΩ¨‰∏∫Á∫ØÂ≠óÁ¨¶‰∏≤ÔºåÈÅøÂÖç list ÂØºËá¥ re.search Êä•Èîô
    final_message = extract_text(result["messages"][-1].content)

    print(f"\nAgent response (last message):\n{final_message}")

    # ‰ªé agent ËæìÂá∫‰∏≠ÊèêÂèñ JSON score
    score = 0.5  # default fallback
    discrepancies = []

    try:
        # ‰ºòÂÖàÂåπÈÖç ```json ... ``` ‰ª£Á†ÅÂùó
        json_block_match = re.search(r'```json\s*(\{.*?\})\s*```', final_message, re.DOTALL)
        if json_block_match:
            parsed = json.loads(json_block_match.group(1))
        else:
            # Â§áÁî®ÔºöÁõ¥Êé•ÊêúÁ¥¢ {"score": ...} ÁªìÊûÑ
            json_match = re.search(r'\{\s*"score"\s*:.*?\}', final_message, re.DOTALL)
            parsed = json.loads(json_match.group()) if json_match else {}

        score = float(parsed.get("score", 0.5))
        discrepancies = parsed.get("discrepancies", [])

    except Exception as e:
        print(f"Warning: Could not parse JSON score from agent output: {e}")
        # ÊúÄÁªàÂ§áÁî®ÔºöÁõ¥Êé•Áî®Ê≠£ÂàôÊèêÂèñÊï∞Â≠ó
        score_match = re.search(r'"score"\s*:\s*([\d.]+)', final_message)
        if score_match:
            score = float(score_match.group(1))

    print(f"\n‚Üí Final score for {cv_name}: {score}")
    print(f"‚Üí Discrepancies: {discrepancies}")

    return {
        "cv": cv_name,
        "score": score,
        "discrepancies": discrepancies,
        "full_response": final_message
    }


# -------------------------------------------------------
# 7. ËøêË°åÊâÄÊúâ 5 ‰ªΩ CV
# -------------------------------------------------------
verification_results = []

for cv_data in all_cvs:
    result = await verify_cv(cv_data["text"], cv_data["file"])
    verification_results.append(result)

print("\n\n" + "="*60)
print("VERIFICATION COMPLETE")
print("="*60)
for r in verification_results:
    print(f"{r['cv']}: score={r['score']:.2f} | issues={len(r['discrepancies'])}")

/tmp/ipython-input-3933/1048926920.py:31: LangGraphDeprecatedSinceV10: create_react_agent has been moved to `langchain.agents`. Please update your import to `from langchain.agents import create_agent`. Deprecated in LangGraph V1.0 to be removed in V2.0.
  agent = create_react_agent(llm, mcp_tools)



Verifying: CV_1.pdf

Agent response (last message):
```json
{
  "score": 0.3,
  "discrepancies": [
    "Company Mismatch: The CV claims the current employer is ByteDance, but the Facebook profile lists the current company as Traveloka.",
    "Job Title Mismatch: The CV claims the current job title is 'Engineer', but the Facebook profile lists the current job as 'Scientist'.",
    "Employment Status Mismatch: The CV states the employment at ByteDance is '2020 ‚Äì Present', but the LinkedIn profile indicates this job is not current and the candidate's overall status is 'student'.",
    "Internal Inconsistency: Both the CV and LinkedIn show a major conflict between the claimed job title ('Engineer') and the candidate's education ('BSc in Marketing') and skills ('Content Creation', 'SEO', 'Social Media')."
  ]
}
```

‚Üí Final score for CV_1.pdf: 0.3
‚Üí Discrepancies: ['Company Mismatch: The CV claims the current employer is ByteDance, but the Facebook profile lists the current company a

In [15]:
# ‰ªé verification_results Êåâ CV Êñá‰ª∂È°∫Â∫èÊèêÂèñ score
scores = [r["score"] for r in sorted(verification_results, key=lambda x: x["cv"])]
print(f"Verification scores: {scores}")

groundtruth = [1, 1, 1, 0, 0]  # Do not modify

result = evaluate(scores, groundtruth)
print(result)

Verification scores: [0.3, 0.1, 0.4, 0.1, 0.1]
{'decisions': [0, 0, 0, 0, 0], 'correct': 2, 'total': 5, 'final_score': 0.4}
