<a href="https://colab.research.google.com/github/vanderbilt-data-science/ai-days-collaboration/blob/main/Process_cvs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install required packages
!pip install PyPDF2
!pip install python-docx
!pip install anthropic
!pip install pandas tqdm

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2
Collecting anthropic
  Downloading anthropic-0.49.0-py3-none-any.whl.metadata (24 kB)
Downloading anthropic-0.49.0-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.4/243.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Succe

In [4]:
# AI Days Collaboration Matcher
# Created for AI Days 2025

import os
import pandas as pd
import PyPDF2
import docx
import re
import json
from google.colab import drive
import anthropic
from tqdm.notebook import tqdm
from google.colab import userdata


# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Check if the Anthropic package is installed, if not install it
try:
    import anthropic
except ImportError:
    !pip install anthropic
    import anthropic

# Step 2: Install required packages
try:
    import PyPDF2
except ImportError:
    !pip install PyPDF2

try:
    import docx
except ImportError:
    !pip install python-docx

# Step 3: Define the paths
BASE_PATH = "/content/drive/My Drive/Data Science/Symposium/AI Days Resumes_Research"
OUTPUT_PATH = "/content/drive/My Drive/Data Science/Symposium/AI Days Output"

# Create output directory if it doesn't exist
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

# Step 4: Setup your Anthropic API key
# Replace with your actual API key or set as an environment variable
ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')
anthropic_client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Step 5: Functions to extract text from different file types
def extract_text_from_pdf(file_path):
    """Extract text from PDF files"""
    text = ""
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"
    return text

def extract_text_from_docx(file_path):
    """Extract text from DOCX files"""
    doc = docx.Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

def extract_text_from_file(file_path):
    """Extract text from various file formats"""
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()

    try:
        if ext == '.pdf':
            return extract_text_from_pdf(file_path)
        elif ext in ['.docx', '.doc']:
            return extract_text_from_docx(file_path)
        elif ext == '.txt':
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        else:
            print(f"Unsupported file format: {ext}")
            return ""
    except Exception as e:
        print(f"Error extracting text from {file_path}: {str(e)}")
        return ""

# Step 6: Function to extract research profile using Claude
# Step 6: Function to extract research profile using Claude
def extract_research_profile(text, file_name):
    """
    Extract research profile using Claude 3.7 Sonnet in structured JSON format

    Returns:
        Dictionary with structured research profile information
    """
    # Create system prompt
    system_prompt = """You are an expert assistant helping to extract structured information from academic resumes and research statements.
    Extract the information according to the specified JSON format, ensuring all fields are included with the correct structure."""

    # Create user prompt with structured JSON format
    # Modified prompt portion for Step 6
    user_prompt = f"""
    Analyze this document and extract key research information in the following JSON format:

    {{
      "basic_info": {{
        "name": "Full name of the researcher",
        "email": "Email address if available, otherwise null",
        "affiliation": "University or organization name",
        "role": "The person's role (faculty, physician-scientist, clinical_faculty, clinician_in_training, postdoc, research_scientist, graduate_student, undergraduate_student, research_staff, industry, community, or other)"
      }},
      "research_profile": {{
        "primary_focus": "A 1-2 sentence description of their main research area",
        "methodologies": ["Method 1", "Method 2", "Method 3"],
        "domains": ["Application domain 1", "Application domain 2"],
        "research_summary": "A 150-200 word paragraph describing their research in detail"
      }},
      "collaboration_potential": {{
        "expertise_offered": ["Specific expertise 1", "Specific expertise 2"],
        "resources_available": ["Dataset", "Tool", "Framework"],
        "complementary_fields": ["Field 1", "Field 2"]
      }},
      "keywords": ["keyword1", "keyword2", "keyword3", "keyword4", "keyword5"]
    }}

    Guidelines:
    1. Extract information explicitly stated in the document when available
    2. Make reasonable inferences for missing fields, marking these as "[inferred]"
    3. Use "null" for fields where no information is available and no inference can be made
    4. Limit lists to 3-5 items, prioritizing the most significant ones
    5. Ensure the research_summary provides a coherent overview of their work
    6. Focus on information most relevant for identifying potential research collaborations
    7. For the "role" field, determine the most appropriate category:
      - faculty: Academic professors and researchers without significant clinical duties
      - physician-scientist: Medical doctors who conduct significant research
      - clinical_faculty: Primarily clinical practitioners with academic appointments
      - clinician_in_training: Residents, fellows, and others in clinical training positions
      - postdoc: Postdoctoral researchers
      - research_scientist: Non-faculty research professionals
      - graduate_student: Master's and PhD students
      - undergraduate_student: Bachelor's degree students
      - research_staff: Research assistants, lab managers, etc.
      - industry: Private sector professionals
      - community: Community organization members
      - other: Roles that don't fit the above categories

    The JSON structure must be strictly followed without additional narrative or explanation outside the JSON object.

    Document:
    {text[:15000]}
    """

    try:
        # Call Anthropic API with Claude 3.7 Sonnet
        response = anthropic_client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=1000,
            system=system_prompt,
            messages=[
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2
        )

        # Extract and parse the JSON response
        response_text = response.content[0].text

        # Find JSON in the response (in case Claude adds any commentary)
        json_match = re.search(r'({.*})', response_text, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)
            profile = json.loads(json_str)

            # Add file information
            profile['file_info'] = {
                'file_name': file_name,
                'processing_time': pd.Timestamp.now().isoformat()
            }

            return profile
        else:
            # If no JSON formatting, return an error structure
            return {
                "basic_info": {
                    "name": "Error parsing",
                    "email": "Error parsing",
                    "affiliation": "Error parsing"
                },
                "research_profile": {
                    "primary_focus": "Error parsing JSON from response",
                    "methodologies": [],
                    "domains": [],
                    "research_summary": response_text
                },
                "collaboration_potential": {
                    "expertise_offered": [],
                    "resources_available": [],
                    "complementary_fields": []
                },
                "keywords": [],
                "file_info": {
                    "file_name": file_name,
                    "processing_time": pd.Timestamp.now().isoformat(),
                    "error": "Failed to parse JSON from response"
                }
            }

    except Exception as e:
        print(f"Error extracting research profile from {file_name}: {str(e)}")
        return {
            "basic_info": {
                "name": "Error",
                "email": "Error",
                "affiliation": "Error"
            },
            "research_profile": {
                "primary_focus": "Error processing",
                "methodologies": [],
                "domains": [],
                "research_summary": f"Error: {str(e)}"
            },
            "collaboration_potential": {
                "expertise_offered": [],
                "resources_available": [],
                "complementary_fields": []
            },
            "keywords": [],
            "file_info": {
                "file_name": file_name,
                "processing_time": pd.Timestamp.now().isoformat(),
                "error": str(e)
            }
        }

# Step 7: Scan directory and get list of files
def scan_directory(directory, limit=None):
    """
    Scan the directory for files

    Args:
        directory: Path to scan
        limit: Optional limit on number of files to return

    Returns:
        List of file paths
    """
    file_paths = []

    # Walk through the directory
    for root, _, files in os.walk(directory):
        for file in files:
            # Only include document files
            _, ext = os.path.splitext(file)
            if ext.lower() in ['.pdf', '.docx', '.doc', '.txt']:
                file_path = os.path.join(root, file)
                file_paths.append(file_path)

    # Limit the number of files if specified
    if limit and len(file_paths) > limit:
        file_paths = file_paths[:limit]

    return file_paths

# Step 8: Main function to process files
def process_files(input_dir, output_dir, limit=5):
    """
    Process files to extract research profiles

    Args:
        input_dir: Directory containing input files
        output_dir: Directory to save output
        limit: Number of files to process (for testing)
    """
    # Get list of files
    file_paths = scan_directory(input_dir, limit)
    print(f"Found {len(file_paths)} files to process")

    # Process each file
    results = []

    for file_path in tqdm(file_paths):
        file_name = os.path.basename(file_path)

        try:
            # Extract text from file
            text = extract_text_from_file(file_path)

            if text:
                # Extract research profile
                profile = extract_research_profile(text, file_name)

                # Add file info
                profile['file_name'] = file_name
                profile['file_path'] = file_path

                # Save individual profile
                output_file = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}_profile.json")
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(profile, f, indent=2)

                results.append(profile)
                print(f"Processed: {file_name}")
            else:
                print(f"No text extracted from: {file_name}")

        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")

    # Save all results to a single file
    all_results_file = os.path.join(output_dir, "all_profiles.json")
    with open(all_results_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    # Create a CSV version for easy viewing
    df = pd.DataFrame(results)
    csv_file = os.path.join(output_dir, "all_profiles.csv")
    df.to_csv(csv_file, index=False)

    print(f"Processed {len(results)} files. Results saved to {all_results_file} and {csv_file}")
    return results

# Step 9: Display the extracted profiles in a nice format
def display_profiles(profiles):
    """Display the extracted profiles in a readable format"""
    for i, profile in enumerate(profiles):
        print(f"\n{'='*80}\nProfile {i+1}: {profile.get('file_info', {}).get('file_name', 'Unknown')}\n{'='*80}")

        # Basic Info
        basic_info = profile.get('basic_info', {})
        print(f"Name: {basic_info.get('name', 'Not extracted')}")
        print(f"Email: {basic_info.get('email', 'Not extracted')}")
        print(f"Affiliation: {basic_info.get('affiliation', 'Not extracted')}")
        print(f"Role: {basic_info.get('role', 'Not extracted')}")

        # Research Profile
        research_profile = profile.get('research_profile', {})
        print(f"\nPrimary Focus: {research_profile.get('primary_focus', 'Not extracted')}")

        print("\nMethodologies:")
        for method in research_profile.get('methodologies', []):
            print(f"- {method}")

        print("\nDomains:")
        for domain in research_profile.get('domains', []):
            print(f"- {domain}")

        print(f"\nResearch Summary:\n{research_profile.get('research_summary', 'Not extracted')}")

        # Collaboration Potential
        collab = profile.get('collaboration_potential', {})
        print("\nExpertise Offered:")
        for expertise in collab.get('expertise_offered', []):
            print(f"- {expertise}")

        print("\nResources Available:")
        for resource in collab.get('resources_available', []):
            print(f"- {resource}")

        print("\nComplementary Fields:")
        for field in collab.get('complementary_fields', []):
            print(f"- {field}")

        # Keywords
        print("\nKeywords:")
        for keyword in profile.get('keywords', []):
            print(f"- {keyword}")

In [10]:
# Run the processing on 5 files
profiles = process_files(BASE_PATH, OUTPUT_PATH, limit=5)

# Display the results
display_profiles(profiles)

Found 5 files to process


  0%|          | 0/5 [00:00<?, ?it/s]

Processed: Xiaotiao Ma_Resume - Xiaotiao Ma (1).pdf
Processed: Dillon Pruett CV 1_29_2025 - Dillon Pruett.pdf
Processed: Xiaotiao Ma_Resume - Xiaotiao Ma.pdf
Processed: 20241204_Curriculum-Vitae-SOM-Format-K. Coate - Katie Coate.pdf
Processed: New Uploaded Resume (1) - Mingyang Jiang.pdf
Processed 5 files. Results saved to /content/drive/My Drive/Data Science/Symposium/AI Days Output/all_profiles.json and /content/drive/My Drive/Data Science/Symposium/AI Days Output/all_profiles.csv

Profile 1: Xiaotiao Ma_Resume - Xiaotiao Ma (1).pdf
Name: Xiaotiao Ma
Email: xiaotiao.s.ma@vanderbilt.edu
Affiliation: Vanderbilt University
Role: student

Primary Focus: [inferred] Economics and political science with applications in business and marketing research.

Methodologies:
- Market research
- Business analysis
- Strategic communication

Domains:
- Marketing
- Economics
- Business development

Research Summary:
[inferred] Xiaotiao Ma is an undergraduate student at Vanderbilt University pursuing a 

In [11]:
# Process 5 files and estimate total runtime and cost
import time
import math

def estimate_full_processing(input_dir, output_dir, sample_size=5):
    """
    Process a sample of files and estimate total runtime and cost

    Args:
        input_dir: Directory containing input files
        output_dir: Directory to save output
        sample_size: Number of files to process for the estimate
    """
    # Get total number of files
    all_files = scan_directory(input_dir)
    total_files = len(all_files)

    if total_files == 0:
        print("No files found in the directory.")
        return

    # Process a sample of files
    sample_files = all_files[:sample_size]
    print(f"Processing {sample_size} sample files out of {total_files} total files...")

    # Track timing and token usage
    start_time = time.time()
    token_counts = []

    # Process each file in the sample
    results = []

    for file_path in tqdm(sample_files):
        file_name = os.path.basename(file_path)

        try:
            # Extract text from file
            text = extract_text_from_file(file_path)

            if text:
                # Estimate token count (rough estimate: 4 chars per token)
                estimated_tokens = len(text) // 4
                token_counts.append(min(estimated_tokens, 15000 // 4))  # Cap at our limit

                # Extract research profile
                profile = extract_research_profile(text, file_name)

                # Add file info
                profile['file_info'] = {
                    'file_name': file_name,
                    'file_path': file_path,
                    'processing_time': pd.Timestamp.now().isoformat()
                }

                # Save individual profile
                output_file = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}_profile.json")
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(profile, f, indent=2)

                results.append(profile)
                print(f"Processed: {file_name}")
            else:
                print(f"No text extracted from: {file_name}")

        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")

    # Calculate timing
    end_time = time.time()
    total_sample_time = end_time - start_time
    avg_time_per_file = total_sample_time / len(sample_files)
    estimated_total_time = avg_time_per_file * total_files

    # Calculate estimated token usage
    avg_tokens_per_file = sum(token_counts) / len(token_counts) if token_counts else 0
    estimated_total_tokens_in = avg_tokens_per_file * total_files

    # Estimate output tokens (typically smaller than input)
    estimated_tokens_out = estimated_total_tokens_in * 0.3  # Rough estimate

    # Calculate cost (Claude 3.7 Sonnet pricing)
    # As of March 2025, using approximations
    input_cost_per_1k = 0.03  # $0.03 per 1K input tokens for Claude 3.7 Sonnet
    output_cost_per_1k = 0.15  # $0.15 per 1K output tokens for Claude 3.7 Sonnet

    estimated_input_cost = (estimated_total_tokens_in / 1000) * input_cost_per_1k
    estimated_output_cost = (estimated_tokens_out / 1000) * output_cost_per_1k
    estimated_total_cost = estimated_input_cost + estimated_output_cost

    # Display results
    print("\n" + "="*80)
    print("PROCESSING ESTIMATE SUMMARY")
    print("="*80)
    print(f"Sample size: {sample_size} files")
    print(f"Total files: {total_files}")
    print(f"\nTiming Information:")
    print(f"  Average processing time per file: {avg_time_per_file:.2f} seconds")
    print(f"  Estimated total processing time: {estimated_total_time:.2f} seconds " +
          f"({estimated_total_time/60:.2f} minutes, {estimated_total_time/3600:.2f} hours)")

    print(f"\nToken Usage Estimate:")
    print(f"  Average input tokens per file: {avg_tokens_per_file:.0f}")
    print(f"  Estimated total input tokens: {estimated_total_tokens_in:.0f}")
    print(f"  Estimated total output tokens: {estimated_tokens_out:.0f}")

    print(f"\nCost Estimate:")
    print(f"  Estimated input cost: ${estimated_input_cost:.2f}")
    print(f"  Estimated output cost: ${estimated_output_cost:.2f}")
    print(f"  Estimated total cost: ${estimated_total_cost:.2f}")

    print("\nNote: These are rough estimates based on the sample. Actual values may vary.")
    print("="*80)

    # Display the results
    if len(results) > 0:
        print("\nSample profiles extracted:")
        display_profiles(results)

    return results

# Run the estimation
profiles = estimate_full_processing(BASE_PATH, OUTPUT_PATH, sample_size=5)

Processing 5 sample files out of 152 total files...


  0%|          | 0/5 [00:00<?, ?it/s]

Processed: Xiaotiao Ma_Resume - Xiaotiao Ma (1).pdf
Processed: Dillon Pruett CV 1_29_2025 - Dillon Pruett.pdf
Processed: Xiaotiao Ma_Resume - Xiaotiao Ma.pdf
Processed: 20241204_Curriculum-Vitae-SOM-Format-K. Coate - Katie Coate.pdf
Processed: New Uploaded Resume (1) - Mingyang Jiang.pdf

PROCESSING ESTIMATE SUMMARY
Sample size: 5 files
Total files: 152

Timing Information:
  Average processing time per file: 8.71 seconds
  Estimated total processing time: 1324.51 seconds (22.08 minutes, 0.37 hours)

Token Usage Estimate:
  Average input tokens per file: 2135
  Estimated total input tokens: 324550
  Estimated total output tokens: 97365

Cost Estimate:
  Estimated input cost: $9.74
  Estimated output cost: $14.60
  Estimated total cost: $24.34

Note: These are rough estimates based on the sample. Actual values may vary.

Sample profiles extracted:

Profile 1: Xiaotiao Ma_Resume - Xiaotiao Ma (1).pdf
Name: Xiaotiao Ma
Email: xiaotiao.s.ma@vanderbilt.edu
Affiliation: Vanderbilt University