[Course2Job System Architecture](https://s7w4qh1bo4.feishu.cn/docx/PsF8d9FJTogTVDxsTK0c1NnAnse)

### PDF2TXT

Information to Extract:

+ Course Description: pages 1–2 (P1, P2)
+ Table

Common Expression

+ catalogue_desc_pattern:
    Catalogue Description:
    Catalog Entry
    Catalogue Description
    Catalog Description
    Catalog Course Description
    Catalogue Course Description
    Brief Course Description
    Catalog Description:


+ expanded_desc_pattern:
    Course Description:
    Course Description
    CSCI 104 Syllabus
    Overview
    Syllabus:
    Description
    Introduction and Purposes
    Concepts
    Course content
    Course Outline
    ​Introduction and Purposes
    Course T​opics and Readings
    Detailed Description
    Topics
    Course Scope and Purpose
    Course Outline:
    Detailed Course Syllabus:
    Expanded Course Description:
    Course Summary
    Expanded Course Description

+ learning_obj_pattern:
    Learning Objectives
    Course Objectives:
    Objective
    Course Objectives
    Course Objective:
    Learning Objectives and Outcomes
    Outcomes expected upon the completion of the course:
    Course Objective
    Learning Objectives:

In [2]:
import pdfplumber
import json
import re
import os
from tqdm import tqdm

In [3]:
# Function to extract text and course descriptions from the first 'num_pages' pages of the PDF
def extract_text_from_pdf(pdf_path, num_pages=2):
    """Extract the course description and learning objectives from a syllabus PDF.

    The text of the first ``num_pages`` pages is concatenated and searched
    (case-insensitively, across line breaks) for three labelled sections:
    the catalogue description, the expanded description and the learning
    objectives.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        num_pages: Maximum number of leading pages to read.

    Returns:
        A ``(combined_description, learning_obj_desc)`` tuple of strings.
        ``combined_description`` holds the catalogue and expanded
        descriptions under fixed headings; the catalogue part falls back to
        ``"."`` and the other sections to ``""`` when no label is found.
    """
    page_texts = []

    # Open the PDF file and extract text from the specified number of pages
    with pdfplumber.open(pdf_path) as pdf:
        for i in range(min(num_pages, len(pdf.pages))):
            # Guard against extract_text() yielding None for a page without
            # extractable text, which would make the concatenation fail.
            page_texts.append((pdf.pages[i].extract_text() or "") + "\n")
    extracted_text = "".join(page_texts)

    # Section labels vary between syllabi, so each pattern is
    # "(label)(body)(?=next label or end of text)". Within one alternation
    # the longer label must come before any label that is its prefix,
    # otherwise the shorter one wins and the rest leaks into the body.
    catalogue_desc_pattern = (
        r"(Cata?logue\s?Course\s?Description|Catalogue\s?Description"
        r"|Catalog\s?Course\s?Description|Catalog\s?Description|Catalog\s?Entry"
        r"|Course\s?Catalog\s?Description|Brief\s?Course\s?Description)"
        r"(.*?)"
        r"(?=E?xpanded\s?Course\s?Description|Course\s?Description|$)"
    )
    expanded_desc_pattern = (
        r"(Expanded\s?Course\s?Description|Course\s?Description"
        r"|CSCI\s?\d{3}\s?Syllabus|Overview|Syllabus:|Description"
        r"|Introduction\s?and\s?Purposes|Concepts|Course\s?Content"
        r"|Course\s?Outline|Detailed\s?Course\s?Syllabus|Course\s?Summary"
        r"|Course\s?Topics\s?and\s?Readings)"
        r"(.*?)"
        r"(?=Learning\s?Objectives|$)"
    )
    learning_obj_pattern = (
        r"(Learning\s?Objectives\s?and\s?Outcomes|Learning\s?Objectives"
        r"|Course\s?Objectives|Objectives?"
        r"|Outcomes\s?expected\s?upon\s?the\s?completion\s?of\s?the\s?course)"
        r"(.*?)"
        r"(?=Course\s?Content|$)"
    )
    flags = re.DOTALL | re.IGNORECASE

    # Extract Catalogue Course Description
    catalogue_desc_match = re.search(catalogue_desc_pattern, extracted_text, flags)
    catalogue_desc = catalogue_desc_match.group(2).strip() if catalogue_desc_match else "."

    # Extract Expanded Course Description
    expanded_desc_match = re.search(expanded_desc_pattern, extracted_text, flags)
    expanded_desc = expanded_desc_match.group(2).strip() if expanded_desc_match else ""

    # Extract Learning Objectives
    learning_obj_match = re.search(learning_obj_pattern, extracted_text, flags)
    learning_obj_desc = learning_obj_match.group(2).strip() if learning_obj_match else ""

    # Remove the overlap between the two descriptions by checking for common content
    def remove_overlap(catalogue_desc, expanded_desc):
        """Return ``expanded_desc`` without the prefix it shares with ``catalogue_desc``."""
        common_prefix_length = 0
        for cat_char, exp_char in zip(catalogue_desc, expanded_desc):
            if cat_char != exp_char:
                break
            common_prefix_length += 1
        return expanded_desc[common_prefix_length:].strip()

    # Both patterns can start from the same heading, so drop the repeated part
    deduplicated_expanded_desc = remove_overlap(catalogue_desc, expanded_desc)

    # Combine the descriptions into a single text description
    combined_description = f"Catalogue Course Description:\n{catalogue_desc}\n\nExpanded Course Description:\n{deduplicated_expanded_desc}"

    return combined_description, learning_obj_desc


# Function to extract all tables from the PDF
def extract_all_tables_from_pdf(pdf_path):
    """Collect every table pdfplumber detects in the PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of ``{"page": <1-based page number>, "table": <rows>}`` dicts,
        in page order. A page with several tables contributes one entry per
        table; pages without tables contribute none.
    """
    all_tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # extract_tables() returns all tables on the page, whereas
            # extract_table() only returns a single one, silently dropping
            # the rest.
            for table in page.extract_tables():
                if table:  # skip empty detections
                    all_tables.append({
                        "page": page_num,
                        "table": table
                    })

    return all_tables

# Main function to extract both text and tables, then combine them into a JSON format
def process_pdf_to_json(pdf_path, course_code):
    """Build the JSON-ready record for one syllabus PDF.

    Args:
        pdf_path: Path of the PDF file to process.
        course_code: Key under which the extracted fields are stored.

    Returns:
        ``{course_code: {"Course Description": ..., "Learning Objective": ...,
        "Table Description": ...}}`` where every value is a stripped string.
    """
    # Text sections come from the first two pages, tables from the whole file.
    description, objectives = extract_text_from_pdf(pdf_path, num_pages=2)
    page_tables = extract_all_tables_from_pdf(pdf_path)

    # Flatten each table to "cell | cell" lines, closing it with a ruler line.
    chunks = []
    for entry in page_tables:
        chunks.append("\n")
        for cells in entry['table']:
            rendered = ["" if cell is None else str(cell) for cell in cells]
            chunks.append(" | ".join(rendered) + "\n")
        chunks.append("=" * 50 + "\n")

    return {
        course_code: {
            "Course Description": description.strip(),
            "Learning Objective": objectives.strip(),
            "Table Description": "".join(chunks).strip(),
        }
    }

In [12]:
# Collect the syllabus PDFs; the course code is the file name without ".pdf".
folder_path = '../syllabus/'
files = os.listdir(folder_path)
course_file_list = [
    f for f in files
    if os.path.isfile(os.path.join(folder_path, f)) and f.lower().endswith('.pdf')
]

output_json_path = "raw_extraction.json"
results = []
for course_file in course_file_list:
    # Process the PDF and generate the JSON result
    pdf_path = os.path.join(folder_path, course_file)
    course_code = os.path.splitext(course_file)[0]
    try:
        result = process_pdf_to_json(pdf_path, course_code)
    except Exception as exc:
        # Best effort: keep going, but report which file failed and why
        # instead of dropping it silently.
        print(f"{course_code} skipped: {exc!r}")
        continue
    results.append(result)
    print(f"{course_code} JSON result has been saved.")

# Save the result as a JSON file. Write mode, not append: appending a second
# array to an existing file would leave it unparseable as JSON.
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)


CSCI-430 JSON result has been saved.
CSCI-356 JSON result has been saved.
CSCI-587 JSON result has been saved.
CSCI-550 JSON result has been saved.
CSCI-544 JSON result has been saved.
CSCI-585 JSON result has been saved.
CSCI-426 JSON result has been saved.
DSCI-454 JSON result has been saved.
CSCI-625 JSON result has been saved.
CSCI-580 JSON result has been saved.
CSCI-350 JSON result has been saved.
DSCI-525 JSON result has been saved.
DSCI-531 JSON result has been saved.
CSCI-568 JSON result has been saved.
CSCI-420 JSON result has been saved.
CSCI-353 JSON result has been saved.
CSCI-526 JSON result has been saved.
CSCI-532 JSON result has been saved.
CSCI-644 JSON result has been saved.
DSCI-352 JSON result has been saved.
CSCI-594B JSON result has been saved.
DSCI-554 JSON result has been saved.
CSCI-577A JSON result has been saved.
CSCI-531 JSON result has been saved.
DSCI-556 JSON result has been saved.
DSCI-351 JSON result has been saved.
CSCI-534 JSON result has been saved.

### NER

In [3]:
import spacy
import json

In [None]:
# Load the cleaned extraction: a list of dicts, one per course.
with open("clean_extraction.json", "r", encoding="utf-8") as file:
    contents = json.load(file)

ner_extraction = {}
nlp = spacy.load('en_core_web_sm')

for content in contents:
    # Only the first entry of each dict is used: its key is the course
    # number and its value a dict of text fields (presumably the
    # {course_code: {...}} records built earlier -- confirm).
    course_num, fields = list(content.items())[0]
    course_des = " ".join(fields.values())

    doc = nlp(course_des)
    # NOTE(review): doc.ents holds spaCy Span objects, not strings; convert
    # to e.g. (ent.text, ent.label_) before serialising this dict.
    ner_extraction[course_num] = doc.ents

### LLM

In [20]:
import json

In [None]:
# Read the "<course code>: <course title>" lines, one course per line.
with open("/Users/xintongjiang/Desktop/USC/Courses/DSCI558/project/scraper/course_num.txt", "r", encoding="utf-8") as f:
    contents = f.read().split('\n')

# code -> title. Lines without a ": " separator (e.g. the empty string left
# by a trailing newline) are skipped instead of raising IndexError.
contents = {
    parts[0]: parts[1]
    for parts in (line.split(': ') for line in contents)
    if len(parts) > 1
}
# title -> code
contents_rev = {v: k for k, v in contents.items()}

In [31]:
# Re-key `data` through the `contents` lookup (presumably course code ->
# course title; `data` is defined outside this cell -- confirm its keys).
llm_extraction = {
    contents[course_key]: extracted
    for course_key, extracted in data.items()
}

In [33]:
gpt_extraction = {
    'Introduction to Computer and Network Security': [
        'security', 'networking', 'computing', 'data', 'operating systems', 
        'threats', 'defenses', 'cybersecurity', 'cryptography', 'hashes', 
        'encryption', 'key management', 'authentication', 'access control', 
        'intrusions', 'scanning', 'DDoS', 'DNS security', 'privacy', 
        'communications', 'routing'
    ],
    'Introduction to Computer Systems': [
        'assembly language', 'bit strings', 'cache performance', 'computer architecture', 
        'computer networks', 'computer systems', 'compilation', 'debuggers', 'DRAM', 
        'floating point representation', 'heap management', 'instruction execution', 
        'instruction set', 'machine code', 'memory hierarchy', 'operating systems', 
        'out-of-order execution', 'processor architectures', 'reverse engineering', 
        'system software', 'virtual memory', 'x86 instruction set'
    ],
    'Geospatial Information Management': [
        'spatial thinking', 'geospatial intelligence', 'human security', 'written communication', 
        'citizen science', 'mobility reduction', 'location data', 'smart cities', 
        'urban wetland mapping', 'satellite imagery', 'airborne LiDAR data', 
        'contact tracing apps', 'bluetooth-based contact tracing', 'proximity measurement', 
        'spatial skills', 'thesis project', 'capstone project', 'project proposal', 
        'human security challenges', 'geospatial intelligence challenges'
    ],
    'Advanced Data Stores': [
        'database', 'data management', 'data processing', 'system architecture', 
        'indexing', 'query optimization', 'pipelining', 'materialization', 'vectorization', 
        'scheduling', 'join algorithms', 'consistency', 'concurrency control', 
        'relational database systems', 'OLAP', 'OLTP', 'execution', 'compilation', 
        'hashing', 'sorting', 'data-intensive systems', 'scalable techniques', 
        'GPU', 'NVRAM', 'RDMA', 'in-memory databases', 'streaming', 'hybrid workloads'
    ],
    'Applied Natural Language Processing': [
        'Natural Language Processing', 'Python', 'Regular Expressions', 
        'Text Normalization', 'Edit Distance', 'Naive Bayes', 'Linear Classifier', 
        'Logistic Regression', 'Support Vector Machine', 'Word Embedding', 
        'Deep Learning', 'Multilayer Perceptron', 'Convolutional Neural Networks', 
        'Sequence Labeling', 'Hidden Markov Models', 'Recurrent Neural Networks', 
        'Sequence Modeling', 'Dependency Parsing', 'Machine Translation', 'PyTorch', 
        'Transformers', 'Attention', 'BERT', 'GPT-2', 'Speech and Language Processing', 
        'Machine Learning'
    ],
    'Database Systems': [
        'database management systems', 'parallel data processing', 'performance optimization', 
        'availability', 'NoSQL', 'cache management', 'transactional storage', 'data analytics', 
        'ACID properties', 'relational data model', 'SQL', 'index structures', 'hashing', 
        'B+ trees', 'relational algebra', 'concurrency control', 'protocol locking', 
        'timestamping', 'crash recovery', 'logging'
    ],
    'Game Prototyping': [
        'game design', 'game prototyping', 'programming', 'testing', 'game mechanics', 
        'player experience', 'rapid prototyping', 'playtesting', 'game engine', 'Unity'
    ],
    'Data Visualization and User Interface Design': [
        'data visualization', 'user interface design', 'interaction design', 
        'usability testing', 'cognitive science', 'information graphics', 
        'human-computer interaction', 'responsive design', 'visualizations', 
        'interactive web design'
    ],
    'Program Synthesis and Computer-Aided Verification': [
        'verification', 'program synthesis', 'symbolic execution', 'invariants', 
        'abstract interpretation', 'constraint solvers', 'software verification', 
        'Hoare logic', 'decision procedures', 'static analysis', 'predicate abstraction', 
        'dataflow analyses'
    ],
    '3-D Graphics and Rendering': [
        '3D graphics', 'linear algebra', 'geometry', 'rendering algorithms', 
        'visual effects', 'animation', 'VR', 'AR', 'C/C++', 'Python', 'JavaScript', 
        'ray tracing', 'rasterization', 'shading', 'texture mapping', 'procedural texturing', 
        'sampling', 'reconstruction', 'shadows', 'radiosity', 'OpenGL', 'GPUs', 
        'hardware acceleration'
    ],
    'Introduction to Operating Systems': [
        'operating systems', 'process abstraction', 'context switch', 'system calls', 
        'concurrency', 'threads', 'synchronization', 'memory management', 'file systems', 
        'distributed systems', 'remote procedure call', 'trusted system design'
    ],
    'Trusted System Design, Analysis and Development': [
        'trusted systems', 'computer security', 'reference monitor', 'security kernel', 
        'hardware security', 'software security', 'secure systems', 'trusted computing', 
        'access control', 'security models', 'verification', 'protection rings', 
        'multiprocessing', 'virtualization'
    ],
    'Fairness in Artificial Intelligence': [
        'bias', 'fairness', 'algorithmic transparency', 'explainability', 'machine learning', 
        'statistics', 'probability', 'deep learning', 'adversarial learning', 'privacy', 
        'computational social science', 'bias mitigation', 'algorithmic decision making', 
        'algorithmic fairness', 'AI ethics'
    ],
    'Requirements Engineering': [
        'requirements engineering', 'requirements analysis', 'system modeling', 
        'stakeholder communication', 'elicitation techniques', 'requirements verification', 
        'risk management', 'formal methods', 'user stories', 'agile methodologies', 
        'requirements management'
    ],
    'Computer Graphics': [
        '3D computer graphics', 'OpenGL', 'linear algebra', 'calculus', 'transformations', 
        'geometric modeling', 'animation', 'rendering', 'ray tracing', 'shading', 
        'lighting', 'polygonal meshes', 'texture mapping', 'rasterization', 
        'spatial data structures', 'virtual reality'
    ],
    'Introduction to Internetworking': [
        'networking', 'internetworking', 'protocol layering', 'HTTP', 'DNS', 'CDNs', 
        'TCP', 'UDP', 'transport layer', 'congestion control', 'IP routing', 
        'BGP', 'SDN', 'wireless networking', 'datacenter networking', 'sockets'
    ],
    'Advanced Mobile Devices and Game Consoles': [
        'game development', 'game design', 'game programming', 'Unity', 'game engines', 
        'APIs', 'market research', 'networking', 'core game loops', 'game platforms'
    ],
    'Innovation for Defense Applications': [
        'project', 'plan', 'key takeaways', 'beneficiaries', 'value proposition', 
        'MVP', 'mission model canvas', 'business model canvas', 'solution architecture', 
        'product-mission fit', 'dual-use', 'deployment', 'activities', 'resources', 
        'key partners', 'buy-in', 'support'
    ],
    'Natural Language Dialogue Systems': [
        'dialogue systems', 'natural language processing', 'speech recognition', 
        'machine learning', 'artificial intelligence', 'computational techniques', 
        'design', 'implementation', 'evaluation', 'dialogue genres', 'dialogue management', 
        'context representation', 'response policies', 'natural language generation', 
        'embodied conversational agents', 'incremental speech processing', 'dialogue system evaluation', 
        'speech recognition services', 'cloud services', 'user populations', 'conversational interfaces', 
        'human-computer interaction', 'natural language understanding', 'spoken language understanding'
    ],
    'Applied Machine Learning and Data Mining': [
        'machine learning', 'data mining', 'supervised learning', 'unsupervised learning', 
        'regression methods', 'resampling methods', 'cross-validation', 'decision trees', 
        'dimensionality reduction', 'clustering', 'deep learning', 'map reduce', 'association rules', 
        'recommender systems', 'bayesian decision theory', 'parametric methods', 'multivariate methods', 
        'nonparametric methods', 'linear discrimination', 'neural networks', 'data analysis', 'data visualization', 
        'data science', 'information retrieval', 'data preprocessing', 'data transformation'
    ],
    'Master\'s Thesis': [
        'spatial thinking', 'research project', 'thesis writing', 'thesis defense', 'geographic information science', 
        'bibliographic reference manager', 'communication skills', 'thesis manuscript', 'research process', 
        'validity of claims', 'evidence', 'outcomes', 'academic context', 'research pitfalls', 'research obligations'
    ],
    'Data Visualization': [
        'data visualization', 'information graphics', 'interactive visualization', 'cognitive processing', 
        'perception', 'designing visualizations', 'colors', 'shapes', 'organization', 'images', 
        'methods', 'libraries', 'tools', 'web', 'notebooks', 'modern visualization', 'effective visualizations'
    ],
    'Software Engineering': [
        'software engineering', 'project management', 'requirements', 'architecture', 'design', 
        'implementation', 'testing', 'maintenance', 'object-oriented design', 'design patterns', 
        'test-driven development', 'configuration management', 'iterative development', 'software development lifecycle', 
        'dynamic analysis', 'static bug detection', 'formal verification', 'software tools'
    ],
    'Applied Cryptography': [
        'crypto', 'attacks', 'secrecy', 'stream ciphers', 'block ciphers', 'aes', 'message integrity', 
        'authenticated encryption', 'key exchange', 'public key', 'rsa', 'digital signatures', 
        'pki', 'identity-based encryption', 'authentication', 'zero-knowledge protocols', 'kerberos', 'electronic mail'
    ],
    'User Experience Design and Strategy': [
        'user experience', 'design', 'strategy', 'digital products', 'user research', 'interaction design', 
        'business strategy', 'competitive analysis', 'value innovation', 'prototyping', 'user interviews', 
        'design experiments', 'customer discovery', 'empathy maps', 'SWOT analysis', 'feature prioritization', 
        'prototyping for value proposition validation', 'online user research', 'hypotheses', 'business model canvas'
    ],
    'Foundations of Data Management': [
        'data management', 'storage systems', 'cloud storage', 'data modeling', 'relational databases', 'map reduce', 
        'network-attached storage', 'data mining', 'data integration', 'data cleaning', 'data science', 'python', 
        'java', 'databases', 'file systems', 'sql', 'views', 'query processing', 'nosql', 'hadoop', 'spark', 
        'big data', 'data analytics', 'data visualization', 'data privacy', 'data governance'
    ],
    'Affective Computing': [
        'affective computing', 'emotion recognition', 'cognitive modeling', 'emotional behavior', 'human-centered computing', 
        'decision making', 'health', 'entertainment', 'pedagogy', 'ethics AI', 'human subjects experimental design', 
        'emotional intelligence', 'emotion elicitation', 'emotional prediction', 'social agents', 'human emotion', 
        'cognition', 'emotional regulation', 'emotional resilience'
    ],
    'Computer Animation and Simulation': [
        'computer animation', 'simulation', 'numerical methods', 'deformable objects', 
        'fluids', 'character rigging', 'inverse kinematics', 'motion capture', 'quaternions', 
        'rigid body dynamics', 'GPU programming', 'OpenGL', 'collision detection', 'haptics', 
        'crowd animation', 'facial animation', 'constraints', 'sound simulation', 'CUDA', 
        'OpenCL', 'computer animation engines', 'Unity', 'Unreal Engine', 'Havok Physics', 
        'Open Dynamics Engine', 'Vega FEM'
    ],
    'Foundations and Applications of Data Mining': [
        'data mining', 'machine learning', 'map reduce', 'python', 'probability', 'linear algebra', 
        'algorithm design', 'unix', 'spark', 'statistics', 'data analysis', 'data transformation', 
        'pattern discovery', 'large data sets', 'real world problems', 'data mining algorithms', 
        'massive data analysis', 'data mining techniques'
    ],
    'Machine Learning for Data Science': [
        'machine learning', 'data mining', 'recommendation systems', 'data analytics', 'probability', 
        'statistics', 'linear algebra', 'supervised learning', 'unsupervised learning', 'regression methods', 
        'resampling methods', 'cross-validation', 'decision trees', 'dimensionality reduction', 'regularization', 
        'clustering', 'kernel methods', 'hidden Markov models', 'neural networks', 'backpropagation', 'gradient descent', 
        'autoencoders', 'dropout', 'adversarial training', 'convolutional neural networks', 'reinforcement learning', 
        'q-learning', 'deep reinforcement learning'
    ],
    'Multimodal Probabilistic Learning of Human Communication': [
        'gesture', 'affective computing', 'sentiment analysis', 'neural networks', 'probabilistic models', 
        'human communication', 'multimodal learning', 'computer vision', 'speech recognition', 'emotion recognition', 
        'visual attention', 'video analysis', 'human-computer interaction'
    ],
    'Data Science at Scale': [
        'data science', 'big data', 'informatics', 'machine learning', 'data mining', 'data lifecycle', 
        'data visualization', 'hadoop', 'no sql', 'hbase', 'data exploration', 'data management', 
        'data scientist', 'big data analytics', 'data science tools', 'spark', 'data pipelines'
    ],
    'Advanced Topics in Operating Systems': [
        'cloud-hosted', 'web service', 'multi-threaded', 'smartphone application', 'system design', 
        'reliability', 'performance', 'scalability', 'concurrency', 'latency', 'throughput', 'optimization', 
        'computer systems', 'system scaling', 'system evaluation'
    ],
    'Game Engine Development': [
        'game engine', '3D game engine', 'real-time', 'multi-threaded', 'cross-platform', 'game development', 
        'rendering', 'game engine design', 'game engine programming', 'game engine subsystems', 'game engines'
    ],
    'Advanced Operating Systems': [
        'cloud computing', 'distributed systems', 'reliability', 'performance', 'scalability', 'concurrency', 
        'latency', 'throughput', 'optimization', 'system design', 'system evaluation', 'research paper', 'hardware'
    ],
    'Data Science Professional Practicum': [
        'data science', 'project management', 'lean six sigma', 'data analytics', 'machine learning', 
        'data mining', 'data management', 'business analytics', 'data visualization', 'data science project', 
        'capstone', 'data science industry', 'data science team', 'data science presentation'
    ],
    'Introduction to Machine Learning': [
        'machine learning', 'supervised learning', 'unsupervised learning', 'regression', 'classification', 
        'clustering', 'neural networks', 'decision trees', 'SVM', 'data pre-processing', 'model evaluation', 
        'cross-validation', 'overfitting', 'underfitting', 'bias-variance tradeoff'
    ],
    'Advanced Analysis of Algorithms': [
        'divide and conquer', 'heapsort', 'dynamic programming', 'greedy algorithms', 'amortized analysis', 
        'fibonacci heaps', 'minimum spanning trees', 'shortest paths', 'maximum flow', 'approximation algorithms', 
        'NP-completeness'
    ],
    'Introduction to Computational Thinking and Data Science': [
        'data science', 'data analysis', 'machine learning', 'supervised learning', 'unsupervised learning', 
        'clustering', 'anomaly detection', 'probabilistic graphical models', 'bayesian networks', 'text mining', 
        'image processing', 'geospatial data', 'data visualization'
    ],
    'Special Topics': [
        'distributed systems', 'fault tolerance', 'big data', 'cloud computing', 'network communication', 
        'consensus algorithms', 'distributed job scheduling', 'distributed systems optimization', 'energy efficiency'
    ],
    'Introduction to Programming': [
        'variables', 'types', 'loops', 'conditional statements', 'functions', 'arrays', 'recursion', 
        'object-oriented programming', 'data structures', 'algorithm implementation', 'C++', 'memory management'
    ],
    'Practicum in Teaching Computer Science': [
        'remote teaching', 'zoom', 'grading', 'student presentations', 'lectures', 'assignments', 'grading policies'
    ],
    'Structure and Dynamics of Networked Information': [
        'software engineering', 'web applications', 'network design', 'modular code', 'agile techniques', 'database', 
        'cloud computing', 'object-oriented design', 'software systems'
    ],
    'Introduction to Programming Systems Design': [
        'systems design', 'algorithm implementation', 'object-oriented programming', 'data structures', 
        'C++ programming', 'computer systems', 'recursion', 'dynamic memory', 'computational complexity'
    ],
    'Advanced Computer Vision': [
        'computer vision', 'deep learning', 'image processing', 'neural networks', 'image segmentation', 
        'object detection', 'image recognition', 'feature extraction', 'convolutional neural networks', 
        'image enhancement'
    ],
    'Theory of Computation': [
        'automata', 'complexity theory', 'computability', 'context-free languages', 'decidability', 'turing machines', 
        'nondeterminism', 'NP-completeness', 'quantum computing', 'regular expressions'
    ],
    'Introduction to Algorithms and Theory of Computing': [
        'algorithms', 'dynamic programming', 'greedy algorithms', 'network flow', 'NP-complete', 'graph theory', 
        'lazy programming', 'number theory', 'cryptography', 'union-find data structure', 'skip lists'
    ],
    'Haptic Interfaces and Virtual Environments': [
        'haptics', 'human-computer interaction', 'robotics', 'virtual reality', 'teleoperation', 'perception', 
        'tactile sensing', 'haptic devices', 'multi-DOF kinesthetic devices', 'haptic rendering', 'haptic technology'
    ],
    'Building Knowledge Graphs': [
        'information extraction', 'entity linking', 'semantic web', 'knowledge graphs', 'graph databases', 
        'SPARQL', 'data cleaning', 'probabilistic models', 'entity resolution', 'graph algorithms', 'question answering'
    ],
    'Analysis of Algorithms': ['algorithm', 'analysis', 'design', 'greedy', 'dynamic programming', 'network flow', 'linear programming', 'NP-completeness', 'approximation', 'randomization', 'reduction', 'Turing machine', 'data structures', 'discrete mathematics', 'probability', 'pseudocode', 'computer science', 'theory', 'assignments', 'homework', 'collaboration', 'grading', 'explanation', 'time management'],
    
    'Fundamentals of Computation': ['algorithmic thinking', 'C++', 'data representation', 'discrete math', 'control structures', 'functions', 'arrays', 'problem-solving', 'programming', 'conditional statements', 'loops', 'binary search', 'time complexity', 'interpreted languages', 'python', 'debugging', 'style guidelines', 'logical operators', 'boolean data type', 'string operations'],
    
    'Web Technologies': ['html', 'css', 'http', 'web servers', 'javascript', 'ajax', 'json', 'python', 'server-side', 'client-side', 'web services', 'rest', 'security', 'privacy', 'cloud computing', 'mobile app development', 'react', 'nodejs', 'microservices', 'aws', 'google cloud platform', 'angular', 'mongodb', 'cookies', 'responsive design', 'website design'],
    
    'Principles of Software Development': ['software development', 'version control', 'agile methodology', 'testing', 'debugging', 'project management', 'teamwork', 'design patterns', 'software design', 'documentation', 'communication', 'requirements gathering', 'coding standards', 'maintenance'],
    
    'Machine Learning': ['machine learning', 'regression', 'classification', 'clustering', 'SVMs', 'ANNs', 'dimensionality reduction', 'linear algebra', 'probability theory', 'calculus', 'regularization', 'Bayesian inference', 'greedy algorithms', 'randomized algorithms', 'linear programming', 'approximation algorithms', 'data structures', 'neural networks', 'convolutional neural networks', 'backpropagation'],
    
    'Introduction to Artificial Intelligence': ['agents', 'problem solving', 'search algorithms', 'knowledge representation', 'reasoning', 'planning', 'machine learning', 'rationality', 'intelligence', 'search algorithms', 'games', 'logical agents', 'automated planning', 'supervised learning', 'unsupervised learning', 'neural networks', 'reinforcement learning', 'q-learning', 'exploration-exploitation'],
    
    'Security and Privacy': ['cryptography', 'network security', 'privacy', 'authentication', 'encryption', 'firewalls', 'data protection', 'threat models', 'security protocols', 'access control', 'vulnerabilities', 'privacy laws', 'ethical hacking'],
    
    'Deep Learning and Its Applications': ['deep learning', 'neural networks', 'computer vision', 'natural language processing', 'reinforcement learning', 'graph networks', 'attention mechanisms', 'generative models', 'transfer learning', 'convolutional neural networks', 'recurrent neural networks', 'optimization', 'loss functions'],
    
    'Information Retrieval and Web Search Engines': ['data science', 'machine learning', 'data structures', 'algorithms', 'data mining', 'data visualization', 'statistics', 'probability', 'mathematics', 'linear algebra', 'discrete mathematics', 'combinatorics', 'graph theory', 'programming languages', 'python', 'java', 'database systems', 'big data', 'hadoop', 'spark', 'neural networks'],
    
    'Multimedia Systems Design': ['multimedia', 'images', 'video', 'audio', 'graphics', 'compression', 'storage', 'networking', 'quality of service', 'encryption', 'jpeg', 'mpeg', 'database systems', 'signal processing', 'image processing', 'video processing', 'audio processing'],
    
    'Operating Systems': ['operating systems', 'process management', 'memory management', 'filesystems', 'network communication', 'C programming', 'Python programming', 'Unix', 'Exokernel', 'system architecture', 'resource management'],
    
    'Data Structures and Object Oriented Design': ['data structures', 'linked lists', 'trees', 'priority queues', 'hashtables', 'algorithm analysis', 'sorting algorithms', 'graph search', 'recursion', 'C++ programming', 'object-oriented design', 'modular programs', 'extensible programs'],
    
    'Principles of Programming for Data Science': ['python', 'data manipulation', 'searching', 'analysis', 'visualization', 'web scraping', 'SQL', 'XML', 'JSON', 'api', 'pandas', 'regex', 'git', 'data science'],
    
    'Advanced Game Projects': ['game development', 'project management', 'team collaboration', 'game design', 'game production', 'game testing', 'game marketing', 'development tools', 'game production pipeline', 'best practices', 'software methodologies'],
    
    'Foundations of Artificial Intelligence': ['Artificial Intelligence', 'Machine Learning', 'Deep Learning', 'Neural Networks', 'Probabilistic Reasoning', 'Decision Making', 'Uncertainty', 'Logic', 'Planning', 'Reinforcement Learning', 'Knowledge Representation', 'Search', 'Problem Solving', 'Inference', 'Markov Decision Processes', 'Hidden Markov Models', 'Kalman Filters', 'Robotics'],
    
    'Capstone': ['software engineering', 'teamwork', 'professional preparation', 'project management', 'software solutions', 'complex computing problems', 'communication', 'ethical principles', 'professional development', 'interviews', 'project presentations']
}


In [45]:
contents_rev

{'Principles of Software Development': 'CSCI-201',
 'Introduction to Operating Systems': 'CSCI-350',
 'Operating Systems': 'CSCI-402',
 'Directed Research': 'DSCI-590',
 'Advanced Mobile Devices and Game Consoles': 'CSCI-526',
 'Machine Learning': 'CSCI-567',
 'Analysis of Algorithms': 'CSCI-570',
 'Web Technologies': 'CSCI-571',
 'Database Systems': 'CSCI-585',
 "Master's Thesis": 'CSCI-594Z',
 'Research': 'CSCI-790',
 'Doctoral Dissertation': 'CSCI-794Z',
 'Fundamentals of Computation': 'CSCI-102 L',
 'Introduction to Programming': 'CSCI-103 L',
 'Data Structures and Object Oriented Design': 'CSCI-104 L',
 'Discrete Methods in Computer Science': 'CSCI-170',
 'Introduction to Algorithms and Theory of Computing': 'CSCI-270',
 'Special Topics': 'DSCI-599',
 'Software Engineering': 'CSCI-577A',
 'Introduction to Internetworking': 'CSCI-353',
 'Introduction to Computer Systems': 'CSCI-356',
 'Introduction to Artificial Intelligence': 'CSCI-360',
 'Capstone': 'CSCI-401',
 'Computer Graphic

In [46]:
# Re-key the keyword lists from course title to course code.
# The result is bound back to `gpt_extraction`, so keys that are already
# course codes are passed through unchanged; this keeps the cell re-runnable.
# A key that is neither a known title nor a known code still raises KeyError.
known_codes = set(contents_rev.values())
rekeyed = {}
for title, keywords in gpt_extraction.items():
    if title in contents_rev:
        rekeyed[contents_rev[title]] = keywords
    elif title in known_codes:
        rekeyed[title] = keywords
    else:
        raise KeyError(title)
gpt_extraction = rekeyed


with open('gpt_extraction.json', 'w', encoding='utf-8') as f:
    json.dump(gpt_extraction, f, ensure_ascii=False, indent=4)