In [None]:
#importing the required libraries  
import json  
import re   

In [None]:
# Technology categories and keywords   
TECHNOLOGY_PATTERNS = {
    "programming_languages": ["python", "sql", "java", "c", "matlab", "sas", "bash"],
    
    "ml_frameworks": ["tensorflow", "pytorch", "keras", "scikit-learn", "sklearn", "xgboost",],

    "big_data": ["spark", "pyspark", "hadoop", "airflow", "dask","databricks", "snowflake",],

    "cloud_platforms": [ "aws", "azure", "google cloud", "amazon web services","lambda"],

    "databases": ["mysql", "postgresql", "mongodb", "oracle"], 

    "data_tools": ["pandas", "numpy", "scipy", "matplotlib", "seaborn", "plotly","tableau", "powerbi",  "excel", "jupyter"],

    "ml_concepts": ["machine learning", "deep learning", "NLP","regression", "classification",
                    "clustering", "random forest", "gradient boosting", "cnn", "rnn","reinforcement learning", ],
}

def extract_technologies(job_description: str, TECHNOLOGY_PATTERNS: dict) -> dict:
    text = job_description.lower()  # convert text to lowercase
    results = {}

    for category, keywords in TECHNOLOGY_PATTERNS.items():
        matches = []  # list to store found technologies in this category
        for word in keywords:
            # check if the word exists in the text (whole word match)
            if re.search(r'\b' + re.escape(word) + r'\b', text):
                matches.append(word.replace("\\+", "+"))
        if matches:
            results[category] = sorted(matches)  # store sorted list of matches

    return results

def analyze_job_description(job_description: str) -> str:
    technologies = extract_technologies(job_description)
    output = {
        "extracted_technologies": technologies,
        "total_categories": len(technologies),
        "total_technologies": sum(len(v) for v in technologies.values())
    }
    return json.dumps(output,indent=2)



In [30]:
# Example usage
if __name__ == "__main__":
    sample_job = """
    We are looking for a Data Scientist with expertise in Python and SQL.
    Experience with TensorFlow, PyTorch, and scikit-learn is required.
    You should be familiar with AWS, Spark, and Snowflake.
    Knowledge of NLP, deep learning, and transformer models is a plus.
    Experience with Docker, Git, and MLflow for MLOps practices.
    Strong skills in Pandas, NumPy, and data visualization using Tableau.
    """
    result = extract_technologies(sample_job, TECHNOLOGY_PATTERNS)
    print(result)


{'programming_languages': ['python', 'sql'], 'ml_frameworks': ['pytorch', 'scikit-learn', 'tensorflow'], 'big_data': ['snowflake', 'spark'], 'cloud_platforms': ['aws'], 'data_tools': ['numpy', 'pandas', 'tableau'], 'ml_concepts': ['deep learning']}
