In [None]:
pip install transformers accelerate torch sentencepiece



In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# -------------------------------------------------
# 1) Store the Job Description (JD)
# -------------------------------------------------
JOB_DESCRIPTION = """
About the job
We are partnered with a rapidly growing home services company that provides outsourced customer service
to the nation's leading utility companies! They have helped protect homeowners from the costs of unexpected repairs
to their home systems, lines, and appliances for over 30 years. They service over 1 million repair and maintenance plans
in 16 states. They help homeowners avoid the hassle of finding a repair professional
when an essential home system or appliance breaks down.

As a Data Analyst, you’ll take a proactive role in assessing process chains to uncover inefficiencies
and recommend strategic improvements. By leveraging advanced data analysis and workflow optimization techniques,
you’ll drive impactful process enhancements and support overall operational excellence.

REWARDS
Hourly Rate - $34 - $45 /Hour based on experience
Consistent work schedule
Ample opportunity for career advancement
Full benefits and flexibility upon conversion

Requirements
2-3 year of data analysis or reporting experience
Strong data visualization experience within Tableau, Power BI, or Looker
Advanced excel skills including pivots and macros
GED / Highschool diploma required, college degree preferred
Must be able to commute to Naperville, IL

Responsibilities
Lead data analysis projects, ensuring accuracy and timely delivery of insights.
Collaborate with leadership to identify key performance indicators (KPIs) and track business goals.
Analyze complex datasets to identify trends, anomalies, and actionable opportunities.
Work closely with cross-functional teams to understand business needs and deliver tailored solutions.
Conduct advanced statistical analysis to solve business problems and optimize operations.
"""


# -------------------------------------------------
# 2) Unified Prompt
# -------------------------------------------------
# We'll use the same prompt for each model to standardize comparisons.
# The prompt asks the LLM to extract relevant skills, tools, or technologies from the JD.

SKILL_EXTRACTION_PROMPT = """You are an AI assistant trained to analyze job descriptions and extract the key technical and non-technical skills, tools, or qualifications mentioned.
Focus on listing them succinctly. For example, if the JD mentions 'Power BI', 'Tableau', 'Excel macros', it should appear in your final list.

Job Description:
{jd}

Now list the relevant skills, tools, and technologies:
"""


# -------------------------------------------------
# 3) Common "generate_text" function
# -------------------------------------------------
def generate_text(model_name: str, prompt_text: str, max_new_tokens=200, temperature=0.0):
    """
    1. Loads AutoModelForCausalLM and AutoTokenizer for 'model_name'.
    2. Creates a pipeline for text generation.
    3. Generates text from the given prompt.

    Note: This can be resource-intensive for large models.
          Ensure you have enough VRAM or CPU resources.
    """
    print(f"\n--- Loading: {model_name} ---")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    # Some models (like Llama) might need special arguments, e.g.:
    # tokenizer.pad_token_id = tokenizer.eos_token_id
    # or trust_remote_code=True, etc.

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # or float32 if you have large GPU mem
        device_map="auto",         # automatically place model on available GPUs
    )

    # Create pipeline
    gen_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=False,  # deterministic output
    )

    # Generate
    output = gen_pipeline(prompt_text)
    text_output = output[0]["generated_text"]
    return text_output


# -------------------------------------------------
# 4) Specific Functions for Each Model
# -------------------------------------------------
# def extract_skills_llama2(jd_text):
#     """
#     Llama 2 (e.g., meta-llama/Llama-2-7b-chat-hf or Llama-2-13b, etc.).
#     Must have license acceptance in HF.
#     """
#     model_name = "meta-llama/Llama-2-7b-chat-hf"  # Example 7B Chat model
#     prompt = SKILL_EXTRACTION_PROMPT.format(jd=jd_text)
#     result = generate_text(model_name, prompt)
#     return result

def extract_skills_falcon(jd_text):
    """
    Falcon (7B, 40B). We'll use the instruct variant for better instruction following.
    Example: "tiiuae/falcon-7b-instruct" or "tiiuae/falcon-40b-instruct"
    """
    model_name = "tiiuae/falcon-7b-instruct"
    prompt = SKILL_EXTRACTION_PROMPT.format(jd=jd_text)
    result = generate_text(model_name, prompt)
    return result

def extract_skills_mistral(jd_text):
    """
    Mistral model (7B).
    Currently, "mistralai/Mistral-7B-v0.1" is a base model,
    not heavily instruction-tuned. You might use a community instruct-tuned variant.
    """
    model_name = "mistralai/Mistral-7B-v0.1"
    prompt = SKILL_EXTRACTION_PROMPT.format(jd=jd_text)
    result = generate_text(model_name, prompt)
    return result

def extract_skills_mpt(jd_text):
    """
    MPT model by MosaicML.
    We'll use the instruct variant "mosaicml/mpt-7b-instruct" for better prompt adherence.
    """
    model_name = "mosaicml/mpt-7b-instruct"
    prompt = SKILL_EXTRACTION_PROMPT.format(jd=jd_text)
    result = generate_text(model_name, prompt)
    return result

def extract_skills_bloom(jd_text):
    """
    BLOOM model (e.g., "bigscience/bloom-7b1" or the full "bigscience/bloom" at 176B).
    We'll pick smaller Bloom for local usage, e.g. bloom-7b1 if available.
    """
    # "bigscience/bloom-7b1" - there's also bloom-3b, bloom-1b7, etc.
    model_name = "bigscience/bloom-7b1"
    prompt = SKILL_EXTRACTION_PROMPT.format(jd=jd_text)
    result = generate_text(model_name, prompt)
    return result

def extract_skills_gpt_neo(jd_text):
    """
    GPT-Neo / GPT-J / GPT-NeoX from EleutherAI, e.g.:
      - "EleutherAI/gpt-neo-2.7B"
      - "EleutherAI/gpt-j-6B"
      - "EleutherAI/gpt-neox-20b"
    We'll pick GPT-J (6B) as an example.
    """
    model_name = "EleutherAI/gpt-j-6B"
    prompt = SKILL_EXTRACTION_PROMPT.format(jd=jd_text)
    result = generate_text(model_name, prompt)
    return result


# -------------------------------------------------
# 5) Compare Results - Main
# -------------------------------------------------
if __name__ == "__main__":

    # We'll create a dictionary of "model label" -> function
    models_to_test = {
        # "Llama 2": extract_skills_llama2,
        "Falcon": extract_skills_falcon,
        "Mistral": extract_skills_mistral,
        "MPT": extract_skills_mpt,
        "BLOOM": extract_skills_bloom,
        "GPT-J (EleutherAI)": extract_skills_gpt_neo
    }

    # Iterate over each approach
    for model_label, extractor_func in models_to_test.items():
        print(f"\n==================== {model_label} ====================")
        try:
            output_text = extractor_func(JOB_DESCRIPTION)
            print(f"--- Extracted Skills/Tools for {model_label} ---\n{output_text}")
        except Exception as e:
            print(f"!!! Error running {model_label} -> {e}")




--- Loading: tiiuae/falcon-7b-instruct ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


--- Extracted Skills/Tools for Falcon ---
You are an AI assistant trained to analyze job descriptions and extract the key technical and non-technical skills, tools, or qualifications mentioned.
Focus on listing them succinctly. For example, if the JD mentions 'Power BI', 'Tableau', 'Excel macros', it should appear in your final list.

Job Description:

About the job
We are partnered with a rapidly growing home services company that provides outsourced customer service 
to the nation's leading utility companies! They have helped protect homeowners from the costs of unexpected repairs 
to their home systems, lines, and appliances for over 30 years. They service over 1 million repair and maintenance plans 
in 16 states. They help homeowners avoid the hassle of finding a repair professional 
when an essential home system or appliance breaks down.

As a Data Analyst, you’ll take a proactive role in assessing process chains to uncover inefficiencies 
and recommend strategic improvements. By 

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

!!! Error running MPT -> Tokenizer class GPTNeoXTokenizer does not exist or is not currently imported.


--- Loading: bigscience/bloom-7b1 ---


tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/739 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


--- Extracted Skills/Tools for BLOOM ---
You are an AI assistant trained to analyze job descriptions and extract the key technical and non-technical skills, tools, or qualifications mentioned.
Focus on listing them succinctly. For example, if the JD mentions 'Power BI', 'Tableau', 'Excel macros', it should appear in your final list.

Job Description:

About the job
We are partnered with a rapidly growing home services company that provides outsourced customer service 
to the nation's leading utility companies! They have helped protect homeowners from the costs of unexpected repairs 
to their home systems, lines, and appliances for over 30 years. They service over 1 million repair and maintenance plans 
in 16 states. They help homeowners avoid the hassle of finding a repair professional 
when an essential home system or appliance breaks down.

As a Data Analyst, you’ll take a proactive role in assessing process chains to uncover inefficiencies 
and recommend strategic improvements. By l

tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Some weights of the model checkpoint at EleutherAI/gpt-j-6B were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi

--- Extracted Skills/Tools for GPT-J (EleutherAI) ---
You are an AI assistant trained to analyze job descriptions and extract the key technical and non-technical skills, tools, or qualifications mentioned.
Focus on listing them succinctly. For example, if the JD mentions 'Power BI', 'Tableau', 'Excel macros', it should appear in your final list.

Job Description:

About the job
We are partnered with a rapidly growing home services company that provides outsourced customer service 
to the nation's leading utility companies! They have helped protect homeowners from the costs of unexpected repairs 
to their home systems, lines, and appliances for over 30 years. They service over 1 million repair and maintenance plans 
in 16 states. They help homeowners avoid the hassle of finding a repair professional 
when an essential home system or appliance breaks down.

As a Data Analyst, you’ll take a proactive role in assessing process chains to uncover inefficiencies 
and recommend strategic impro


# Just Falcon

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import json

In [None]:
JOB_DESCRIPTION = """
About the job
We are partnered with a rapidly growing home services company that provides outsourced customer service
to the nation's leading utility companies! They have helped protect homeowners from the costs of unexpected repairs
to their home systems, lines, and appliances for over 30 years. They service over 1 million repair and maintenance plans
in 16 states. They help homeowners avoid the hassle of finding a repair professional
when an essential home system or appliance breaks down.

As a Data Analyst, you’ll take a proactive role in assessing process chains to uncover inefficiencies
and recommend strategic improvements. By leveraging advanced data analysis and workflow optimization techniques,
you’ll drive impactful process enhancements and support overall operational excellence.

REWARDS
Hourly Rate - $34 - $45 /Hour based on experience
Consistent work schedule
Ample opportunity for career advancement
Full benefits and flexibility upon conversion

Requirements
2-3 year of data analysis or reporting experience
Strong data visualization experience within Tableau, Power BI, or Looker
Advanced excel skills including pivots and macros
GED / Highschool diploma required, college degree preferred
Must be able to commute to Naperville, IL

Responsibilities
Lead data analysis projects, ensuring accuracy and timely delivery of insights.
Collaborate with leadership to identify key performance indicators (KPIs) and track business goals.
Analyze complex datasets to identify trends, anomalies, and actionable opportunities.
Work closely with cross-functional teams to understand business needs and deliver tailored solutions.
Conduct advanced statistical analysis to solve business problems and optimize operations.
"""


In [None]:
PROMPT_TEMPLATE = """You are an AI assistant trained to analyze job descriptions and extract the key technical and non-technical skills, tools, or qualifications mentioned.
Focus on listing them succinctly. For example, if the JD mentions 'Power BI', 'Tableau', 'Excel macros', it should appear in your final list.
Make sure return just single keyword, no duplicates and no extra Informationm ** Just Keywords **.

Job Description:
{jd}

Now list the relevant skills, tools, and technologies:"""

In [None]:
def extract_keywords_falcon(jd_text: str, max_new_tokens=300):
    model_name = "tiiuae/falcon-7b-instruct"

    prompt_text = PROMPT_TEMPLATE.format(jd=jd_text)

    print(f"\n--- Loading model: {model_name} ---")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    generation_pipeline = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        temperature=0.0,   # deterministic
        do_sample=False,   # no random sampling
    )

    output = generation_pipeline(prompt_text)
    raw_text = output[0]["generated_text"]

    # Debug: Print model output to see if it's valid JSON
    print("Raw model output:\n", raw_text)

    keywords_list = []
    # Attempt to isolate everything from first '[' to last ']' in case there's extra text
    start_idx = raw_text.find("[")
    end_idx = raw_text.rfind("]")

    if start_idx != -1 and end_idx != -1:
        json_str = raw_text[start_idx:end_idx+1].strip()
    else:
        json_str = raw_text.strip()

    # Optional: if the model used single quotes instead of double quotes
    # you can do a naive replace as a fallback (not always safe):
    json_str = json_str.replace("'", '"')

    try:
        keywords_list = json.loads(json_str)
        # Ensure it's a list of strings
        if not isinstance(keywords_list, list):
            keywords_list = []
        else:
            # Make sure each item is a string
            keywords_list = [str(item) for item in keywords_list]
    except Exception as e:
        print(f"Error parsing JSON: {e}")
        # Fallback: empty or partial?

    return keywords_list


In [None]:
if __name__ == "__main__":
    extracted_keywords = extract_keywords_falcon(JOB_DESCRIPTION)
    print("\nExtracted Keywords List (Falcon):")
    print(extracted_keywords)


--- Loading model: tiiuae/falcon-7b-instruct ---


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Raw model output:
 You are an AI assistant trained to analyze job descriptions and extract the key technical and non-technical skills, tools, or qualifications mentioned.
Focus on listing them succinctly. For example, if the JD mentions 'Power BI', 'Tableau', 'Excel macros', it should appear in your final list.
Make sure return just single keyword, no duplicates and no extra Informationm ** Just Keywords **.

Job Description:

About the job
We are partnered with a rapidly growing home services company that provides outsourced customer service
to the nation's leading utility companies! They have helped protect homeowners from the costs of unexpected repairs
to their home systems, lines, and appliances for over 30 years. They service over 1 million repair and maintenance plans
in 16 states. They help homeowners avoid the hassle of finding a repair professional
when an essential home system or appliance breaks down.

As a Data Analyst, you’ll take a proactive role in assessing process cha