In [None]:
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


class SkillExtractor:
    """Ask a causal language model to label a question with the skill it requires.

    The instruction text is exposed as ``PROMPT_TEMPLATE`` so that callers can
    tell exactly which prefix was prepended to their problem.
    """

    # Instruction prepended (with no separator) to every problem.
    PROMPT_TEMPLATE = "Consider this question. Label this question with a skill that would be required to solve the question. Basically, you should be able to use the skill as a dictionary key in python. The skill name should be lower case letters only. The skill name should be very descriptive and you may use multiple words to describe the skills required in the question. If you do use multiple words per question, then you must join them by an underscore. Your answer should be as follows: <name of the skill>, reason: <reason for the skill>."

    def __init__(self, hf_token, model_name="google/gemma-2-2b-it"):
        """Log in to the Hugging Face Hub and load the tokenizer and model.

        Args:
            hf_token: Access token forwarded to ``huggingface_hub.login``.
            model_name: Hub identifier of the causal language model to load.
        """
        # Login to Hugging Face
        login(token=hf_token)

        # Set device (GPU if available, else CPU)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        if self.device == "cuda":
            print(f"GPU: {torch.cuda.get_device_name()}")

        # Load model and tokenizer; half precision is used only on the GPU.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
        ).to(self.device)

    def extract_skills(self, problem, max_new_tokens=1000, include_prompt=True):
        """Generate a skill label (and reason) for ``problem``.

        Args:
            problem: Question text; it is appended directly to ``PROMPT_TEMPLATE``.
            max_new_tokens: Upper bound on the number of generated tokens.
            include_prompt: When True (the default, matching the historical
                behaviour) the whole generated sequence is decoded, so the
                returned text starts with the echoed prompt. When False only
                the tokens that follow the input are decoded.

        Returns:
            str: The decoded text with special tokens removed and surrounding
            whitespace stripped.
        """
        prompt = self.PROMPT_TEMPLATE + problem
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(self.device)

        # Greedy decoding (do_sample=False); no gradients are needed for inference.
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

        generated = outputs[0]
        if not include_prompt:
            # Skip the positions occupied by the input so only new tokens are decoded.
            generated = generated[inputs["input_ids"].shape[-1]:]

        response = self.tokenizer.decode(generated, skip_special_tokens=True)
        return response.strip()

In [None]:
import os
from getpass import getpass

# gemma
# SECURITY: never paste an access token into a notebook. Any token that was ever
# saved in this cell should be revoked at https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable; if that is unset or
# empty, it is requested interactively without echoing it to the output.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
extractor = SkillExtractor(hf_token)


Using device: cuda
GPU: Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [None]:
# `left` is one complete sample prefix: the instruction text, one example
# question, and the lead-in the model produced before its answer. Later cells
# use only len(left) as a slice offset.
# NOTE(review): that length is specific to this one example question.
left = "Consider this question. Label this question with a skill that would be required to solve the question. Basically, you should be able to use the skill as a dictionary key in python. The skill name should be lower case letters only. The skill name should be very descriptive and you may use multiple words to describe the skills required in the question. If you do use multiple words per question, then you must join them by an underscore. Your answer should be as follows: <name of the skill>, reason: <reason for the skill>. I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.\n\nWhat skill is required to solve this question?\n\nAnswer: \n "
# Sample sign-off; later cells use only len(right). The backslash is escaped
# explicitly ("\\ ") because a bare "\ " is an invalid escape sequence; the
# runtime value (backslash followed by a space) is unchanged.
right = "\n\n\nLet me know if you\\ d like to try another example!"

In [None]:
def extract_problems(input_file, delimiter="========================================"):
    """Read a text file and return the delimiter-separated problems it contains.

    Args:
        input_file: Path of the UTF-8 encoded text file to read.
        delimiter: Text that separates consecutive problems. Defaults to the
            run of ``=`` characters used by the problem files in this notebook.

    Returns:
        list[str]: The problems in file order, each stripped of surrounding
        whitespace. Chunks that are empty after stripping are dropped, so an
        empty file yields an empty list.
    """
    with open(input_file, "r", encoding="utf-8") as file:
        content = file.read()

    # Strip every chunk once, then keep only the ones that still have content.
    chunks = (chunk.strip() for chunk in content.split(delimiter))
    return [chunk for chunk in chunks if chunk]

In [None]:
# `os` must be imported before its first use so this cell also runs on a fresh kernel.
import os

# Paths: input problems and the file that receives the extracted skills
output_path = "/content"
file_path = os.path.join(output_path, "programming_problems.txt")
output_file = os.path.join(output_path, "programming_gemma2b.txt")

# Thin wrapper: hand one trimmed problem statement to the shared Gemma extractor
def process_with_gemma(problem):
    """Return the extractor's response for ``problem`` after trimming its whitespace."""
    trimmed = problem.strip()
    return extractor.extract_skills(trimmed)

In [None]:
# For programming problems
# The response text starts with an echo of the prompt (instruction text followed
# by the problem) and the sample responses end with a "Let me know if you'd like
# to ..." sign-off. Slicing by fixed lengths measured on one example cuts in
# the wrong place for every other problem, so both parts are located in the
# text instead.
SIGN_OFF = "Let me know if you"

problems_list = extract_problems(file_path)
all_skills = []
for count, problem in enumerate(problems_list):
    response = process_with_gemma(problem)
    # Flatten to a single line: drop newlines and backslashes.
    skills = response.replace('\n', '').replace('\\', '')
    # Apply the same flattening to the problem so the echo can be matched.
    echoed = problem.strip().replace('\n', '').replace('\\', '')

    # Keep only what follows the echoed problem; if the echo cannot be found,
    # the flattened response is kept whole rather than cut blindly.
    echo_start = skills.find(echoed)
    if echoed and echo_start != -1:
        skills = skills[echo_start + len(echoed):]

    # Remove the sign-off only when it is actually present.
    sign_off_start = skills.rfind(SIGN_OFF)
    if sign_off_start != -1:
        skills = skills[:sign_off_start]
    skills = skills.strip()

    all_skills.append(skills)
    print(count, skills)

# Save processed skills (append mode: re-running the cell adds to the file)
with open(output_file, "a", encoding="utf-8") as outfile:
    if all_skills:
        outfile.write("\n\n".join(all_skills))
    print("Skills written to output file.")

print(f"Extracted skills saved to {output_file}")

0 e takes them both to a strange tree made up of N nodes. Each of these nodes is of one of the following two types — nodes containing open parenthesis ‘(’ and nodes containing closed parenthesis ‘)’. CodeChef asks the students Q queries, where in each query they have to find out if the path between two given nodes is a balanced parentheses string or not. If they solve all the queries, they will get to eat the special Christmas cake made by CodeChef.Samosa Bhai and Jalebi Bai are lazy kids, but they also want to eat the cake. So they ask you for help.Note: A balanced parentheses string means that each opening parenthesis has a corresponding closing parenthesis and the pairs of parentheses are properly nested.InputThe first line contains T, the number of test cases to follow.Each test case begins with N and Q, the number of nodes in the tree and the number of queries to follow.N-1 lines follow. Each line contains 2 space-separated integers, x and y, which denotes that there is an edge be

In [None]:

# `os` must be imported before its first use so this cell also runs on a fresh kernel.
import os

# Paths: input problems and the file that receives the extracted skills
output_path = "/content/"
file_path = os.path.join(output_path, "programming.txt")
output_file = os.path.join(output_path, "programming_output.txt")

# Delegate a single problem to the module-level Gemma extractor
def process_with_gemma(problem):
    """Strip ``problem`` and return what ``extractor.extract_skills`` produces for it."""
    return extractor.extract_skills(problem.strip())

# Read the problems (one per non-empty line of the input file) and process them.
# The response text starts with an echo of the prompt (instruction text followed
# by the problem) and the sample responses end with a "Let me know if you'd like
# to ..." sign-off; both are located in the text rather than sliced off by
# fixed lengths measured on one example.
SIGN_OFF = "Let me know if you"

count = 0
all_skills = []
with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        problem = line.strip()
        if not problem:  # Ignore empty lines
            continue

        response = process_with_gemma(problem)
        # str.replace returns a new string, so the result has to be assigned.
        skills = response.replace('\n', '').replace('\\', '')
        # Apply the same flattening to the problem so the echo can be matched.
        echoed = problem.replace('\n', '').replace('\\', '')

        # Keep only what follows the echoed problem; if the echo cannot be
        # found, the flattened response is kept whole rather than cut blindly.
        echo_start = skills.find(echoed)
        if echo_start != -1:
            skills = skills[echo_start + len(echoed):]

        # Remove the sign-off only when it is actually present.
        sign_off_start = skills.rfind(SIGN_OFF)
        if sign_off_start != -1:
            skills = skills[:sign_off_start]
        skills = skills.strip()

        all_skills.append(skills)
        print(count, skills)
        count += 1

# After processing all lines, write everything at once
with open(output_file, "a", encoding="utf-8") as outfile:
    if all_skills:
        outfile.write("\n\n".join(all_skills))  # Separates skills with a blank line
    print("Skills written to output file.")


print(f"Extracted skills saved to {output_file}")


FileNotFoundError: [Errno 2] No such file or directory: '/content/programming.txt'

In [None]:
# Specify the file path
file_path = 'all_skills.txt'

# Open the file in write mode and save the list as a string
with open(file_path, 'w') as f:
    for skill in all_skills:
        f.write(f"{skill}\n")

In [None]:
# One complete sample response (instruction text + question + model output),
# kept as a single-element list. A list has no .split() method, so the call that
# used to follow the literal has been removed.
a = ["Consider this question. Label this question with a skill that would be required to solve the question. Basically, you should be able to use the skill as a dictionary key in python. The skill name should be lower case letters only. The skill name should be very descriptive and you may use multiple words to describe the skills required in the question. If you do use multiple words per question, then join them by an underscore. Your answer should be as follows: <name of the skill>, reason: <reason for the skill>.George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?\n\nThis question requires the understanding of the following skills:\n<skill name> : <reason> \n\n**Answer:**\n\n<skill name> : **physiology of heat transfer**\n<reason> : The question asks about the skin surface that produces the most heat, which relates to the principles of heat transfer. \n\n\nLet me know if you'd like to try another question!"]

AttributeError: 'list' object has no attribute 'split'

In [None]:
# Number of entries in the list `a`.
len(a)

1

In [None]:
# Instruction text on its own, with no question appended.
# NOTE(review): the wording here ("then join them by an underscore") differs
# from the template inside SkillExtractor.extract_skills ("then you must join
# them by an underscore"), so the two strings do not have the same length.
b = "Consider this question. Label this question with a skill that would be required to solve the question. Basically, you should be able to use the skill as a dictionary key in python. The skill name should be lower case letters only. The skill name should be very descriptive and you may use multiple words to describe the skills required in the question. If you do use multiple words per question, then join them by an underscore. Your answer should be as follows: <name of the skill>, reason: <reason for the skill>."

In [None]:
# Character length of the instruction text `b`.
len(b)

515

In [None]:
# Sign-off that ends the sample response stored in `a` (three newlines plus the
# closing sentence).
# NOTE(review): `right`, defined earlier, ends in "example!" and contains a
# backslash and space where this string has an apostrophe.
c = "\n\n\nLet me know if you'd like to try another question!"

In [None]:
# Character length of the sign-off `c`.
len(c)

53

In [None]:
# NOTE(review): `size` is not assigned anywhere in the visible notebook, so this
# would raise NameError on a fresh kernel unless it is defined elsewhere —
# confirm where it should come from. 53 equals the recorded value of len(c).
size - 53