In [62]:
import random 
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM


def load_llm_model(llm_model_name, device="cpu"):
    """Load a causal language model and move it to ``device``.

    Args:
        llm_model_name: Model identifier passed straight to
            ``AutoModelForCausalLM.from_pretrained``.
        device: Target passed to ``.to(...)``; defaults to ``"cpu"``.

    Returns:
        The loaded model after the ``.to(device)`` call.
    """
    # Weights are requested as float16 regardless of the target device.
    # NOTE(review): confirm half precision is intended when device is CPU.
    model = AutoModelForCausalLM.from_pretrained(
        llm_model_name,
        torch_dtype=torch.float16,
    )
    return model.to(device)

def load_tokenizer(llm_model_name, device="cpu"):
    """Load the tokenizer that belongs to ``llm_model_name``.

    Args:
        llm_model_name: Identifier passed straight to
            ``AutoTokenizer.from_pretrained``.
        device: Accepted so the signature mirrors ``load_llm_model``;
            it is not used by this function.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``.
    """
    return AutoTokenizer.from_pretrained(llm_model_name)
    
def generate_prompt(tokenizer=None, number_of_questions=15):
    """Build the interview-question prompt with randomly drawn question types.

    One question type is drawn (with replacement) for every question slot
    using the ``random`` module, so seed ``random`` beforehand if the prompt
    must be reproducible.

    Args:
        tokenizer: Optional object exposing ``apply_chat_template``. When
            omitted (the default) the raw instruction text is returned and no
            tokenizer is needed at all. When given, the instruction text is
            wrapped as a single user turn and the chat-templated string is
            returned instead.
        number_of_questions: How many questions to request; defaults to 15.

    Returns:
        str: the raw prompt text, or the chat-templated prompt when a
        ``tokenizer`` is supplied.
    """
    question_type = (
        "definition",
        "formula",
        "scenario",
        "case-study",
        "algorithmic theory",
        "mathematics",
        "probability and statistics",
    )
    # One numbered line per requested question, e.g. "3. definition".
    question_types = "\n".join(
        f"{i+1}. {random.choice(question_type)}"
        for i in range(number_of_questions)
    )
    # The f-string already interpolates both values; no further .format()
    # pass is applied, so literal braces in the text can never raise.
    base_prompt = f"""
    Generate {number_of_questions} questions for an Interview for the position of Machine Learning Engineer.
    Each question should be of type as mentioned below:
    {question_types}
    The output provided should be in format as given below:
    Question 1: Question Type 1 ....
    Question 2: Question Type 2 ....
    Question 3: Question Type 3 .....
    and so on.
    No extra text should be generated as answer.
    The case study questions should be well defined.
    The questions should only be of the specified type.
    """

    if tokenizer is None:
        # Default path: no dependency on any notebook-level tokenizer.
        return base_prompt

    # Chat templates read the text of each turn from the "content" key.
    dialogue_template = [{
        "role": "user",
        "content": base_prompt
    }]
    # NOTE(review): some chat templates emit a BOS token themselves; if the
    # returned string is tokenized again with special tokens enabled, check
    # that BOS is not duplicated.
    return tokenizer.apply_chat_template(conversation=dialogue_template,
                                         tokenize=False,
                                         add_generation_prompt=True)

def generate_question(llm_model, tokenizer, prompt, temperature=0.3, max_new_tokens=2048, device="cpu"):
    """Run sampled generation for a single prompt and return the decoded text.

    Args:
        llm_model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable that encodes a list of strings and also exposes
            ``batch_decode``.
        prompt: Prompt string; it is encoded as a batch of one.
        temperature: Sampling temperature forwarded to ``generate``.
        max_new_tokens: Generation length cap forwarded to ``generate``.
        device: Target the encoded inputs are moved to before generation.

    Returns:
        The first element of ``tokenizer.batch_decode(outputs)``.
    """
    # The tokenizer returns the full encoding (not only the ids); it is
    # unpacked as keyword arguments into generate() below.
    encoded = tokenizer([prompt], return_tensors="pt")
    encoded = encoded.to(device)
    print("INPUT IDS")

    generation_kwargs = dict(
        temperature=temperature,
        do_sample=True,
        max_new_tokens=max_new_tokens,
    )
    generated = llm_model.generate(**encoded, **generation_kwargs)
    print("OUTPUTS")

    decoded_batch = tokenizer.batch_decode(generated)
    return decoded_batch[0]

def post_process_questions(questions, eos_token="<eos>"):
    """Split raw generated text into one string per question.

    Processing steps:
      1. If the text contains a blank line followed by ``**``, everything up
         to and including that marker is dropped; otherwise the text is kept
         whole (nothing is sliced off when the marker is absent).
      2. Remaining ``**`` emphasis markers and all newlines are removed.
      3. A trailing ``eos_token`` is removed only if it is actually present.
      4. The text is split on the literal word ``"Question"`` and empty
         fragments are discarded.

    Args:
        questions: Decoded model output as a single string.
        eos_token: End-of-sequence marker to strip from the end of the text.
            Defaults to ``"<eos>"`` (five characters, matching the number of
            trailing characters this function has always removed).

    Returns:
        list[str]: the text following each ``"Question"`` occurrence, e.g.
        ``" 1: What is ...?"``. Empty list when nothing is found.
    """
    marker = "\n\n**"
    marker_pos = questions.find(marker)
    if marker_pos != -1:
        # Skip the preamble (echoed prompt etc.) that precedes the first item.
        questions = questions[marker_pos + len(marker):]

    text = questions.replace("**", "").replace("\n", "")

    # Only strip the end-of-sequence marker when it is really there, so
    # output without it does not lose its last characters.
    if eos_token and text.endswith(eos_token):
        text = text[:len(text) - len(eos_token)]

    return [part for part in text.split("Question") if part]

if __name__ == "__main__":
    # End-to-end run: load the model and tokenizer, build a prompt, generate
    # text, then split the generated text into individual questions.
    # The names assigned here are module-level (notebook) names; the next
    # cell iterates over `questionLst`.
    print("STARTING")
    # Alternative model, currently disabled:
    # llm_model_name = "mistralai/Mistral-7B-v0.1"
    llm_model_name = "google/gemma-2b-it"
    # Use the GPU when torch reports one, otherwise fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    llm_model = load_llm_model(llm_model_name, device)
    print("STEP 1 DONE")
    tokenizer = load_tokenizer(llm_model_name, device)
    print("STEP 2 DONE")
    # Question types are drawn with `random`, so the prompt differs per run
    # (no seed is set in this cell).
    prompt = generate_prompt()
    print(prompt)
    print("STEP 3 DONE")
    # Default temperature / max_new_tokens of generate_question are used.
    questions = generate_question(llm_model=llm_model, tokenizer=tokenizer, prompt=prompt, device=device)
    print("STEP 4 DONE")
    # Turn the single decoded string into a list with one entry per question.
    questionLst = post_process_questions(questions)
    print("ENDING")

STARTING


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

STEP 1 DONE
STEP 2 DONE

    Generate 15 questions for an Interview for the position of Machine Learning Engineer.
    Each question should be of type as mentioned below:
    1. probability and statistics
2. algorithmic theory
3. definition
4. algorithmic theory
5. mathematics
6. definition
7. algorithmic theory
8. case-study
9. case-study
10. definition
11. formula
12. mathematics
13. probability and statistics
14. formula
15. scenario
    The output provided should be in format as given below:
    Question 1: Question Type 1 ....
    Question 2: Question Type 2 ....
    Question 3: Question Type 3 .....
    and so on.
    No extra text should be generated as answer.
    The case study questions should be well defined.
    The questions should only be of the specified type.
    
STEP 3 DONE
INPUT IDS
OUTPUTS
STEP 4 DONE
ENDING


In [63]:
# Show each post-processed question on its own line.
# Relies on `questionLst` having been created by the previous cell.
for question in questionLst:
    print(question)

 1:What is the difference between probability and statistics?
 2:Explain the concept of a decision boundary in the context of machine learning.
 3:Define the term "overfitting" in the context of machine learning.
 4:Describe the difference between supervised and unsupervised learning.
 5:What is the difference between linear regression and logistic regression?
 6:Explain the concept of dimensionality reduction in the context of machine learning.
 7:Define the term "ensemble learning" in the context of machine learning.
 8:Provide an example of a common algorithm used for time series analysis.
 9:Describe the difference between supervised and unsupervised learning.
 10:What is the difference between a hypothesis and a model in machine learning?
 11:Define the term "regularization" in the context of machine learning.
 12:What is the difference between a supervised and an unsupervised learning algorithm?
 13:What is the concept of "weak learning"?
 14:Describe the difference between a sup