In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer, util
import warnings

# Silence every Python warning for the rest of the session. This is a blanket
# filter, so deprecation notices from the imported libraries are hidden too.
warnings.filterwarnings("ignore")

In [2]:
# Configuration
class CFG:
    """Tunable settings for the interview Q&A generation run."""

    # Hugging Face model identifier used for both the tokenizer and the causal LM.
    MODEL_ID = "EleutherAI/gpt-neo-2.7B"

    # Overall generation budget; the generation loop splits it evenly across levels.
    MAX_QUESTIONS = 1_000

    # Seniority labels interpolated into the prompts, processed in this order.
    EXPERIENCE_LEVELS = [
        "Entry-Level",
        "Mid-Level",
        "Senior-Level",
    ]

    # A question is accepted only if its highest cosine similarity to the
    # already-accepted questions is strictly below this value.
    THRESHOLD_SIMILARITY = 0.8


In [4]:
# Load the tokenizer, the causal language model and the sentence-embedding model.
# NOTE: from_pretrained needs either network access to the Hugging Face Hub or a
# populated local cache; without one of them it raises OSError.
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(CFG.MODEL_ID)
# Run on the first CUDA GPU when one is present and fall back to CPU (-1)
# otherwise, instead of unconditionally requesting device 0.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
# Embedding model used to detect near-duplicate questions.
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')


OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like EleutherAI/gpt-neo-2.7B is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [ ]:
# Start from a clean slate: no accepted question embeddings yet (zero-element
# float tensor) and an empty results table with the three output columns.
questions_embeddings_cache = torch.empty(0)
df = pd.DataFrame(
    columns=[
        'Experience Level',
        'Question',
        'Answer',
    ]
)


In [ ]:
def is_question_unique(new_question_embedding, questions_embeddings_cache):
    """Tell whether a candidate question is sufficiently different from the cache.

    Args:
        new_question_embedding: Embedding tensor of the candidate question.
        questions_embeddings_cache: Tensor holding the embeddings of the
            questions accepted so far; may contain zero elements.

    Returns:
        True when the cache has no elements, or when the largest cosine
        similarity between the candidate and the cached embeddings is strictly
        below ``CFG.THRESHOLD_SIMILARITY``; False otherwise.
    """
    has_cached_questions = questions_embeddings_cache.nelement() != 0
    if has_cached_questions:
        # Compare the candidate against every cached embedding in one call.
        scores = util.pytorch_cos_sim(new_question_embedding, questions_embeddings_cache)
        closest_match = scores.max().item()
        return closest_match < CFG.THRESHOLD_SIMILARITY
    # Nothing to compare against, so the first question is always unique.
    return True


In [ ]:
def generate_question_answer(experience_level, pipe, similarity_model, questions_embeddings_cache, max_new_tokens=300):
    """Generate one interview question and, if it is novel, an answer for it.

    Args:
        experience_level: Seniority label interpolated into both prompts.
        pipe: Text-generation callable. It is invoked as ``pipe(prompt, **kwargs)``
            and must return a list whose first item has a ``'generated_text'`` key.
        similarity_model: Object exposing ``encode(text, convert_to_tensor=True)``.
        questions_embeddings_cache: Tensor with the embeddings of the questions
            accepted so far (zero elements when none have been accepted).
        max_new_tokens: Upper bound on the number of tokens generated for the
            question and for the answer, each. Defaults to 300.

    Returns:
        A 4-tuple ``(question, answer, is_unique, cache)``. When the question is
        accepted, ``is_unique`` is True and ``cache`` has the new embedding
        appended. When the question is empty or too similar to a cached one, the
        tuple is ``(None, None, False, cache)``.
    """
    # Budget only the newly generated tokens (the prompt does not eat into it)
    # and return just the continuation: by default the pipeline echoes the
    # prompt, which would put the instruction text into every stored question
    # and inflate the similarity between questions.
    generation_kwargs = {'max_new_tokens': max_new_tokens, 'return_full_text': False}

    question_prompt = f"You are an AI assistant skilled in product management. Create an interview question suitable for a {experience_level} Product Manager."
    generated_question = pipe(question_prompt, **generation_kwargs)[0]['generated_text'].strip()
    if not generated_question:
        # The model produced nothing usable; treat it like a rejected question.
        return None, None, False, questions_embeddings_cache

    new_question_embedding = similarity_model.encode(generated_question, convert_to_tensor=True)
    # The cache may have been created on a different device (e.g. an empty CPU
    # tensor) than the one the encoder returns its tensors on; align them so
    # the similarity computation and the concatenation below cannot mismatch.
    questions_embeddings_cache = questions_embeddings_cache.to(new_question_embedding.device)

    if not is_question_unique(new_question_embedding, questions_embeddings_cache):
        return None, None, False, questions_embeddings_cache

    # Store the embedding as one more row of the cache.
    questions_embeddings_cache = torch.cat((questions_embeddings_cache, new_question_embedding.unsqueeze(0)), dim=0)
    answer_prompt = f"Given the question: '{generated_question}', provide a comprehensive answer as would be expected from a {experience_level} Product Manager."
    generated_answer = pipe(answer_prompt, **generation_kwargs)[0]['generated_text'].strip()
    return generated_question, generated_answer, True, questions_embeddings_cache


In [ ]:
# Generate Q&A pairs.
# Each experience level gets the same number of generation *attempts*; attempts
# whose question is rejected as a near-duplicate are dropped, so the final row
# count can be lower than CFG.MAX_QUESTIONS.
i = 0  # number of accepted Q&A pairs so far
new_records = []
attempts_per_level = CFG.MAX_QUESTIONS // len(CFG.EXPERIENCE_LEVELS)
try:
    for level in CFG.EXPERIENCE_LEVELS:
        # Show the level currently being processed (not the whole list).
        print("Experience Level: ", level)
        for _ in range(attempts_per_level):
            question, answer, is_unique, questions_embeddings_cache = generate_question_answer(level, pipe, similarity_model, questions_embeddings_cache)
            if is_unique:
                # Collect rows in a list; growing the DataFrame with pd.concat
                # on every iteration copies the whole frame each time.
                new_records.append({'Experience Level': level, 'Question': question, 'Answer': answer})
                i += 1
                print(i)
finally:
    # Runs even if the loop is interrupted, so partial results are not lost.
    if new_records:
        df = pd.concat([df, pd.DataFrame(new_records)], ignore_index=True)


In [ ]:
# Persist the results: keep only the first occurrence of each
# (Question, Answer) pair, then write the table to CSV without the index column.
df = df.drop_duplicates(subset=['Question', 'Answer'])
df.to_csv('product_manager_interview_qa_levels.csv', index=False)