# Human-Like Questions

In [2]:
import os
from openai import OpenAI

# Read the API key from the environment so it never has to be hard-coded in the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY")
# Pass the key explicitly so the variable above is actually used. When it is None the
# client falls back to its own OPENAI_API_KEY environment lookup, so behaviour is unchanged.
client = OpenAI(api_key=openai_api_key)

#### Load Existing Questions

In [2]:
# from src.gdrive_api.utils import extract_questions, find_and_load_all_problems, extract_questions_by_topic

# parent_folder_path = './notebooks'

# all_problems = find_and_load_all_problems(parent_folder_path)
# questions = extract_questions(all_problems)
# existing_questions = list(questions)
# existing_questions

In [3]:
from src.gdrive_api.utils import extract_questions, find_and_load_all_problems, extract_questions_by_topic

# Folder handed to the project loader below.
parent_dir = "./notebooks"
# Project helper; presumably collects every existing problem under parent_dir — its return
# shape is not visible in this notebook.
all_problems = find_and_load_all_problems(parent_dir)
# Existing questions grouped by topic. A later cell calls .get() and .items() on the same
# helper's output, so the result behaves like a dict keyed by topic.
questions_grouped_by_topic = extract_questions_by_topic(all_problems)

# # Printing each topic and its questions
# for topic, questions in questions_grouped_by_topic.items():
#     print(f"Topic: {topic}")
#     for question in questions:
#         print(f" - {question}")

In [4]:

# NOTE(review): this repeats the extract_questions_by_topic(all_problems) call made in the
# previous cell, and `questions_by_topic` is not read again anywhere in the visible
# notebook — confirm whether this cell is still needed.
questions_by_topic = extract_questions_by_topic(all_problems)

#### Load Topic

In [5]:
import json

# Function to crawl the topic hierarchy and retrieve all topics
def crawl_keys(d, sep=' > ', prefix=''):
    """Return the full path of every leaf key in a nested dict.

    A key is a leaf when its value is not a dict or is an empty dict. Paths are built by
    joining the keys from the root down to the leaf with ``sep``.

    Args:
        d: Nested dict to walk.
        sep: Separator placed between key names in a path.
        prefix: Text prepended to every key at this level (used by the recursion).

    Returns:
        List of leaf paths, in the dict's iteration order.
    """
    leaf_paths = []
    for key, child in d.items():
        current = prefix + key
        has_children = isinstance(child, dict) and bool(child)
        if not has_children:
            leaf_paths.append(current)
            continue
        # Descend, carrying the path so far plus the separator as the new prefix.
        leaf_paths += crawl_keys(child, sep, current + sep)
    return leaf_paths

# Read `topic_hierarchy.json` and flatten the nested hierarchy into a list of topic paths
with open('topic_hierarchy.json') as json_file:
    topic_hierarchy = json.loads(json_file.read())

all_topics = crawl_keys(topic_hierarchy)
print(f"Total number of topics: {len(all_topics)}")

Total number of topics: 132


#### Max Questions to generate

In [6]:
# Cap on the number of questions a single generation loop may produce; also used as the
# total of the tqdm progress bars in the generation cells.
MAX_QUESTIONS = 100  # The maximum number of questions
# Running tally that the generation loops compare against MAX_QUESTIONS.
generated_questions_count = 0  # Counter for the number of questions generated so far

In [7]:
from src.gdrive_api.utils import generate_human_like_questions
from tqdm import tqdm

problems = []

# Reset the counter in this cell (as the code-modification cell does). Without this, a
# re-run starts with the count already at MAX_QUESTIONS from the previous run: the loop
# exits immediately and `problems` is left empty.
generated_questions_count = 0

questions_grouped_by_topics = extract_questions_by_topic(all_problems)

# Re-key the existing questions by every full topic path in `all_topics`, using an empty
# set for topics that have no questions yet.
# NOTE(review): the loop below iterates `questions_grouped_by_topics` (trailing "s"), not
# this dict, so topics without existing questions are never visited — confirm which of the
# two is intended.
questions_grouped_by_topic = {
    topic: questions_grouped_by_topics.get(topic, set()) for topic in all_topics
}

with tqdm(total=MAX_QUESTIONS) as pbar:
    for topic, existing_questions in questions_grouped_by_topics.items():
        # Stop if we've reached the max limit
        if generated_questions_count >= MAX_QUESTIONS:
            break

        # Ask for 5 new questions on this topic, passing the existing ones along
        questions = generate_human_like_questions(topic, 5, existing_questions)
        for question in questions["questions"]:
            # The cap can be hit in the middle of a batch
            if generated_questions_count >= MAX_QUESTIONS:
                break
            problems.append({
                "metadata": {
                    "topic": topic,
                    "type": "query",
                    "difficulty": "Easy",
                    "target_length": 1
                },
                "messages": [
                    {"role": "user", "content": question},
                ]
            })
            generated_questions_count += 1  # Increment for each question
            pbar.update(1)

100%|██████████| 100/100 [03:17<00:00,  1.98s/it]


In [14]:

# from src.gdrive_api.utils import generate_human_like_questions
# from tqdm import tqdm

# problems = []

# with tqdm(total=MAX_QUESTIONS) as pbar:
#     for topic in all_topics:
#         # Stop if we've reached the max limit
#         if generated_questions_count >= MAX_QUESTIONS:
#             break

#         existing_questions = questions_grouped_by_topic.get(f'{topic}', set())
#         print('existing' + str(existing_questions))
#         print('topics' + str(topic))

#         # Generate questions
#         questions = generate_human_like_questions(topic, 5, existing_questions)
#         for question in questions["questions"]:
#             # If we're at the max, break
#             if generated_questions_count >= MAX_QUESTIONS:
#                 break
#             problems.append({
#                 "metadata": {
#                     "topic": topic,
#                     "type": "query",
#                     "difficulty": "Easy",
#                     "target_length": 1
#                 },
#                 "messages": [
#                     {"role": "user", "content": question},
#                 ]
#             })
#             generated_questions_count += 1  # Increment for each question
#             pbar.update(1)

In [8]:
from src.gdrive_api.utils import generate_human_like_code_modification_requests

# Start a fresh tally for the code-modification pass.
generated_questions_count = 0

with tqdm(total=MAX_QUESTIONS) as pbar:
    for topic, existing_questions in questions_grouped_by_topics.items():
        # The cap may already have been reached while handling the previous topic.
        if generated_questions_count >= MAX_QUESTIONS:
            break

        # Request 3 modification-style prompts for this topic.
        response = generate_human_like_code_modification_requests(topic, 3, existing_questions)
        for request_text in response["questions"]:
            # The cap can also be reached part-way through a batch.
            if generated_questions_count >= MAX_QUESTIONS:
                break

            entry_metadata = {
                "topic": topic,
                "type": "modification",
                "difficulty": "Easy",
                "target_length": 1
            }
            opening_message = {"role": "user", "content": request_text}
            problems.append({"metadata": entry_metadata, "messages": [opening_message]})

            generated_questions_count += 1
            pbar.update(1)

100%|██████████| 100/100 [11:27<00:00,  6.88s/it]


In [9]:
# Give every problem a unique title of the form "<leaf topic>__<type>__<index>", where the
# index counts problems sharing the same leaf topic and type, and remember which problem
# belongs to which notebook file name.
problem_titles = []
problem_topic_counts = {}
file_path_to_problem = {}
for problem in problems:
    meta = problem["metadata"]
    leaf_topic = meta["topic"].split(" > ")[-1]
    topic_type = f'{leaf_topic}__{meta["type"]}'
    # First sighting of this topic/type pair starts at index 0.
    idx = problem_topic_counts.setdefault(topic_type, 0)
    title = f'{topic_type}__{idx}'
    problem_titles.append(title)
    file_path_to_problem[f"{title}.ipynb"] = problem
    problem_topic_counts[topic_type] += 1

#### distribution for the number of turns

In [12]:
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell
import re
import os
import random

# Weighted table for the number of turns: key = candidate turn count, value = its weight
# in percent. select_number_of_turns() draws one key according to these weights.
distribution = {1: 25, 2: 50, 6: 25}

# Guard against typos in the table above: the weights must total exactly 100
assert sum(distribution.values()) == 100, "The distribution must sum to 100%"

def weighted_choice(choices):
    """Pick one choice at random with probability proportional to its weight.

    Args:
        choices: Iterable of ``(choice, weight)`` pairs, e.g. ``dict.items()``. A one-shot
            iterator is accepted; the pairs are materialised once up front.

    Returns:
        The selected choice, or ``None`` when ``choices`` is empty.
    """
    # The pairs are walked twice (once to total the weights, once to select). A generator
    # would already be exhausted on the second pass, so copy them into a list first.
    pairs = list(choices)
    if not pairs:
        return None

    total = sum(weight for _, weight in pairs)
    r = random.uniform(0, total)
    upto = 0
    for choice, weight in pairs:
        if upto + weight >= r:
            return choice
        upto += weight
    # Floating-point rounding can leave the running sum marginally below `r`; fall back to
    # the last choice rather than implicitly returning None.
    return pairs[-1][0]

def select_number_of_turns(distribution):
    """Draw a turn count at random, weighted by ``distribution``.

    Args:
        distribution: Mapping of turn count -> relative weight.

    Returns:
        One key of ``distribution``, as chosen by ``weighted_choice``.
    """
    weight_sum = sum(distribution.values())
    # Rescale every weight to its share of the total before sampling.
    probabilities = {}
    for turn_count, weight in distribution.items():
        probabilities[turn_count] = weight / weight_sum
    return weighted_choice(probabilities.items())

# Write one notebook per problem: a metadata cell, a "# Conversation" header, and one
# markdown cell per message. The target number of turns is drawn from `distribution`.
for p, t in zip(problems, problem_titles):
    # Trailing number of the title, i.e. the index appended when the titles were built
    title_number = int(re.search(r'\d+$', t).group())

    turns = select_number_of_turns(distribution)

    parity = title_number % 2

    # Create a new notebook
    notebook = new_notebook()

    # Add metadata
    metadata = f"""# Metadata

**Python Topics** - {p["metadata"]["topic"]}

**Type** - {p["metadata"]["type"]}

**Target Number of Turns (User + Assistant)** - {turns}
"""
    metadata_cell = new_markdown_cell(metadata)
    notebook.cells.append(metadata_cell)

    # Add conversation header
    conversation_header = "# Conversation"
    conversation_header_cell = new_markdown_cell(conversation_header)
    notebook.cells.append(conversation_header_cell)

    # Append conversation messages
    # NOTE(review): the speaker label follows the parity of the title index, although the
    # generation cells create every message with role "user" — confirm this is intended.
    title = "**User**" if parity else "**Assistant**"
    for message in p["messages"]:
        msg_content = f"""{title}

{message["content"]}
"""
        conversation_message_cell = new_markdown_cell(msg_content)
        notebook.cells.append(conversation_message_cell)

    # Save the notebook
    notebook_path = f'notebooks/test/{t}.ipynb'

    # exist_ok=True makes this a no-op when the folder is already there, replacing the
    # try/except FileExistsError around a bare makedirs call.
    os.makedirs(os.path.dirname(notebook_path), exist_ok=True)

    # Notebooks are JSON documents that may contain non-ASCII text; write UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(notebook_path, 'w', encoding='utf-8') as f:
        nbformat.write(notebook, f)

#### biased_normal_distribution

In [14]:
import json
import nbformat
import os
import re
import numpy as np
from nbformat.v4 import new_notebook, new_markdown_cell
from scipy.stats import rankdata


def flatten_complexity_scores(data, key_prefix='', scores_dict=None):
    """Flatten a nested dict of topics into ``{dotted.path: complexity}``.

    Walks ``data`` recursively. A dict value that has a ``'complexity'`` key is recorded
    under its dotted path; any other dict value is descended into. An ``int`` value is
    recorded under the *current prefix* (the parent's path), not under its own key. Values
    of any other type are ignored.

    Args:
        data: Nested dict to flatten.
        key_prefix: Dotted path of the level being processed (used by the recursion).
        scores_dict: Accumulator shared across recursive calls; a new dict is created
            when omitted.

    Returns:
        The accumulator dict mapping dotted paths to complexity values.
    """
    scores_dict = {} if scores_dict is None else scores_dict
    for key, value in data.items():
        if isinstance(value, int):
            scores_dict[key_prefix] = value
            continue
        if not isinstance(value, dict):
            continue
        # strip('.') drops the leading dot produced when the prefix is empty.
        dotted_path = f'{key_prefix}.{key}'.strip('.')
        if 'complexity' in value:
            scores_dict[dotted_path] = value['complexity']
        else:
            flatten_complexity_scores(value, dotted_path, scores_dict)
    return scores_dict

with open('./topic_dist.json') as f:
    complexity_data = json.load(f)

# Collapse the nested topic tree into {dotted.topic.path: complexity}
complexity_scores = flatten_complexity_scores(complexity_data)

# Order the topics from the highest to the lowest complexity score
sorted_by_complexity = dict(
    sorted(complexity_scores.items(), key=lambda pair: pair[1], reverse=True)
)

# Rank the scores (ties share their average rank) and divide by the number of topics, so
# that a higher complexity yields a higher rank and therefore a higher percentile
complexity_ranks = rankdata(list(sorted_by_complexity.values()), method='average')
topic_percentiles = {
    topic: rank / len(complexity_scores)
    for topic, rank in zip(sorted_by_complexity.keys(), complexity_ranks)
}

def biased_normal_distribution(input_number):
    """Sample an integer in [1, 15] from a normal distribution skewed towards low values.

    ``input_number`` is mapped linearly from [0, 100] onto [1, 15] (``np.interp`` clamps
    values outside that range). The mapped value is then pulled towards 1 by computing
    ``(mapped + 3 * 1) / 5``, which becomes the mean of a normal distribution with a fixed
    standard deviation of 2.5. Draws outside [1, 15] are rejected and redrawn.

    Args:
        input_number: Score on a 0-100 scale.

    Returns:
        The accepted sample truncated to ``int``.
    """
    lower, upper = 1, 15
    spread = 2.5
    anchor = 1  # value the mean is biased towards

    scaled_mean = np.interp(input_number, [0, 100], [lower, upper])
    centre = (scaled_mean + 3 * anchor) / 5

    # Rejection sampling: redraw until the value lands inside [lower, upper].
    draw = np.random.normal(centre, spread)
    while not (lower <= draw <= upper):
        draw = np.random.normal(centre, spread)

    return int(draw)

# # Function to calculate the number of turns based on percentile
# # Higher complexity (higher percentile) results in more turns
# def calculate_turns(percentile, max_turns=10):
#     turns = max(1, round(percentile * max_turns))
#     return turns

# Write one notebook per problem, as in the weighted-distribution cell, but draw the target
# number of turns from biased_normal_distribution() using the topic's complexity percentile.
for p, t in zip(problems, problem_titles):

    title_number = int(re.search(r'\d+$', t).group())  # Extract the last number from the title

    parity = title_number % 2

    # Topic paths use " > " as separator while the percentile table is keyed by dotted paths
    metadata_topic_hierarchy = p["metadata"].get("topic")
    normalized_topic = metadata_topic_hierarchy.replace(" > ", ".")
    print(normalized_topic)

    # Retrieve the percentile using the normalized topic string (0 when the topic is unknown)
    percentile = topic_percentiles.get(normalized_topic, 0)
    print(f"Topic: {normalized_topic}, Percentile: {percentile}")

    # Calculate turns using the biased_normal_distribution function
    turns = biased_normal_distribution(percentile * 100)  # Percentile scaled to 0-100
    print('turns'+str(turns))

    # Create a new notebook
    notebook = new_notebook()
    # Add metadata
    metadata = f"""# Metadata

**Python Topics** - {p["metadata"]["topic"]}

**Type** - {p["metadata"]["type"]}

**Target Number of Turns (User + Assistant)** - {turns}
"""
    metadata_cell = new_markdown_cell(metadata)
    notebook.cells.append(metadata_cell)

    # Add conversation header
    conversation_header = "# Conversation"
    conversation_header_cell = new_markdown_cell(conversation_header)
    notebook.cells.append(conversation_header_cell)

    # Append conversation messages
    # NOTE(review): the speaker label follows the parity of the title index, although the
    # generation cells create every message with role "user" — confirm this is intended.
    title = "**User**" if parity else "**Assistant**"
    for message in p["messages"]:
        msg_content = f"""{title}

{message["content"]}
"""
        conversation_message_cell = new_markdown_cell(msg_content)
        notebook.cells.append(conversation_message_cell)

    # Save the notebook
    notebook_path = f'notebooks/test/{t}.ipynb'

    # exist_ok=True makes this a no-op when the folder is already there, replacing the
    # try/except FileExistsError around a bare makedirs call.
    os.makedirs(os.path.dirname(notebook_path), exist_ok=True)

    # Notebooks are JSON documents that may contain non-ASCII text; write UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(notebook_path, 'w', encoding='utf-8') as f:
        nbformat.write(notebook, f)

algorithms.by_data_structure.arrays
Topic: algorithms.by_data_structure.arrays, Percentile: 0.03787878787878788
turns2
algorithms.by_data_structure.arrays
Topic: algorithms.by_data_structure.arrays, Percentile: 0.03787878787878788
turns2
algorithms.by_data_structure.arrays
Topic: algorithms.by_data_structure.arrays, Percentile: 0.03787878787878788
turns2
algorithms.by_data_structure.arrays
Topic: algorithms.by_data_structure.arrays, Percentile: 0.03787878787878788
turns4
algorithms.by_data_structure.arrays
Topic: algorithms.by_data_structure.arrays, Percentile: 0.03787878787878788
turns2
algorithms.by_data_structure.linked_lists
Topic: algorithms.by_data_structure.linked_lists, Percentile: 0.14015151515151514
turns2
algorithms.by_data_structure.linked_lists
Topic: algorithms.by_data_structure.linked_lists, Percentile: 0.14015151515151514
turns1
algorithms.by_data_structure.linked_lists
Topic: algorithms.by_data_structure.linked_lists, Percentile: 0.14015151515151514
turns2
algorithms.b

In [15]:
# Read the topic/complexity tree that the flattening code below works on
with open('./topic_dist.json') as f:
    data = json.loads(f.read())

# Flatten the data structure, creating a list of (category, complexity) tuples
def flatten_data(data, parent_key='', sep='_'):
    """Flatten a nested dict into a list of ``(path, complexity)`` tuples.

    A dict value containing a ``'complexity'`` key contributes one tuple; any other dict
    value is descended into. Non-dict values are skipped.

    Args:
        data: Nested dict to flatten.
        parent_key: Path of the enclosing level; empty at the root.
        sep: Separator used to join key names into a path.

    Returns:
        List of ``(path, complexity)`` tuples in traversal order.
    """
    collected = []
    for name, node in data.items():
        # At the root the key is used as-is; below it, it is appended to the parent path.
        path = name if not parent_key else f'{parent_key}{sep}{name}'
        if not isinstance(node, dict):
            continue
        if 'complexity' not in node:
            collected += flatten_data(node, path, sep=sep)
        else:
            collected.append((path, node['complexity']))
    return collected

# Collect every (category, complexity) pair from the loaded JSON data
flattened_data = flatten_data(data)

# Order the pairs from the most complex category to the least complex one
sorted_data = sorted(flattened_data, key=lambda pair: pair[1], reverse=True)

for category, complexity in sorted_data:
    print(f"{category}: Complexity {complexity}")

algorithms_by_data_structure_advanced_data_structures: Complexity 8
algorithms_by_topic_dynamic_programming: Complexity 8
python_language_and_scripting_python_c_extensions: Complexity 8
deep_learning_backpropagation_understanding: Complexity 8
algorithms_by_data_structure_graphs: Complexity 7
algorithms_by_topic_famous_algorithms: Complexity 7
algorithms_by_topic_game_theory: Complexity 7
algorithms_by_topic_divide_and_conquer: Complexity 7
algorithms_by_topic_backtracking: Complexity 7
algorithms_by_topic_combinatorics: Complexity 7
python_language_and_scripting_metaclasses_and_class_factories: Complexity 7
python_language_and_scripting_memory_management_and_python_internals: Complexity 7
python_language_and_scripting_advanced_networking: Complexity 7
python_language_and_scripting_cython_and_PyPy: Complexity 7
python_language_and_scripting_parallel_programming: Complexity 7
unit_testing_methodology_test_ai_and_ml_models: Complexity 7
web_development_web_security: Complexity 7
ml_princ

In [19]:
import json

# Persist the generated problems next to the notebooks as pretty-printed JSON
with open('notebooks/test/problems.json', 'w') as f:
    f.write(json.dumps(problems, indent=4))

In [20]:
# Read the problems back from the JSON file written in the previous cell
with open('notebooks/test/problems.json') as f:
    problems = json.loads(f.read())

In [21]:
from tqdm import tqdm
from src.gdrive_api.folder_upload import upload_folder
from src.gdrive_api.auth import build_service

service = build_service('creds/google__sa.json')
# destination_folder_url = "https://drive.google.com/drive/u/0/folders/1Z1bdYMe2Qmo_vs-OaKDaYIiV3rIqLJH9"
# destination_folder_url = "https://drive.google.com/drive/u/2/folders/1sfPFHkXYpKyY41V0pfz3Qw3k4VLy5Hvb"
destination_folder_url = 'https://drive.google.com/drive/folders/1T14cHENpx7j2aS4Yb_3WhnXhMql0zq2y'

uploaded_files = upload_folder(service, 'notebooks/test', destination_folder_url, force_replace = True, is_url=True)

# Build a Colab link for every file that came back with a URL; the Drive id is taken from
# the "id=" query parameter of that URL.
file_path_to_url = {}
with tqdm(total=len(uploaded_files)) as pbar:
    for file_path, file_url in uploaded_files.items():
        if file_url is not None:
            drive_id = file_url.split("id=")[-1].split("&")[0].strip()
            colab_url = f"https://colab.research.google.com/drive/{drive_id}"
            file_path_to_url[file_path] = colab_url
        else:
            print(f"Skipped uploading {file_path}")
        pbar.update(1)

# Record the link, file name and batch on each problem's metadata.
for file_path, colab_url in file_path_to_url.items():
    if file_path == "problems.json":
        continue
    problem = file_path_to_problem.get(file_path)
    if problem is None:
        # The upload covers the whole folder, so it can include files that were not
        # generated in this session (e.g. notebooks left over from an earlier run).
        # Report and skip them instead of failing with a KeyError.
        print(f"No generated problem matches {file_path}; metadata not updated")
        continue
    problem["metadata"]["colab_url"] = colab_url
    problem["metadata"]["file_path"] = file_path
    problem["metadata"]["batch_idx"] = "test"

------------------------------------------------------------
Processing directory .: 1 of 0 in total.
Uploading file 1 of 41 in '.', 1 of 41 in total.
Uploading new file 'arrays__modification__0.ipynb'.
File 'arrays__modification__0.ipynb' has been uploaded.
Uploaded 'arrays__modification__0.ipynb' to folder ID '1T14cHENpx7j2aS4Yb_3WhnXhMql0zq2y'.
arrays__modification__0.ipynb
Uploading file 2 of 41 in '.', 2 of 41 in total.
Uploading new file 'arrays__modification__1.ipynb'.
File 'arrays__modification__1.ipynb' has been uploaded.
Uploaded 'arrays__modification__1.ipynb' to folder ID '1T14cHENpx7j2aS4Yb_3WhnXhMql0zq2y'.
arrays__modification__1.ipynb
Uploading file 3 of 41 in '.', 3 of 41 in total.
Uploading new file 'arrays__modification__2.ipynb'.
File 'arrays__modification__2.ipynb' has been uploaded.
Uploaded 'arrays__modification__2.ipynb' to folder ID '1T14cHENpx7j2aS4Yb_3WhnXhMql0zq2y'.
arrays__modification__2.ipynb
Uploading file 4 of 41 in '.', 4 of 41 in total.
Replacing exist

100%|██████████| 41/41 [00:00<?, ?it/s]


In [22]:
# Rebuild the problem list from the mapping whose entries were given Colab links in the
# upload cell, then overwrite the JSON file with the updated metadata
problems = [*file_path_to_problem.values()]

with open('notebooks/test/problems.json', 'w') as f:
    f.write(json.dumps(problems, indent=4))

In [23]:
# from google.oauth2 import service_account
# from googleapiclient.discovery import build

# # Path to your service account key file
# SERVICE_ACCOUNT_FILE = 'creds/google__sa.json'

# # The scopes required for the Sheets API
# SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# # The ID of your spreadsheet
# SPREADSHEET_ID = '1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4'


# # Authenticate and build the service
# creds = service_account.Credentials.from_service_account_file(
#         SERVICE_ACCOUNT_FILE, scopes=SCOPES)
# service = build('sheets', 'v4', credentials=creds)

# # Specify the range and values to update
# range_ = 'Conversations_Batch_3!A2:E610'  # For example, this updates cells from A1 to D5 in Sheet1
# values = []

# for problem in problems:
#     values.append([
#         problem["metadata"]["colab_url"],
#         problem["metadata"]["topic"],
#         problem["metadata"]["target_length"],
#         problem["metadata"]["type"],
#         problem["metadata"]["batch_idx"],
#     ])


# body = {
#     'values': values
# }

# # Call the Sheets API to update the range
# request = service.spreadsheets().values().update(spreadsheetId=SPREADSHEET_ID, range=range_, valueInputOption='RAW', body=body)
# response = request.execute()