In [5]:
import os
from openai import OpenAI


# Read the API key from the environment (never hard-code it) and pass it to the
# client explicitly so the variable is actually used.
# NOTE(review): with api_key=None the OpenAI client is documented to fall back
# to its own OPENAI_API_KEY lookup, so a missing variable should fail the same
# way as before — confirm against the installed openai version.
openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

def generate_algorithm_question(topic, n=2):
    """Ask the chat model for `n` easy practice problems about `topic`.

    Args:
        topic: Subject text inserted verbatim into the user prompt.
        n: Number of problems to request. The default of 2 reproduces the
            prompt this function previously hard-coded.

    Returns:
        The message content of the first completion choice, or ``None`` when
        the request (or reading the response) raised an exception.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {"role": "system", "content": "You are a world class Python developer. And you're concise and precise."},
                {"role": "user", "content": f"Create {n} easy problems about {topic}."}
            ],
            # Zero temperature plus a fixed seed: request repeatable output.
            temperature=0.0,
            max_tokens=4096,
            seed=42,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort helper: report the failure and signal it with None so
        # callers can simply test the result for truthiness.
        print(f"An error occurred: {e}")
        return None

# Example usage: request problems for a single topic and show them under a heading.
topic = "fundamental algorithmic problems"
questions = generate_algorithm_question(topic)
if questions:
    # One print call; the newline separator yields the same two output lines.
    print("Generated Algorithm Questions:", questions, sep="\n")

Generated Algorithm Questions:
### Problem 1: Find the Missing Number

**Description:**
You are given a list of `n-1` integers and these integers are in the range of 1 to `n`. There are no duplicates in the list. One of the integers is missing in the list. Write an algorithm to find the missing integer.

**Example:**
Input: `[1, 2, 4, 6, 3, 7, 8]`
Output: `5`

**Python Function:**
```python
def find_missing_number(numbers):
    # Your code here
    pass
```

**Constraints:**
- The list of numbers will always be unsorted.
- You should aim to have a solution with a time complexity of O(n).

---

### Problem 2: Check if String is a Palindrome

**Description:**
A palindrome is a word, phrase, number, or other sequences of characters that reads the same forward and backward (ignoring spaces, punctuation, and capitalization). Write a function that checks if a given string is a palindrome.

**Example:**
Input: `"A man, a plan, a canal: Panama"`
Output: `True`

**Python Function:**
```python
d

In [6]:
# Same request as the first example, for a different topic.
topic = "Python language fundamentals"
questions = generate_algorithm_question(topic)
if questions:
    # One print call; the newline separator yields the same two output lines.
    print("Python language fundamentals:", questions, sep="\n")

Python language fundamentals and scripting techniques:
Certainly! Here are two problems that cover Python language fundamentals:

### Problem 1: Data Types and Manipulation

**Title:** Grocery List Organizer

**Description:**
You are tasked with writing a Python program to help organize a grocery list. The list contains items and their respective quantities. Your program should be able to handle input in the form of a string, where each item and its quantity are separated by a comma, and each pair is separated by a semicolon.

**Input:**
A string representing the grocery list. For example: `"apples:4,bananas:2,oranges:5; milk:1,bread:2; eggs:12"`

**Output:**
Your program should output a dictionary where the keys are the items and the values are the total quantities. The items should be sorted alphabetically.

**Example:**
```python
input_string = "apples:4,bananas:2,oranges:5; milk:1,bread:2; eggs:12"
# Your function should return the following dictionary:
# {'apples': 4, 'bananas': 2

In [7]:
# The topic is interpolated mid-sentence into the user prompt
# ("Create ... problems about {topic}."), so it must not carry the trailing
# colon that belongs only to the printed heading below.
topic = "Python scripting techniques"
questions = generate_algorithm_question(topic)
if questions:
    print("Python scripting techniques:")
    print(questions)

Python scripting techniques:
### Problem 1: File Transformation Pipeline

#### Background:
You are tasked with creating a Python script that processes a series of text files. Each file contains multiple lines of data, where each line is a comma-separated list of values. The script should perform a series of transformations on these files to produce a new set of processed files.

#### Requirements:
1. Read all `.txt` files from a directory called `input_data`.
2. For each file, perform the following transformations:
   - Trim leading and trailing whitespace from each line.
   - If a line contains more than 5 values, only keep the first 5 values.
3. After processing each file, write the transformed lines to a new file with the same name in a directory called `processed_data`.
4. Ensure that the `processed_data` directory is created if it does not exist.
5. Log each step of the process, including:
   - The name of the file being processed.
   - The number of lines read.
   - The number of

# Human-Like Questions

In [18]:
import json



def _request_question_payload(system_prompt, user_prompt):
    """Send one system/user prompt pair and return the parsed question payload.

    Shared by both public generators below, which differ only in their prompts.

    Args:
        system_prompt: Full text of the system message.
        user_prompt: Full text of the user message.

    Returns:
        The decoded JSON object (a dict holding a "questions" list), or
        ``None`` when the request fails, the reply is not valid JSON, or the
        reply has no "questions" list. The error is printed in every failure
        case.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            # Zero temperature plus a fixed seed: request repeatable output.
            temperature=0.0,
            max_tokens=4096,
            seed=42,
            response_format={"type": "json_object"},
        )
        payload = json.loads(response.choices[0].message.content)
        # Callers index payload["questions"] directly, so reject replies that
        # are valid JSON but have the wrong shape instead of handing them back.
        if not isinstance(payload, dict) or not isinstance(payload.get("questions"), list):
            raise ValueError('response JSON does not contain a "questions" list')
        return payload
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


def generate_human_like_questions(topic, n=5):
    """Generate `n` human-sounding, code-eliciting Python questions about `topic`.

    Args:
        topic: Topic text placed in the user message.
        n: Number of questions to ask for.

    Returns:
        A dict of the form ``{"questions": [...]}``, or ``None`` on failure.
    """
    # Plain (non-f) string: nothing is interpolated, so the JSON example can use
    # single braces. The "\n" escape after "chat interface. " keeps the
    # original prompt's trailing space without leaving trailing whitespace in
    # the source.
    SYSTEM_PROMPT = """IDENTITY:
You are a world class Python developer. And you're concise and precise.

CONTEXT:
We are trying to generate human-like queries that a user would send to an ai assistant through a chat interface. \nThe user's query tone & structure should be diversified as much as possible making sure to include some realistic examples.

CONSTRAINTS:
1. Python Related
2. Easy (Solvable by a median developer in 15 minutes)
3. Questions should elicit a response that includes code.

INSTRUCTION:
You will be given a topic and an ask for number of questions to generate.
Act accordingly.

RESPONSE FORMAT:
A JSON-valid list of questions(strings) like {"questions": ["question1", "question2", ...]}
"""
    return _request_question_payload(
        SYSTEM_PROMPT,
        f"Topic: {topic} \nNumber of questions: {n}",
    )


def generate_human_like_code_modification_requests(topic, n=5):
    """Generate `n` human-sounding "please modify this code" requests about `topic`.

    Args:
        topic: Topic text placed in the user message.
        n: Number of modification requests to ask for.

    Returns:
        A dict of the form ``{"questions": [...]}``, or ``None`` on failure.
    """
    # Plain (non-f) string, as nothing is interpolated. The closing quotes stay
    # indented so the prompt text sent to the model is unchanged (it ends with a
    # newline followed by four spaces).
    SYSTEM_PROMPT = """IDENTITY:
You are a world class Python developer. And you're concise and precise.

CONTEXT:
We are trying to generate human-like code modification requests that a user would send to an ai assistant through a chat interface.
The user's query tone & structure should be diversified as much as possible making sure to include some realistic examples.

CONSTRAINTS:
1. Python Related
2. Easy (Solvable by a median developer in 15 minutes)
3. Questions should include code along with a request to modify it.

INSTRUCTION:
You will be given a topic and an ask for number of questions to generate.
Act accordingly.

RESPONSE FORMAT:
A JSON-valid list of questions(strings) like {"questions": ["question1", "question2", ...]}
    """
    return _request_question_payload(
        SYSTEM_PROMPT,
        f"Topic: {topic} \nNumber of code modification requests: {n}",
    )

In [12]:
import json

# Load "topic_hierarchy.json" into a dict.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('topic_hierarchy.json', encoding='utf-8') as json_file:
    topic_hierarchy = json.load(json_file)

# Crawl the keys recursively and concatenate each unique path into a separate string and return a list of all unique paths
def crawl_keys(d, sep=' > ', prefix=''):
    """Return one path string per leaf key of a nested dict, in traversal order.

    A key is a leaf when its value is not a dict or is an empty dict. Each
    path is `prefix` followed by the keys from the root down to the leaf,
    joined with `sep`.
    """
    def _walk(node, lead):
        # Depth-first, visiting keys in the dict's own iteration order.
        for key, child in node.items():
            here = lead + key
            if isinstance(child, dict) and child:
                yield from _walk(child, here + sep)
            else:
                yield here

    return list(_walk(d, prefix))

# Flatten the nested hierarchy into one " > "-joined path string per leaf topic.
all_topics = crawl_keys(topic_hierarchy)
print(f"Total number of topics: {len(all_topics)}")
# Bare last expression: the notebook renders the whole list as the cell output.
all_topics

Total number of topics: 76


['algorithms > by_data_structure > arrays',
 'algorithms > by_data_structure > linked_lists',
 'algorithms > by_data_structure > stacks',
 'algorithms > by_data_structure > queues',
 'algorithms > by_data_structure > trees',
 'algorithms > by_data_structure > graphs',
 'algorithms > by_data_structure > hash_tables',
 'algorithms > by_data_structure > heaps',
 'algorithms > by_data_structure > strings',
 'algorithms > by_data_structure > advanced_data_structures',
 'algorithms > by_topic > dynamic_programming',
 'algorithms > by_topic > famous_algorithms',
 'algorithms > by_topic > greedy_algorithms',
 'algorithms > by_topic > recursion',
 'algorithms > by_topic > searching',
 'algorithms > by_topic > sorting',
 'algorithms > by_topic > math',
 'algorithms > by_topic > bit_manipulation',
 'algorithms > by_topic > geometry',
 'algorithms > by_topic > probability',
 'algorithms > by_topic > game_theory',
 'algorithms > by_topic > divide_and_conquer',
 'algorithms > by_topic > backtracking

In [13]:
from tqdm import tqdm

# Build one single-turn "query" problem per generated question, 5 per topic.
problems = []
for topic in tqdm(all_topics):
    questions = generate_human_like_questions(topic, 5)
    if questions is None:
        # The generator prints the error and returns None on failure; skip the
        # topic so one bad request does not abort the whole run with a
        # TypeError on questions["questions"].
        print(f"Skipping topic with no generated questions: {topic}")
        continue
    for question in questions["questions"]:
        problems.append({
            "metadata": {
                "topic": topic,
                "type": "query",
                "difficulty": "Easy",
                "target_length": 1
            },
            "messages": [
                {"role": "user", "content": question},
            ]
        })


  0%|          | 0/76 [00:00<?, ?it/s]

100%|██████████| 76/76 [06:20<00:00,  5.00s/it]


In [19]:
# Append single-turn "modification" problems, 3 per topic, to the existing
# `problems` list. Re-running this cell appends the same kind of entries again.
for topic in tqdm(all_topics):
    questions = generate_human_like_code_modification_requests(topic, 3)
    if questions is None:
        # The generator prints the error and returns None on failure; skip the
        # topic so one bad request does not abort the whole run with a
        # TypeError on questions["questions"].
        print(f"Skipping topic with no generated modification requests: {topic}")
        continue
    for question in questions["questions"]:
        problems.append({
            "metadata": {
                "topic": topic,
                "type": "modification",
                "difficulty": "Easy",
                "target_length": 1
            },
            "messages": [
                {"role": "user", "content": question},
            ]
        })

100%|██████████| 76/76 [13:42<00:00, 10.82s/it]


In [10]:
# Give every problem a title "1T__<leaf topic>__<type>__<running index>" and
# remember which problem each "<title>.ipynb" file name belongs to.
problem_titles, problem_topic_counts, file_path_to_problem = [], {}, {}
for problem in problems:
    # Only the last segment of the " > "-joined topic path goes into the title.
    topic_type = '{}__{}'.format(
        problem["metadata"]["topic"].rsplit(" > ", 1)[-1],
        problem["metadata"]["type"],
    )
    # Running index per (leaf topic, type) pair, starting at 0.
    idx = problem_topic_counts.setdefault(topic_type, 0)
    problem_topic_counts[topic_type] = idx + 1
    title = '1T__{}__{}'.format(topic_type, idx)
    file_path_to_problem[title + ".ipynb"] = problem
    problem_titles.append(title)

In [33]:
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell


# Write one notebook per problem: a metadata cell, a conversation header and
# the user's message. Create the target directory first so a fresh checkout
# does not fail on the first open().
output_dir = 'notebooks/v0'
os.makedirs(output_dir, exist_ok=True)

for p, t in zip(problems, problem_titles):
    # Create a new notebook
    notebook = new_notebook()
    # Add metadata
    metadata = f"""# Metadata

**Python Topics** - {p["metadata"]["topic"]}

**Type** - {p["metadata"]["type"]}

**Target Number of Turns (User + Assistant)** - 1
"""
    metadata_cell = new_markdown_cell(metadata)
    notebook.cells.append(metadata_cell)

    # Add conversation header
    conversation_header = "# Conversation"
    conversation_header_cell = new_markdown_cell(conversation_header)
    notebook.cells.append(conversation_header_cell)

    # Add User Message
    user_message = f"""**User**

{p["messages"][0]["content"]}
"""
    user_message_cell = new_markdown_cell(user_message)
    notebook.cells.append(user_message_cell)

    # Save the notebook. Pin UTF-8 so any non-ASCII characters in a generated
    # question do not depend on the platform's default encoding.
    with open(f'{output_dir}/{t}.ipynb', 'w', encoding='utf-8') as f:
        nbformat.write(notebook, f)


In [34]:
import json

# Persist every generated problem next to the notebooks as indented JSON.
with open('notebooks/v0/problems.json', 'w') as problems_file:
    json.dump(problems, problems_file, indent=4)

In [1]:
# Reload the saved problems list from disk.
with open('notebooks/v0/problems.json') as problems_file:
    problems = json.load(problems_file)

In [11]:
problems[0]

{'metadata': {'topic': 'algorithms > by_data_structure > arrays',
  'difficulty': 'Easy',
  'target_length': 1,
  'type': 'query',
  'colab_url': 'https://colab.research.google.com/drive/1u9md_qevq_zqSN_r11OwmwOutbPgEDeu',
  'batch_idx': 'v0'},
 'messages': [{'role': 'user',
   'content': 'How can I reverse an array in Python without using built-in functions?'}]}

In [13]:
from tqdm import tqdm
from src.gdrive_api.folder_upload import upload_folder
from src.gdrive_api.auth import build_service

# Upload the whole notebooks folder to the destination Drive folder.
service = build_service('creds/google__sa.json')
destination_folder_url = "https://drive.google.com/drive/u/0/folders/1Z1bdYMe2Qmo_vs-OaKDaYIiV3rIqLJH9"
uploaded_files = upload_folder(service, 'notebooks/v0', destination_folder_url, force_replace = True, is_url=True)

# Turn each returned file URL into a Colab URL keyed by file path.
file_path_to_url = {}
for file_path, file_url in tqdm(uploaded_files.items(), total=len(uploaded_files)):
    if file_url is None:
        print(f"Skipped uploading {file_path}")
        continue
    # NOTE(review): assumes upload_folder returns URLs that carry the Drive
    # file id in an "id=" query parameter — confirm against upload_folder.
    drive_id = file_url.split("id=")[-1].split("&")[0].strip()
    file_path_to_url[file_path] = f"https://colab.research.google.com/drive/{drive_id}"

# Record where each problem's notebook ended up on the problem itself.
for file_path, colab_url in file_path_to_url.items():
    if file_path == "problems.json":
        continue
    problem = file_path_to_problem.get(file_path)
    if problem is None:
        # The folder may hold files that are not in the current mapping (for
        # example notebooks left over from an earlier run). Report and skip
        # instead of stopping halfway with a KeyError.
        print(f"No problem registered for {file_path}; metadata not updated")
        continue
    problem["metadata"]["colab_url"] = colab_url
    problem["metadata"]["file_path"] = file_path
    problem["metadata"]["batch_idx"] = "v0"

In [17]:
# Take the problems from file_path_to_problem (the upload cell writes
# colab_url / file_path / batch_idx into those entries) and overwrite
# problems.json with them.
problems = [*file_path_to_problem.values()]

with open('notebooks/v0/problems.json', 'w') as problems_file:
    json.dump(problems, problems_file, indent=4)

In [22]:
from google.oauth2 import service_account
from googleapiclient.discovery import build

# Path to your service account key file
SERVICE_ACCOUNT_FILE = 'creds/google__sa.json'

# The scopes required for the Sheets API
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# The ID of your spreadsheet
SPREADSHEET_ID = '1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4'

# Authenticate and build the service
creds = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
service = build('sheets', 'v4', credentials=creds)

# One row per problem, in column order A..E:
# colab URL, topic, target length, type, batch index.
values = [
    [
        problem["metadata"]["colab_url"],
        problem["metadata"]["topic"],
        problem["metadata"]["target_length"],
        problem["metadata"]["type"],
        problem["metadata"]["batch_idx"],
    ]
    for problem in problems
]

# Rows are written to the "Conversations" sheet starting at row 2. Size the
# range from the data instead of hard-coding the last row, so a longer problem
# list still fits.
# NOTE(review): the Sheets API is expected to reject values that exceed the
# requested range — confirm against the values.update documentation.
first_row = 2
last_row = max(first_row + len(values) - 1, first_row)
range_ = f'Conversations!A{first_row}:E{last_row}'

body = {
    'values': values
}

# Call the Sheets API to update the range
request = service.spreadsheets().values().update(spreadsheetId=SPREADSHEET_ID, range=range_, valueInputOption='RAW', body=body)
response = request.execute()