# Settings & Pointers

In [None]:
service_account_path = 'creds/google__sa.json'
DELIVERY_SHEET_ID = '1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4'


# Download the delivered batch sheet

In [6]:
import pandas as pd
from google.oauth2 import service_account
from googleapiclient.discovery import build

def download_sheet_as_df(service_account_path, sheet_id, sheet_name):
    # Authenticate with the service account
    scopes = ['https://www.googleapis.com/auth/spreadsheets.readonly']
    creds = service_account.Credentials.from_service_account_file(
        service_account_path, scopes=scopes)
    service = build('sheets', 'v4', credentials=creds)

    # Construct the range to read
    sheet_range = f"{sheet_name}!A:Z"  # Adjust the range A:Z as needed

    # Make the API request
    result = service.spreadsheets().values().get(
        spreadsheetId=sheet_id, range=sheet_range).execute()
    values = result.get('values', [])

    # Convert to a DataFrame
    if not values:
        print("No data found.")
        return pd.DataFrame()
    else:
        return pd.DataFrame(values[1:], columns=values[0])

# Usage Example
sheet_name = 'Batch 1'  
df = download_sheet_as_df(service_account_path, DELIVERY_SHEET_ID, sheet_name)
df

Unnamed: 0,task_link,metadata__topic,duration_mins,number_of_turns
0,https://colab.research.google.com/drive/1X4H8L...,algorithms > by_data_structure > strings,119,6
1,https://colab.research.google.com/drive/1G_d7Z...,algorithms > by_data_structure > strings,60,3
2,https://colab.research.google.com/drive/1vYEPg...,algorithms > by_data_structure > strings,50,3
3,https://colab.research.google.com/drive/1_arMF...,algorithms > by_topic > divide_and_conquer,10,1
4,https://colab.research.google.com/drive/10Ahc4...,algorithms > by_topic > divide_and_conquer,10,1
...,...,...,...,...
295,https://colab.research.google.com/drive/1myuw7...,python_language_and_scripting > modules_and_pa...,2,1
296,https://colab.research.google.com/drive/1N-un8...,algorithms > by_topic > backtracking,15,1
297,https://colab.research.google.com/drive/1n0voP...,unit_testing_methodology > test_driven_develop...,10,1
298,https://colab.research.google.com/drive/1n2Iov...,algorithms > by_data_structure > hash_tables,20,1


# Download and convert to messages

In [9]:
import io
import threading

import nbformat
from fuzzywuzzy import fuzz

from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload


def get_closest_match(query, choices):
    """
    Get the closest match(es) to a query string from a list of choices.

    :param query: The query string.
    :param choices: A list of strings to match against.
    :param limit: The maximum number of matches to return.
    """
    best_role = None
    best_score = 0
    for choice in choices:
        score = fuzz.ratio(query, choice)
        if score > best_score and score > 25:
            best_score = score
            best_role = choice

    return best_role, best_score


def notebook_parser(notebook):
    """
    Parse a notebook and extract the message objects.

    :param notebook: The notebook object.
    """
    messages = []
    for cell in notebook.cells[2:]:
        if cell["cell_type"] == "markdown":
            markdown_headers = ["**User**", "**Assistant**"]
            lines = cell["source"].split("\n")
            first_line = lines[0]
            role, score = get_closest_match(first_line, markdown_headers)
            if score>25:
                message = {
                    "role": role.replace("*", "").strip(),
                    "content": "\n".join(lines[1:]).strip("\n"),
                    "type": "markdown"
                }
                messages.append(message)

        elif cell["cell_type"] == "code":
            code_headers = ["# User", "# Assistant"]
            lines = cell["source"].split("\n")
            first_line = lines[0]
            role, score = get_closest_match(first_line, code_headers)
            if score>25:
                message = {
                    "role": role.replace("#", "").strip(),
                    "content": "\n".join(lines[1:]).strip("\n"),
                    "type": "code"
                }
                messages.append(message)
    return messages


def download_and_parse_notebook(service_account_file, file_id):
    # Authenticate with the service account
    credentials = service_account.Credentials.from_service_account_file(
        service_account_file, scopes=['https://www.googleapis.com/auth/drive'])
    service = build('drive', 'v3', credentials=credentials)

    # Request to download the file
    request = service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)

    # Download the file
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download progress: %d%%." % int(status.progress() * 100))

    # Move the buffer's pointer to the beginning
    fh.seek(0)

    # Open the notebook
    notebook = nbformat.read(fh, as_version=4)

    # Parse the notebook
    messages = notebook_parser(notebook)

    # # Extract the first cell
    first_cell = notebook.cells[0]
    lines = first_cell["source"].split("\n")
    metadata = {}
    for line in lines:
        if "**Python Topics**" in line:
            metadata["topic"] = line.split(" - ")[1]
        if "**Type**" in line:
            metadata["type"] = line.split(" - ")[1]
        if "**Target Number of Turns (User + Assistant)**" in line:
            metadata["target_turns"] = line.split(" - ")[1]

    return {
        "id": file_id,
        "metadata": metadata,
        "messages":messages
    }


def threading_processor(service_account_path, file_id, results):
    results.append(download_and_parse_notebook(service_account_path, file_id))



threads = []
parsed_conversations = []
for i in range(len(df)):
    file_id = df["task_link"][i].split("/")[-1]
    thread = threading.Thread(target=threading_processor, args=(service_account_path, file_id, parsed_conversations))
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()


Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


In [10]:
parsed_conversations[0]

{'id': '1vYEPgcifwiWo9SawRIAHGlpi3-fSHsYb',
 'metadata': {'topic': 'algorithms > by_data_structure > strings',
  'type': 'query',
  'target_turns': '3'},
 'messages': [{'role': 'User',
   'content': 'Can you provide a snippet to concatenate a list of strings into a single string, separated by commas?',
   'type': 'markdown'},
  {'role': 'Assistant',
   'content': 'To concatenate a list of strings into a single string with commas as separators, you can use the `join()` method in Python.',
   'type': 'markdown'},
  {'role': 'Assistant',
   'content': 'def concatenate_strings(string_list):\n\n    """\n    Concatenates a list of strings into a single string, separated by commas.\n\n    Parameters:\n    string_list (list of str): A list of strings to be concatenated.\n\n    Returns:\n    str: A single string concatenated from the string list, separated by commas.\n    """\n\n    return \', \'.join(string_list)\n\nwords = [\'apple\', \'banana\', \'cherry\']\nresult = concatenate_strings(word

In [12]:
count_valid = 0
count_invalid = 0
invalid_roles = []
for conversation in parsed_conversations:
    for message in conversation:
        if message["role"] in ["User", "Assistant"]:
            count_valid += 1
        else:
            count_invalid += 1
            invalid_roles.append(message["role"])

print(f"Valid messages: {count_valid}")
print(f"Invalid messages: {count_invalid}")

TypeError: string indices must be integers

In [26]:
import os

os.mkdir("jsonl_conversations")

In [27]:
os.mkdir("jsonl_conversations/Batch 1")

In [13]:
import json

for i, conversation in enumerate(parsed_conversations):
    drive_id = conversation["id"] 
    messages = conversation["messages"]
    with open(f"jsonl_conversations/Batch 1/{drive_id}.jsonl", "w") as f:
        for message in messages:
            f.write(json.dumps(message) + "\n")

In [None]:
from src.gdrive_api import build_service
from src.gdrive_api.folder_upload import upload_folder

service_account_path = 'creds/google__sa.json'
service = build_service(service_account_path)
destination_folder_url = 'https://drive.google.com/drive/u/0/folders/1zK2kB0l2gcMTbK_CjbrOZ9rTQ6OKsaCw'
uploaded_files = upload_folder(service, 'jsonl_conversations/Batch 1/', destination_folder_url, force_replace = True, is_url=True)
uploaded_files

# Inspect the jsonl google drive folder to list file_names vs drive links

In [15]:
from google.oauth2 import service_account
from googleapiclient.discovery import build

# Path to your service account JSON key file
SERVICE_ACCOUNT_FILE = 'creds/google__sa.json'

# Define the scopes
SCOPES = ['https://www.googleapis.com/auth/drive']

# Authenticate and create the service
credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)

service = build('drive', 'v3', credentials=credentials)

# Replace with your Google Drive folder ID
folder_id = '1zK2kB0l2gcMTbK_CjbrOZ9rTQ6OKsaCw'

# Query to get files from the folder
query = f"'{folder_id}' in parents"

# Specify the page size
page_size = 1000  # You can change this number as needed

# Initialize page token
page_token = None


all_files = []
# Loop through pages of results
while True:
    response = service.files().list(q=query,
                                    spaces='drive',
                                    fields='nextPageToken, files(id, name, webViewLink)',
                                    pageSize=page_size,
                                    pageToken=page_token).execute()

    # Print file names and URLs
    for file in response.get('files', []):
        all_files.append(file)
        print('Found file: %s (%s) %s' % (file.get('name'), file.get('id'), file.get('webViewLink')))

    # Update the page token
    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break

Found file: 1jsED2F__k8qamKqxpnydyalGvHXW8fWT.jsonl (129yINl_W2iA-g33wbIND9nALGkZDXVRc) https://drive.google.com/file/d/129yINl_W2iA-g33wbIND9nALGkZDXVRc/view?usp=drivesdk
Found file: 1LDAx8y_748woY_6WjGFq27jLc4aMz0xe.jsonl (1Dq2-TmiMZFoPmAyzIwLRKa5vwCkMbLC7) https://drive.google.com/file/d/1Dq2-TmiMZFoPmAyzIwLRKa5vwCkMbLC7/view?usp=drivesdk
Found file: 13Rb927-8UjXiAmR69M-hCogWVdmAe3Xo.jsonl (15fXbV08_2hXfZ6WGaolzbAdEkdcL7__2) https://drive.google.com/file/d/15fXbV08_2hXfZ6WGaolzbAdEkdcL7__2/view?usp=drivesdk
Found file: 16RHmW4pKyXaScuVljGZs9i6erLgDvZ5I.jsonl (1UY_M9T7y1YXj-kl81Obkitf6l5nTfdJv) https://drive.google.com/file/d/1UY_M9T7y1YXj-kl81Obkitf6l5nTfdJv/view?usp=drivesdk
Found file: 1I-VPHR1LFw7qi-tuO9dsg2bdHmODwzky.jsonl (1JpRJTcJ1YKf6ee55g17woVTW8JuyACkt) https://drive.google.com/file/d/1JpRJTcJ1YKf6ee55g17woVTW8JuyACkt/view?usp=drivesdk
Found file: 1LRPOL_7JLVLtuRLNdwv4WlLNraMHi6h3.jsonl (17HgcghGwrphuJsG8S4R_9pI2FbQKX6r8) https://drive.google.com/file/d/17HgcghGwrphuJsG8S4R

In [17]:
jsonl_df = pd.DataFrame(all_files)
jsonl_df

Unnamed: 0,webViewLink,id,name
0,https://drive.google.com/file/d/129yINl_W2iA-g...,129yINl_W2iA-g33wbIND9nALGkZDXVRc,1jsED2F__k8qamKqxpnydyalGvHXW8fWT.jsonl
1,https://drive.google.com/file/d/1Dq2-TmiMZFoPm...,1Dq2-TmiMZFoPmAyzIwLRKa5vwCkMbLC7,1LDAx8y_748woY_6WjGFq27jLc4aMz0xe.jsonl
2,https://drive.google.com/file/d/15fXbV08_2hXfZ...,15fXbV08_2hXfZ6WGaolzbAdEkdcL7__2,13Rb927-8UjXiAmR69M-hCogWVdmAe3Xo.jsonl
3,https://drive.google.com/file/d/1UY_M9T7y1YXj-...,1UY_M9T7y1YXj-kl81Obkitf6l5nTfdJv,16RHmW4pKyXaScuVljGZs9i6erLgDvZ5I.jsonl
4,https://drive.google.com/file/d/1JpRJTcJ1YKf6e...,1JpRJTcJ1YKf6ee55g17woVTW8JuyACkt,1I-VPHR1LFw7qi-tuO9dsg2bdHmODwzky.jsonl
...,...,...,...
295,https://drive.google.com/file/d/1lZA-oOwLfbnWq...,1lZA-oOwLfbnWqQ_c1__T9nO97WcMNKVJ,1_Flx9u5t-Mx0n30imFa6sQkLAmGoq5dm.jsonl
296,https://drive.google.com/file/d/1sc4ZTHOTbHPgq...,1sc4ZTHOTbHPgqSq466FzPrDeLEu7A-_f,1FddVMFIKBFIeaM8p41OToZ5OkW9Gfy6n.jsonl
297,https://drive.google.com/file/d/1ZzyTxVLgd9KMn...,1ZzyTxVLgd9KMnjKDw945t-ZMdd9x9dVx,1gccW2ROHaALRj2I5qaQhPhBFna9T9DxU.jsonl
298,https://drive.google.com/file/d/1etgO_J9tXazfG...,1etgO_J9tXazfGevcjF6es1hqYOb7LJf4,1dpQO4_shICvw1ff_SzKU93vzwQI7VD8P.jsonl


# Join to identify which jsonl belongs to which colab

In [19]:
df["colab_id"] = df["task_link"].apply(lambda x: x.split("/")[-1])
jsonl_df["colab_id"] = jsonl_df["name"].apply(lambda x: x.split(".")[0])


df_merged = df.merge(jsonl_df, on="colab_id", how="inner")
df_merged = df_merged[["task_link", "metadata__topic", "duration_mins", "number_of_turns", "webViewLink"]]
df_merged = df_merged.rename(columns={"webViewLink": "jsonl_link"})
df_merged

Unnamed: 0,task_link,metadata__topic,duration_mins,number_of_turns,jsonl_link
0,https://colab.research.google.com/drive/1X4H8L...,algorithms > by_data_structure > strings,119,6,https://drive.google.com/file/d/1cAGKy7u_sYzZW...
1,https://colab.research.google.com/drive/1G_d7Z...,algorithms > by_data_structure > strings,60,3,https://drive.google.com/file/d/1ql2hECkLQG3pI...
2,https://colab.research.google.com/drive/1vYEPg...,algorithms > by_data_structure > strings,50,3,https://drive.google.com/file/d/1YMkZcyiEqpqo3...
3,https://colab.research.google.com/drive/1_arMF...,algorithms > by_topic > divide_and_conquer,10,1,https://drive.google.com/file/d/1Z2bdCHcRWyr6k...
4,https://colab.research.google.com/drive/10Ahc4...,algorithms > by_topic > divide_and_conquer,10,1,https://drive.google.com/file/d/1n7v8gQtsbwrbB...
...,...,...,...,...,...
295,https://colab.research.google.com/drive/1myuw7...,python_language_and_scripting > modules_and_pa...,2,1,https://drive.google.com/file/d/1db4GqcAKqihjg...
296,https://colab.research.google.com/drive/1N-un8...,algorithms > by_topic > backtracking,15,1,https://drive.google.com/file/d/1j852p0QrExf-X...
297,https://colab.research.google.com/drive/1n0voP...,unit_testing_methodology > test_driven_develop...,10,1,https://drive.google.com/file/d/1AGHX5gGk8ncvh...
298,https://colab.research.google.com/drive/1n2Iov...,algorithms > by_data_structure > hash_tables,20,1,https://drive.google.com/file/d/1o21BJjn3sphNu...


In [20]:
df_merged.to_csv("batch_1_with_jsonl.csv", index=False)

In [None]:
import os

files = os.listdir("jsonl_conversations/Batch 1/")

for file in files:
    drive_id = file.split(".")[0]
    colab_link = f"https://colab.research.google.com/drive/{drive_id}"

    