In [7]:
import sys 
# Extend the import search path two directories up; the `src` package imported
# by later cells is presumably located there (TODO confirm).
sys.path.append('../../')

# Service-account key file passed to the Sheets/Drive helpers in later cells.
service_account_file = 'creds/google__sa.json'

# Progress-tracking spreadsheet and the worksheets (batches 2 through 5) read from it.
tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"
included_sheet_names = [f"Conversations_Batch_{batch_no}" for batch_no in range(2, 6)]

# Google Drive folder IDs (going by the name, folders with the notebooks);
# not referenced by the cells visible in this part of the notebook.
jupyter_gdrive_folder_ids = [
    "1Z1bdYMe2Qmo_vs-OaKDaYIiV3rIqLJH9",  # V0
    "1sfPFHkXYpKyY41V0pfz3Qw3k4VLy5Hvb",  # V1
    "1jV7WA5zB172DJUp7Z2XzHr62E6U6_NtY",
]

# Spreadsheet listing delivered tasks, and the Drive folder ID for delivery
# JSONL files (the latter is not referenced by the cells visible here).
delivery_sheet_id = "1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4"
delivery_jsonl_gdrive_folder_id = "1b3UuMfgwxpOsW0GnsdsrEBWdjUvg8Ub7"

# CSV with the GPT review results; loaded with pandas further down.
gpt_reviews_path = "gpt_reviews.csv"

In [5]:
#########################
    # Colab #
#########################

from concurrent.futures import ThreadPoolExecutor, as_completed
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from tqdm import tqdm
import pandas as pd

def get_file_name_from_colab_link(colab_link, service_account_file):
    """Look up the Google Drive file name of the notebook behind a Colab link.

    Args:
        colab_link: Colab URL containing ``/drive/<file_id>``. Anything that
            follows the ID (a ``?query``, a ``#fragment`` or further path
            segments) is ignored.
        service_account_file: Path to the service-account JSON key used to
            authenticate against the Drive API.

    Returns:
        The file's ``name`` as reported by Drive, or ``None`` when the link is
        not a string, contains no ``/drive/<file_id>`` part, or the Drive
        request fails.
    """
    # Empty cells may arrive as None/NaN instead of a string; treat them as
    # "no link" rather than letting `.split` raise AttributeError.
    if not isinstance(colab_link, str):
        return None

    _, marker, remainder = colab_link.partition('/drive/')
    if not marker:
        return None

    # Keep only the ID itself: shared links often carry `?usp=sharing` or
    # `#scrollTo=...`, which would otherwise be sent to Drive as part of the ID.
    file_id = remainder
    for delimiter in ('/', '?', '#'):
        file_id = file_id.split(delimiter, 1)[0]
    file_id = file_id.strip()
    if not file_id:
        return None

    # NOTE(review): this requests the full Drive scope although only file
    # metadata is read; a narrower scope may suffice - confirm before changing.
    SCOPES = ['https://www.googleapis.com/auth/drive']
    # A fresh client is built per call, so nothing is shared between the
    # worker threads that invoke this function.
    credentials = Credentials.from_service_account_file(service_account_file, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    try:
        file = service.files().get(fileId=file_id).execute()
        return file.get('name')
    except Exception:
        # Best-effort lookup: inaccessible or missing files simply yield None.
        return None


def fetch_file_names_parallel(links, service_account_file, max_workers=100):
    """Resolve many Colab links to Drive file names using a thread pool.

    Args:
        links: Iterable of Colab links (hashable values, e.g. a list or a
            pandas Series of strings).
        service_account_file: Path to the service-account JSON key forwarded
            to ``get_file_name_from_colab_link``.
        max_workers: Upper bound on concurrently running lookups.

    Returns:
        A dict mapping each distinct link to its file name, or to ``None``
        when the lookup returned nothing or raised.
    """
    # Results are keyed by link, so looking the same link up twice only wastes
    # a Drive round trip; deduplicate while keeping first-seen order.
    unique_links = list(dict.fromkeys(links))
    results = {}
    if not unique_links:
        return results

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(get_file_name_from_colab_link, link, service_account_file): link
            for link in unique_links
        }
        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching File Names"):
            link = futures[future]
            try:
                results[link] = future.result()
            except Exception:
                # Best-effort: a failed lookup must not abort the whole batch.
                results[link] = None
    return results


In [8]:
from src.sheets_utils import download_sheet_as_df

# Worksheets of the delivery spreadsheet that list already-delivered tasks.
delivered_sheet_names = ["Batch 1", "Batch 2", "Batch 3"]
# Tasks whose `avg_jdg_score` is at or below this value are flagged.
gpt_flag_threshold = 2.5

# Download every tracked progress worksheet, printing its name and shape.
progress_batches = []
for sheet_name in included_sheet_names:
    print(sheet_name)
    bdf = download_sheet_as_df(service_account_file, tracking_sheet_id, sheet_name)
    progress_batches.append(bdf)
    print(bdf.shape)

# All delivered tasks, stacked into one frame.
delivered = pd.concat(
    [
        download_sheet_as_df(service_account_file, delivery_sheet_id, delivered_sheet_name)
        for delivered_sheet_name in delivered_sheet_names
    ],
    ignore_index=True,
)

# Combine the progress batches and attach the Drive file name behind each task link.
df = pd.concat(progress_batches, ignore_index=True)
file_names = fetch_file_names_parallel(df['task_link'], service_account_file)
df['task_file_name'] = df['task_link'].map(file_names)

# Keep tasks marked "Done" whose link does not appear in any delivered sheet.
completed_df = df[df["completion_status"] == "Done"]
completed_to_be_delivered_df = completed_df[~completed_df["task_link"].isin(delivered["task_link"])]

gpt_reviews_df = pd.read_csv(gpt_reviews_path)

# `merge` defaults to an inner join, so tasks without a row in the reviews CSV
# are dropped at this point.
completed_to_be_delivered_df = completed_to_be_delivered_df.merge(gpt_reviews_df, on="task_link")

# Split on the threshold. Comparisons with NaN are False on both sides, so rows
# without a score land in neither frame; report them instead of losing them silently.
avg_scores = completed_to_be_delivered_df["avg_jdg_score"]
unscored_count = int(avg_scores.isna().sum())
if unscored_count:
    print(
        f"WARNING: {unscored_count} merged task(s) have no avg_jdg_score and are "
        "excluded from both the flagged and the deliverable set"
    )
flagged_gpt_reviewer_df = completed_to_be_delivered_df[avg_scores <= gpt_flag_threshold]
completed_to_be_delivered_df = completed_to_be_delivered_df[avg_scores > gpt_flag_threshold]
flagged_gpt_reviewer_df

Unnamed: 0,task_link,metadata__topic,assigned_to_email,completion_status,modified_question?,duration_mins,completion_date,comments,metadata__problem_type,metadata__target_length,...,reviewer_email,Team_Type(Internal/External),metadata__type,modified_question,task_file_name,code_feedback,lang_feedback,code_judgment,lang_judgment,avg_jdg_score
1,https://colab.research.google.com/drive/1cP6qz...,unit_testing_methodology > test_ai_and_ml_models,patelia.a@turing.com,Done,FALSE,,,,modification,1,...,,External,,,1T__test_ai_and_ml_models__modification__0.ipynb,#Turn 1:\n\n## Code(2/5):\n**Critical_Issues**...,#Turn 1:\n\n## Language(4/5):\nNone,The function's name and behavior are mismatche...,The assistant's reply is not provided for revi...,1.5
9,https://colab.research.google.com/drive/1d613I...,algorithms > by_data_structure > trees,ritesh.r@turing.com,Done,,35,12/28/2023,,,2+,...,ruturaj.m@turing.com,,query,FALSE,trees__query__1.ipynb,#Turn 1:\n\n## Code(4/5):\n**Medium_Issues**\n...,#Turn 1:\n\n## Language(4/5):\n**Medium_Issues...,"Turn 1 contains a redundant check for None, wh...",The assistant's explanations contain redundanc...,2.5
16,https://colab.research.google.com/drive/1xw5RA...,database > indexing_and_search_performance,kumbar.r@turing.com,Done,,60,12/29/2023,,,2+,...,,,query,FALSE,indexing_and_search_performance__query__2.ipynb,#Turn 1:\n\n## Code(1/5):\n**Critical_Issues**...,#Turn 1:\n\n## Language(3/5):\n**Medium_Issues...,The code provided in each turn either lacks ac...,The responses are generally clear but lack det...,2.5
17,https://colab.research.google.com/drive/1IMCwR...,algorithms > by_data_structure > hash_tables,ritesh.r@turing.com,Done,,25,12/28/2023,,,2+,...,,,query,FALSE,hash_tables__query__0.ipynb,#Turn 1:\n\n## Code(4/5):\nNone\n\n======\n\n#...,#Turn 1:\n\n## Language(4/5):\nNone\n\n======\...,The code contains a variable name inconsistenc...,The assistant's response contained a major mis...,2.5
41,https://colab.research.google.com/drive/1cQ2Az...,deep_learning > loss_functions,gedeon.a@turing.com,Done,,63,1/1/2024,,,2+,...,,,query,FALSE,loss_functions__query__0.ipynb,#Turn 1:\n\n## Code(2/5):\n**Critical_Issues**...,#Turn 1:\n\n## Language(1/5):\n**Medium_Issues...,The code reviews indicate a range of issues fr...,The response is incomplete as it fails to prov...,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,https://colab.research.google.com/drive/1VJt0i...,algorithms > by_topic > game_theory,freitas.g@turing.com,Done,,45,1/2/2024,,,,...,,,algorithms > by_topic > game_theory,FALSE,game_theory__modification__2_V3_A.ipynb,#Turn 1:\n\n## Code(2/5):\n**Critical_Issues**...,#Turn 1:\n\n## Language(3/5):\n**Medium_Issues...,The code in each turn has correctness issues o...,The responses are understandable but require c...,2.5
540,https://colab.research.google.com/drive/1W9TYM...,python_language_and_scripting > basic_python_s...,marcel.s@turing.com,Done,,30,1/2/2024,,,,...,,,,FALSE,basic_python_syntax__modification__2_V3_A.ipynb,#Turn 1:\n\n## Code(4/5):\nNone\n\n======\n\n#...,#Turn 1:\n\n## Language(4/5):\nNone\n\n======\...,The code in Turn 2 contains a critical error i...,The explanation provided by the assistant cont...,2.5
544,https://colab.research.google.com/drive/1wMtlD...,algorithms > by_topic > searching,aman.s@turing.com,Done,,42,1/4/2024,,,,...,,,,FALSE,searching__query__1_V3_A.ipynb,#Turn 1:\n\n## Code(2/5):\n**Critical_Issues**...,#Turn 1:\n\n## Language(4/5):\nNone\n\n======\...,The code in Turn 1 is incorrect as it misappli...,The responses are clear but include a signific...,2.5
551,https://colab.research.google.com/drive/1xF1OD...,python_language_and_scripting > basic_python_s...,marcel.s@turing.com,Done,,30,1/2/2024,,,,...,,,,FALSE,basic_python_syntax__query__1_V3_A.ipynb,#Turn 1:\n\n## Code(4/5):\nNone,#Turn 1:\n\n## Language(4/5):\nNone,The code is correct but may benefit from minor...,The assistant's reply is not available for rev...,2.5


In [10]:
# Per-assignee summary of the flagged tasks: number of flagged rows (count of
# `completion_status`) and mean `avg_jdg_score`, most-flagged assignees first.
(
    flagged_gpt_reviewer_df
    .groupby("assigned_to_email")
    .agg({"completion_status": "count", "avg_jdg_score": "mean"})
    .sort_values("completion_status", ascending=False)
)

Unnamed: 0_level_0,completion_status,avg_jdg_score
assigned_to_email,Unnamed: 1_level_1,Unnamed: 2_level_1
souza.m@turing.com,8,2.0
aman.s@turing.com,7,2.428571
adil.m@turing.com,4,2.375
shaharyar.t@turing.com,4,2.125
paulo.c@turing.com,4,2.5
elsadek.a@turing.com,3,2.5
abdul.r@turing.com,3,2.5
ritesh.r@turing.com,3,2.5
marcel.s@turing.com,3,2.5
santiago.c@turing.com,3,2.5


In [12]:
from src.sheets_utils import upload_df_to_sheet

# Replace missing values with empty strings, then write the flagged tasks to
# the "gpt_flags_1" worksheet of the tracking spreadsheet.
flagged_gpt_reviewer_df = flagged_gpt_reviewer_df.fillna(value="")
upload_df_to_sheet(
    service_account_file,
    tracking_sheet_id,
    "gpt_flags_1",
    flagged_gpt_reviewer_df,
)