In [1]:
service_account_path = "creds/google__sa.json"
tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"

In [2]:
import pandas as pd

from src.sheets_utils import download_sheet_as_df


contributors_df = download_sheet_as_df(
    service_account_path,
    tracking_sheet_id,
    "Contributors"
)

tasks_df = pd.concat(
    [
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_2"
        ),
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_3"
        ),
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_4"
        ),
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_5"
        ),
    ],
    ignore_index=True
)
tasks_df = tasks_df[tasks_df["completion_status"] == "Done"].reset_index()


reviews_df = download_sheet_as_df(
    service_account_path,
    tracking_sheet_id,
    "Reviews"
)

In [3]:
# Parse Conversations into list of dicts

import io
import threading

import nbformat
from fuzzywuzzy import fuzz

from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload


def get_closest_match(query, choices):
    """
    Get the closest match(es) to a query string from a list of choices.

    :param query: The query string.
    :param choices: A list of strings to match against.
    :param limit: The maximum number of matches to return.
    """
    best_role = None
    best_score = 0
    for choice in choices:
        score = fuzz.ratio(query, choice)
        if score > best_score and score > 25:
            best_score = score
            best_role = choice

    return best_role, best_score


def notebook_parser(notebook):
    """
    Parse a notebook and extract the message objects.

    :param notebook: The notebook object.
    """
    messages = []
    for cell in notebook.cells[2:]:
        if cell["cell_type"] == "markdown":
            markdown_headers = ["**User**", "**Assistant**"]
            lines = cell["source"].split("\n")
            first_line = lines[0]
            role, score = get_closest_match(first_line, markdown_headers)
            if score>25:
                message = {
                    "role": role.replace("*", "").strip(),
                    "content": "\n".join(lines[1:]).strip("\n"),
                    "type": "markdown"
                }
                messages.append(message)

        elif cell["cell_type"] == "code":
            code_headers = ["# User", "# Assistant"]
            lines = cell["source"].split("\n")
            first_line = lines[0]
            role, score = get_closest_match(first_line, code_headers)
            if score>25:
                message = {
                    "role": role.replace("#", "").strip(),
                    "content": "\n".join(lines[1:]).strip("\n"),
                    "type": "code"
                }
                messages.append(message)
    return messages


def download_and_parse_notebook(service_account_file, file_id):
    # Authenticate with the service account
    credentials = service_account.Credentials.from_service_account_file(
        service_account_file, scopes=['https://www.googleapis.com/auth/drive'])
    service = build('drive', 'v3', credentials=credentials)

    # Request to download the file
    request = service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)

    # Download the file
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download progress: %d%%." % int(status.progress() * 100))

    # Move the buffer's pointer to the beginning
    fh.seek(0)

    # Open the notebook
    notebook = nbformat.read(fh, as_version=4)

    # Parse the notebook
    messages = notebook_parser(notebook)

    # # Extract the first cell
    first_cell = notebook.cells[0]
    lines = first_cell["source"].split("\n")
    metadata = {}
    for line in lines:
        if "**Python Topics**" in line:
            metadata["topic"] = line.split(" - ")[1]
        if "**Type**" in line:
            metadata["type"] = line.split(" - ")[1]
        if "**Target Number of Turns (User + Assistant)**" in line:
            metadata["target_turns"] = line.split(" - ")[1]

    return {
        "id": file_id,
        "metadata": metadata,
        "messages":messages
    }


def threading_processor(service_account_path, file_id, results):
    results.append(download_and_parse_notebook(service_account_path, file_id))


threads = []
parsed_conversations = []
for i in range(tasks_df.shape[0]):
    file_id = tasks_df["task_link"][i].split("/")[-1]
    thread = threading.Thread(target=threading_processor, args=(service_account_path, file_id, parsed_conversations))
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()


Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


Exception in thread Thread-311:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_525/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_525/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 780, in next_chunk
    raise HttpError(resp, content, uri=self._uri)
googleapiclient.errors.HttpError: <HttpError 404 when requesting https://www.googleapis.com/drive/v3/files/1lHYB-8JiU67LlaqjvaRuLYUbetxWbnD5%23scrollTo%

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


Exception in thread Thread-313:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_525/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_525/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 780, in next_chunk
    raise HttpError(resp, content, uri=self._uri)
googleapiclient.errors.HttpError: <HttpError 404 when requesting https://www.googleapis.com/drive/v3/files/1rfNQU__74pEdovonm_-u6yrhF0UsAa2C?alt=media r

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.Download progress: 100%.

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


In [4]:
from datetime import datetime

def get_number_of_turns(messages):

    initial_role = messages[0]["role"]

    count = 0
    for message in messages:
        if message["role"] == initial_role:
            count += 1
    return count


def standardize_date_format(date):
    """
    Given a date string, standardize the date format to YYYY/MM/DD.
    """
    if date is None:
        return ""
    try:
        # Parse the date string into a datetime object
        standardized_date = datetime.strptime(date, "%Y/%m/%d")
    except ValueError:
        try:
            # Attempt to parse other common formats here
            # Example: MM/DD/YYYY
            standardized_date = datetime.strptime(date, "%m/%d/%Y")
        except ValueError:
            return "Invalid date format"

    # Format the datetime object into the desired string format
    return standardized_date.strftime("%Y/%m/%d")

not_found_emails = set()
metadata_only = []
for conversation in parsed_conversations:
    
    # Extract actual number of turns
    conversation["metadata"]["actual_turns"] = get_number_of_turns(conversation["messages"])

    try:
        tracking_record = tasks_df[tasks_df["task_link"].str.contains(conversation["id"])].iloc[0].to_dict()
    except IndexError:
        print("IndexError for id:", conversation["id"])
        continue

    # Get Author email
    conversation["metadata"]["assigned_to_email"] = tracking_record["assigned_to_email"]

    # Get duration
    conversation["metadata"]["duration_mins"] = tracking_record["duration_mins"]

    # Get Completion Date
    conversation["metadata"]["completion_date"] = standardize_date_format(tracking_record["completion_date"])

    try:
        contrib_entry = contributors_df[contributors_df["Email"] == tracking_record["assigned_to_email"]].iloc[0]
    except IndexError:
        not_found_emails.add(tracking_record["assigned_to_email"])
        print("IndexError for email:", tracking_record["assigned_to_email"])
        continue

    # Get Join Date
    conversation["metadata"]["joined_on"] = contrib_entry["Joined on"]

    # Get Team
    try:
        conversation["metadata"]["team"] = contrib_entry["Source"]
    except IndexError:
        print("IndexError for email:", tracking_record["assigned_to_email"])
        conversation["metadata"]["team"] = "Unknown"

    metadata_only.append(conversation["metadata"])

metadata_only_df = pd.DataFrame(metadata_only)

IndexError for email: satya.s@turing.com
IndexError for email: raman.k@turing.com
IndexError for email: satya.s@turing.com
IndexError for email: 
IndexError for email: 
IndexError for email: https://colab.research.google.com/drive/1s6QrFchojtSInYl0xrwJ-Dcv6gqqL8lB
IndexError for email: raman.k@turing.com
IndexError for email: toh.y@turing,com
IndexError for email: toh.y@turing,com
IndexError for email: raman.k@turing.com
IndexError for email: 
IndexError for email: andranik.g@gmail.com


## Filter data on full timers who joined on 22/12/2023

In [5]:
metadata_only_df = metadata_only_df[metadata_only_df["team"]=="Vetting"]
metadata_only_df = metadata_only_df[metadata_only_df["joined_on"]=="12/22/2023"]
metadata_only_df

Unnamed: 0,topic,type,target_turns,actual_turns,assigned_to_email,duration_mins,completion_date,joined_on,team
96,data_analysis > json_parsing,query,2,2,marcus.a@turing.com,30,2023/12/22,12/22/2023,Vetting
99,web_development > web_servers,query,3,3,paulo.c@turing.com,60,2023/12/25,12/22/2023,Vetting
100,algorithms > by_topic > divide_and_conquer,query,3,3,freitas.g@turing.com,30,2023/12/22,12/22/2023,Vetting
102,web_development > web_services,query,2,2,freitas.g@turing.com,20,2023/12/22,12/22/2023,Vetting
104,unit_testing_methodology > security_testing,query,2,2,marcel.s@turing.com,60,2023/12/26,12/22/2023,Vetting
...,...,...,...,...,...,...,...,...,...
1848,database > no_sql_database_concepts,query,5,5,gedeon.a@turing.com,33,2023/12/29,12/22/2023,Vetting
1849,database > no_sql_database_concepts,query,5,5,gedeon.a@turing.com,34,2023/12/29,12/22/2023,Vetting
1850,python_language_and_scripting > modules_and_pa...,query,4,4,armas.j@turing.com,20,2023/12/27,12/22/2023,Vetting
1853,python_language_and_scripting > packaging,modification,3,3,caram.v@turing.com,30,2024/01/03,12/22/2023,Vetting


In [6]:
# Process the duration_mins column to make into int

def process_duration_mins(duration_mins):
    if duration_mins == "" or duration_mins is None:
        return 15
    elif isinstance(duration_mins, int):
        return duration_mins
    else:
        return int(duration_mins.split(" ")[0])
    
metadata_only_df["duration_mins"] = metadata_only_df["duration_mins"].apply(process_duration_mins)

trainer_avg_turn_duration = metadata_only_df.groupby("assigned_to_email").agg({"duration_mins": "mean", "actual_turns": "mean"}).reset_index()
trainer_avg_turn_duration["avg_turn_duration"] = trainer_avg_turn_duration["duration_mins"] / trainer_avg_turn_duration["actual_turns"]
trainer_avg_turn_duration = trainer_avg_turn_duration.sort_values("avg_turn_duration", ascending=True)
trainer_avg_turn_duration = trainer_avg_turn_duration.rename(columns={
    "duration_mins": "avg_mins_per_convo",
    "actual_turns": "avg_turns_per_convo",
    "avg_turn_duration": "avg_mins_per_turn"
})
trainer_avg_turn_duration

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn
7,gedeon.a@turing.com,43.923077,5.8,7.572944
5,elsadek.a@turing.com,32.755102,3.530612,9.277457
14,prateek.j@turing.com,22.848485,2.30303,9.921053
3,armas.j@turing.com,31.710526,3.157895,10.041667
16,safi.u@turing.com,39.714286,3.952381,10.048193
2,archit.k@turing.com,35.113636,3.204545,10.957447
4,caram.v@turing.com,31.761905,2.873016,11.055249
18,zain.v@turing.com,34.793103,3.068966,11.337079
13,paulo.c@turing.com,33.612903,2.564516,13.106918
17,sudharchith.s@turing.com,36.75,2.7,13.611111


In [7]:
reviews = reviews_df[reviews_df["Author Email"].isin(trainer_avg_turn_duration["assigned_to_email"].tolist())]
reviews = reviews.astype({"Code Quality": "int32", "Language Quality": "int32"})
reviews["avg_quality_score"] = (reviews["Code Quality"] + reviews["Language Quality"]) / 2

trainer_avg_quality = reviews.groupby("Author Email").agg({"avg_quality_score": "mean", "Timestamp":"count"}).reset_index()
trainer_avg_quality = trainer_avg_quality.rename(columns={"Timestamp": "total_reviews"})
trainer_avg_quality = trainer_avg_quality.sort_values("avg_quality_score", ascending=False)
trainer_avg_quality

Unnamed: 0,Author Email,avg_quality_score,total_reviews
3,armas.j@turing.com,4.875,4
6,freitas.g@turing.com,4.875,4
9,kishore.g@turing.com,4.833333,3
13,paulo.c@turing.com,4.833333,6
8,ishwar.b@turing.com,4.8,5
12,marcus.a@turing.com,4.7,5
14,prateek.j@turing.com,4.642857,7
7,gedeon.a@turing.com,4.625,4
16,safi.u@turing.com,4.6,10
5,elsadek.a@turing.com,4.6,5


In [8]:
trainer_throughput = metadata_only_df.groupby("assigned_to_email").agg({"topic": "count", "duration_mins":"sum", "actual_turns":"sum"}).reset_index()
trainer_throughput = trainer_throughput.sort_values("topic", ascending=False)
trainer_throughput = trainer_throughput.rename(columns={
    "topic": "total_convos",
    "duration_mins": "total_mins",
    "actual_turns": "total_turns"
})
trainer_throughput.sort_values("total_turns", ascending=False)

Unnamed: 0,assigned_to_email,total_convos,total_mins,total_turns
7,gedeon.a@turing.com,65,2855,377
16,safi.u@turing.com,63,2502,249
0,abdullah.i@turing.com,59,3782,214
4,caram.v@turing.com,63,2001,181
18,zain.v@turing.com,58,2018,178
5,elsadek.a@turing.com,48,1605,173
13,paulo.c@turing.com,58,2084,159
9,kishore.g@turing.com,29,2050,145
6,freitas.g@turing.com,50,2250,143
2,archit.k@turing.com,31,1545,141


In [9]:
import numpy as np
import pandas as pd


def extract_top_level_topic(topic):
    if isinstance(topic, str):
        return topic.split(" > ")[0]
    else:
        return topic


def calculate_diversity(df, group_by="assigned_to_email", count_column="count", entropy_column="diversity"):
    """
    Calculate the diversity of a dataframe.

    :param df: The dataframe to calculate diversity on.
    :param group_by: The column to group by.
    :param count_column: The column to count.
    :param entropy_column: The column to store the entropy in.
    """
    # Step 1: Normalize counts
    total_counts = df.groupby(group_by)[count_column].transform('sum')
    df['normalized_count'] = df[count_column] / total_counts

    # Step 2: Calculate entropy
    df['entropy_component'] = -df['normalized_count'] * np.log2(df['normalized_count'])
    entropy = df.groupby(group_by)['entropy_component'].sum().reset_index()

    entropy.columns = [group_by, entropy_column]
    return entropy


metadata_only_df["top_level_topic"] = metadata_only_df["topic"].apply(extract_top_level_topic)

# Aggregation of Count of tasks per trainer per top level topic in a new variable trainer_diversity
trainer_diversity = metadata_only_df.groupby(["assigned_to_email", "top_level_topic"]).agg({"topic": "count"}).reset_index()
trainer_diversity = trainer_diversity.rename(columns={"topic": "count"})
trainer_diversity = trainer_diversity.sort_values("count", ascending=False)

# Calculate entropy
topleveltopic_diversity = calculate_diversity(trainer_diversity, entropy_column="topic_diversity")
topleveltopic_diversity

Unnamed: 0,assigned_to_email,topic_diversity
0,abdullah.i@turing.com,1.87412
1,adil.m@turing.com,2.557079
2,archit.k@turing.com,2.309352
3,armas.j@turing.com,1.513897
4,caram.v@turing.com,2.239728
5,elsadek.a@turing.com,2.011178
6,freitas.g@turing.com,2.60399
7,gedeon.a@turing.com,2.720036
8,ishwar.b@turing.com,2.398303
9,kishore.g@turing.com,1.982466


In [10]:
# Hour Tracking Utilization



In [11]:
dpi_df = trainer_avg_turn_duration.merge(trainer_avg_quality, left_on="assigned_to_email", right_on="Author Email", how="inner")
dpi_df = dpi_df.merge(trainer_throughput, on="assigned_to_email", how="inner")
dpi_df = dpi_df.merge(topleveltopic_diversity, on="assigned_to_email", how="inner")
dpi_df = dpi_df.drop(columns=["Author Email"])
dpi_df

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity
0,gedeon.a@turing.com,43.923077,5.8,7.572944,4.625,4,65,2855,377,2.720036
1,elsadek.a@turing.com,32.755102,3.530612,9.277457,4.6,5,48,1605,173,2.011178
2,prateek.j@turing.com,22.848485,2.30303,9.921053,4.642857,7,33,754,76,2.292493
3,armas.j@turing.com,31.710526,3.157895,10.041667,4.875,4,33,1205,120,1.513897
4,safi.u@turing.com,39.714286,3.952381,10.048193,4.6,10,63,2502,249,2.691335
5,archit.k@turing.com,35.113636,3.204545,10.957447,4.4,5,31,1545,141,2.309352
6,caram.v@turing.com,31.761905,2.873016,11.055249,4.583333,6,63,2001,181,2.239728
7,zain.v@turing.com,34.793103,3.068966,11.337079,4.5,2,58,2018,178,2.451713
8,paulo.c@turing.com,33.612903,2.564516,13.106918,4.833333,6,58,2084,159,2.269277
9,sudharchith.s@turing.com,36.75,2.7,13.611111,4.142857,7,15,735,54,1.456565


In [12]:
def transform_to_zscore(sequence):
    """
    Calculate the z-score of a column.

    :param df: The dataframe to calculate z-score on.
    :param column: The column to calculate z-score on.
    """
    return (sequence - sequence.mean()) / sequence.std(ddof=0)


dpi_normalized_df = dpi_df.copy()

# Loop on all numerical columns and transform to z-score
for column in dpi_normalized_df.columns:
    if column != "assigned_to_email":
        dpi_normalized_df[column] = transform_to_zscore(dpi_normalized_df[column])

dpi_normalized_df

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity
0,gedeon.a@turing.com,0.132295,2.95258,-1.594009,0.308409,-0.65065,1.407627,1.365583,2.958216,1.140285
1,elsadek.a@turing.com,-0.833876,0.411645,-1.166609,0.211764,-0.192785,0.440258,-0.191057,0.417546,-0.11727
2,prateek.j@turing.com,-1.690924,-0.962826,-1.00523,0.377441,0.722944,-0.413303,-1.250817,-0.790518,0.381798
3,armas.j@turing.com,-0.924245,-0.005671,-0.974986,1.274855,-0.65065,-0.413303,-0.689182,-0.24253,-0.999477
4,safi.u@turing.com,-0.231818,0.883881,-0.97335,0.211764,2.096538,1.293819,0.925988,1.36407,1.089367
5,archit.k@turing.com,-0.629833,0.046562,-0.745358,-0.561394,-0.192785,-0.527112,-0.265776,0.019009,0.411707
6,caram.v@turing.com,-0.9198,-0.324637,-0.720834,0.147334,0.265079,1.293819,0.302086,0.51718,0.288191
7,zain.v@turing.com,-0.657563,-0.105241,-0.650166,-0.174815,-1.566379,1.009299,0.323257,0.479817,0.664265
8,paulo.c@turing.com,-0.759665,-0.670051,-0.206386,1.113781,0.265079,1.009299,0.405447,0.243186,0.340613
9,sudharchith.s@turing.com,-0.488267,-0.518356,-0.079961,-1.555453,0.722944,-1.437577,-1.274478,-1.064512,-1.101187


In [13]:
# Create a final score column as a weighted average of all the columns

weights = {
    # Utilization = 1
    "avg_tracked_mins_per_convo": 0.5,
    "avg_tracked_mins_per_turn": 0.5,

    # Efficiency = 1
    "avg_mins_per_convo": 0.3,
    "avg_mins_per_turn": 0.7  ,

    # Quality = 2
    "avg_quality_score": 2,

    # Throughput = 2
    "total_convos": 0.6,
    "total_turns": 1.4,

    # Diversity = 1
    "topic_diversity": 1
}


greater_is_better = {
    # Utilization = 1
    "avg_tracked_mins_per_convo": False,
    "avg_tracked_mins_per_turn": False,

    # Efficiency = 1
    "avg_mins_per_convo": False,
    "avg_mins_per_turn": False,

    # Quality = 2
    "avg_quality_score": True,

    # Throughput = 2
    "total_convos": True,
    "total_turns": True,

    # Diversity = 1
    "topic_diversity": True
}


dpi_normalized_df["final_score"] = 0
count_cols = 0
for column in weights.keys():
    try:
        if greater_is_better[column]:
            dpi_normalized_df["final_score"] += dpi_normalized_df[column] * weights[column]
        else:
            dpi_normalized_df["final_score"] += (dpi_normalized_df[column]*(-1)) * weights[column]
        count_cols += 1
    except KeyError:
        print("KeyError for column:", column)
        continue


dpi_normalized_df["final_score"] = dpi_normalized_df["final_score"] / count_cols
dpi_normalized_df = dpi_normalized_df.sort_values("final_score", ascending=False)
dpi_normalized_df

KeyError for column: avg_tracked_mins_per_convo
KeyError for column: avg_tracked_mins_per_turn


Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity,final_score
0,gedeon.a@turing.com,0.132295,2.95258,-1.594009,0.308409,-0.65065,1.407627,1.365583,2.958216,1.140285,1.303216
4,safi.u@turing.com,-0.231818,0.883881,-0.97335,0.211764,2.096538,1.293819,0.925988,1.36407,1.089367,0.824962
8,paulo.c@turing.com,-0.759665,-0.670051,-0.206386,1.113781,0.265079,1.009299,0.405447,0.243186,0.340613,0.647764
12,freitas.g@turing.com,-0.128453,-0.630321,0.452412,1.274855,-0.65065,0.554066,0.612169,0.043918,0.934413,0.599983
6,caram.v@turing.com,-0.9198,-0.324637,-0.720834,0.147334,0.265079,1.293819,0.302086,0.51718,0.288191,0.477288
7,zain.v@turing.com,-0.657563,-0.105241,-0.650166,-0.174815,-1.566379,1.009299,0.323257,0.479817,0.664265,0.374057
1,elsadek.a@turing.com,-0.833876,0.411645,-1.166609,0.211764,-0.192785,0.440258,-0.191057,0.417546,-0.11727,0.370294
3,armas.j@turing.com,-0.924245,-0.005671,-0.974986,1.274855,-0.65065,-0.413303,-0.689182,-0.24253,-0.999477,0.320412
10,kishore.g@turing.com,2.447944,2.056855,0.052137,1.113781,-1.108514,-0.64092,0.363107,0.068826,-0.168207,0.166713
2,prateek.j@turing.com,-1.690924,-0.962826,-1.00523,0.377441,0.722944,-0.413303,-1.250817,-0.790518,0.381798,0.165485


In [14]:
# Add final score to the dpi_df
dpi_df = dpi_df.merge(dpi_normalized_df[["assigned_to_email", "final_score"]], on="assigned_to_email", how="inner")
dpi_df = dpi_df.sort_values("final_score", ascending=False)
dpi_df

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity,final_score
0,gedeon.a@turing.com,43.923077,5.8,7.572944,4.625,4,65,2855,377,2.720036,1.303216
4,safi.u@turing.com,39.714286,3.952381,10.048193,4.6,10,63,2502,249,2.691335,0.824962
8,paulo.c@turing.com,33.612903,2.564516,13.106918,4.833333,6,58,2084,159,2.269277,0.647764
12,freitas.g@turing.com,40.909091,2.6,15.734266,4.875,4,50,2250,143,2.60399,0.599983
6,caram.v@turing.com,31.761905,2.873016,11.055249,4.583333,6,63,2001,181,2.239728,0.477288
7,zain.v@turing.com,34.793103,3.068966,11.337079,4.5,2,58,2018,178,2.451713,0.374057
1,elsadek.a@turing.com,32.755102,3.530612,9.277457,4.6,5,48,1605,173,2.011178,0.370294
3,armas.j@turing.com,31.710526,3.157895,10.041667,4.875,4,33,1205,120,1.513897,0.320412
10,kishore.g@turing.com,70.689655,5.0,14.137931,4.833333,3,29,2050,145,1.982466,0.166713
2,prateek.j@turing.com,22.848485,2.30303,9.921053,4.642857,7,33,754,76,2.292493,0.165485


In [92]:
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

load_dotenv(find_dotenv())
opanai = OpenAI()


def fix_missing_roles(messages):
    """
    Fix missing roles in a list of messages.

    :param messages: The list of messages.
    """
    def predict_role(messages_subsequence):
        try:
            response = opanai.chat.completions.create(
                model="gpt-4-1106-preview",
                messages=[
                    {"role":"system", "content": "Your task is to accurately predict whether the empty role is a User or an Assistant. You are only allowed to reply with a single word: 'User' or 'Assistant'."},
                    {"role":"user", "content": f"Here's a part of the conversation including an empty role:\n\n{messages_subsequence}"}
                ],
                temperature=0,
                seed=42
            )
            print(response.choices[0])
            missing_role = response.choices[0].message.content
            assert missing_role in ["User", "Assistant"]
            return missing_role, None
        except Exception as e:
            return None, e

    errors = []
    for i in range(len(messages)):
        if messages[i]["role"] == "":
            subsequence = messages[max(0, i-2):min(len(messages), i+3)]
            messages[i]["role"], error = predict_role(subsequence)
            if error is not None:
                errors.append(error)
    return messages, errors



test = [
    {'role': 'user', 'content': 'Hello'},
    {'role': '', 'content': 'How can I help you?'},
    {'role': 'user', 'content': 'I have a question'},
    {'role': 'assistant', 'content': 'Sure, what is it?'},
    {'role': '', 'content': 'Can you explain this concept to me?'},
    {'role': 'assistant', 'content': 'Of course, here is a brief explanation'},
    {"role": "assistant", "content": "print('Hello World')"},
    {"role": "", "content": "This print should explain the concept for you"},
    {'role': 'user', 'content': 'Thank you!'}
]

fix_missing_roles(test)

Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='User', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)


([{'role': 'user', 'content': 'Hello'},
  {'role': 'Assistant', 'content': 'How can I help you?'},
  {'role': 'user', 'content': 'I have a question'},
  {'role': 'assistant', 'content': 'Sure, what is it?'},
  {'role': 'User', 'content': 'Can you explain this concept to me?'},
  {'role': 'assistant', 'content': 'Of course, here is a brief explanation'},
  {'role': 'assistant', 'content': "print('Hello World')"},
  {'role': 'Assistant',
   'content': 'This print should explain the concept for you'},
  {'role': 'user', 'content': 'Thank you!'}],
 [])