In [8]:
import sys 
sys.path.append('../../')

service_account_path = "../../creds/google__sa.json"
tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"

In [9]:
import pandas as pd

from src.sheets_utils import download_sheet_as_df


contributors_df = download_sheet_as_df(
    service_account_path,
    tracking_sheet_id,
    "Contributors"
)

tasks_df = pd.concat(
    [
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_2"
        ),
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_3"
        ),
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_4"
        ),
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_5"
        ),
    ],
    ignore_index=True
)
tasks_df = tasks_df[tasks_df["completion_status"] == "Done"].reset_index()


reviews_df = download_sheet_as_df(
    service_account_path,
    tracking_sheet_id,
    "Reviews"
)

In [10]:
# Parse Conversations into list of dicts

import io
import threading

import nbformat
from fuzzywuzzy import fuzz

from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload


def get_closest_match(query, choices):
    """
    Get the closest match(es) to a query string from a list of choices.

    :param query: The query string.
    :param choices: A list of strings to match against.
    :param limit: The maximum number of matches to return.
    """
    best_role = None
    best_score = 0
    for choice in choices:
        score = fuzz.ratio(query, choice)
        if score > best_score and score > 25:
            best_score = score
            best_role = choice

    return best_role, best_score


def notebook_parser(notebook):
    """
    Parse a notebook and extract the message objects.

    :param notebook: The notebook object.
    """
    messages = []
    for cell in notebook.cells[2:]:
        if cell["cell_type"] == "markdown":
            markdown_headers = ["**User**", "**Assistant**"]
            lines = cell["source"].split("\n")
            first_line = lines[0]
            role, score = get_closest_match(first_line, markdown_headers)
            if score>25:
                message = {
                    "role": role.replace("*", "").strip(),
                    "content": "\n".join(lines[1:]).strip("\n"),
                    "type": "markdown"
                }
                messages.append(message)

        elif cell["cell_type"] == "code":
            code_headers = ["# User", "# Assistant"]
            lines = cell["source"].split("\n")
            first_line = lines[0]
            role, score = get_closest_match(first_line, code_headers)
            if score>25:
                message = {
                    "role": role.replace("#", "").strip(),
                    "content": "\n".join(lines[1:]).strip("\n"),
                    "type": "code"
                }
                messages.append(message)
    return messages


def download_and_parse_notebook(service_account_file, file_id):
    # Authenticate with the service account
    credentials = service_account.Credentials.from_service_account_file(
        service_account_file, scopes=['https://www.googleapis.com/auth/drive'])
    service = build('drive', 'v3', credentials=credentials)

    # Request to download the file
    request = service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)

    # Download the file
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download progress: %d%%." % int(status.progress() * 100))

    # Move the buffer's pointer to the beginning
    fh.seek(0)

    # Open the notebook
    notebook = nbformat.read(fh, as_version=4)

    # Parse the notebook
    messages = notebook_parser(notebook)

    # # Extract the first cell
    first_cell = notebook.cells[0]
    lines = first_cell["source"].split("\n")
    metadata = {}
    for line in lines:
        if "**Python Topics**" in line:
            metadata["topic"] = line.split(" - ")[1]
        if "**Type**" in line:
            metadata["type"] = line.split(" - ")[1]
        if "**Target Number of Turns (User + Assistant)**" in line:
            metadata["target_turns"] = line.split(" - ")[1]

    return {
        "id": file_id,
        "metadata": metadata,
        "messages":messages
    }


def threading_processor(service_account_path, file_id, results):
    results.append(download_and_parse_notebook(service_account_path, file_id))


threads = []
parsed_conversations = []
for i in range(tasks_df.shape[0]):
    file_id = tasks_df["task_link"][i].split("/")[-1]
    thread = threading.Thread(target=threading_processor, args=(service_account_path, file_id, parsed_conversations))
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()


Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


Exception in thread Thread-311:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_32542/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_32542/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 780, in next_chunk
    raise HttpError(resp, content, uri=self._uri)
googleapiclient.errors.HttpError: <HttpError 404 when requesting https://www.googleapis.com/drive/v3/files/1lHYB-8JiU67LlaqjvaRuLYUbetxWbnD5%23scrol

Download progress: 100%.


Exception in thread Thread-313:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run


Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_32542/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_32542/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 780, in next_chunk
    raise HttpError(resp, content, uri=self._uri)
googleapiclient.errors.HttpError: <HttpError 404 when requesting https://www.googleapis.com/drive/v3/files/1rfNQU__74pEdovonm_-u6yrhF0UsAa2C?alt=media returned "File not found: 1rfNQU__74pEdovonm_-u6yrhF0UsAa2C.". Details: "[{'message': 'File not found: 1rfNQU__74pEdovonm_-u6yrhF0UsAa2C.', 'domain': 'global', 'reason': 'notFound', 'location': 'fileId', 'locationType': 'parameter'}]">


Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.Download progress: 100%.
Download progress: 100%.

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


In [22]:
from datetime import datetime

def get_number_of_turns(messages):

    initial_role = messages[0]["role"]

    count = 0
    for message in messages:
        if message["role"] == initial_role:
            count += 1
    return count


def standardize_date_format(date):
    """
    Given a date string, standardize the date format to YYYY/MM/DD.
    """
    if date is None:
        return ""
    try:
        # Parse the date string into a datetime object
        standardized_date = datetime.strptime(date, "%Y/%m/%d")
    except ValueError:
        try:
            # Attempt to parse other common formats here
            # Example: MM/DD/YYYY
            standardized_date = datetime.strptime(date, "%m/%d/%Y")
        except ValueError:
            return "Invalid date format"

    # Format the datetime object into the desired string format
    return standardized_date.strftime("%Y/%m/%d")

not_found_emails = set()
metadata_only = []
for conversation in parsed_conversations:
    
    # Extract actual number of turns
    conversation["metadata"]["actual_turns"] = get_number_of_turns(conversation["messages"])

    try:
        tracking_record = tasks_df[tasks_df["task_link"].str.contains(conversation["id"])].iloc[0].to_dict()
    except IndexError:
        print("IndexError for id:", conversation["id"])
        continue

    # Get Author email
    conversation["metadata"]["assigned_to_email"] = tracking_record["assigned_to_email"]

    # Get duration
    conversation["metadata"]["duration_mins"] = tracking_record["duration_mins"]

    # Get Completion Date
    conversation["metadata"]["completion_date"] = standardize_date_format(tracking_record["completion_date"])

    try:
        contrib_entry = contributors_df[contributors_df["Email"] == tracking_record["assigned_to_email"]].iloc[0]
    except IndexError:
        not_found_emails.add(tracking_record["assigned_to_email"])
        print("IndexError for email:", tracking_record["assigned_to_email"])
        continue

    # Get Join Date
    conversation["metadata"]["joined_on"] = contrib_entry["Joined on"]

    # Get Team
    try:
        conversation["metadata"]["team"] = contrib_entry["Source"]
    except IndexError:
        print("IndexError for email:", tracking_record["assigned_to_email"])
        conversation["metadata"]["team"] = "Unknown"

    metadata_only.append(conversation["metadata"])

metadata_only_df = pd.DataFrame(metadata_only)

IndexError for email: https://colab.research.google.com/drive/1s6QrFchojtSInYl0xrwJ-Dcv6gqqL8lB
IndexError for email: satya.s@turing.com
IndexError for email: 
IndexError for email: satya.s@turing.com
IndexError for email: 
IndexError for email: andranik.g@gmail.com
IndexError for email: raman.k@turing.com
IndexError for email: raman.k@turing.com
IndexError for email: raman.k@turing.com
IndexError for email: shaharyar.t@turing.com
IndexError for email: toh.y@turing,com
IndexError for email: toh.y@turing,com
IndexError for email: andranik.g@gmail.com


## Filter data on full timers who joined on 22/12/2023

In [24]:
# metadata_only_df = metadata_only_df[metadata_only_df["team"]=="Vetting"]

start_date = '2024-01-05'
end_date = '2024-01-08'
metadata_only_df["completion_date"] = metadata_only_df["completion_date"].replace("Invalid date format", "")
metadata_only_df["completion_date"] = pd.to_datetime(metadata_only_df["completion_date"], errors='coerce')
metadata_only_df["completion_date_str"] = metadata_only_df["completion_date"].dt.strftime('%Y-%m-%d')
metadata_only_df = metadata_only_df[(metadata_only_df['completion_date'] >= start_date) & (metadata_only_df['completion_date'] <= end_date)]

# metadata_only_df = metadata_only_df[metadata_only_df["completion_date"]>="12//2023"]
metadata_only_df

Unnamed: 0,topic,type,target_turns,actual_turns,assigned_to_email,duration_mins,completion_date,joined_on,team,completion_date_str
468,,,3-7,4,aarunik.g@turing.com,70,2024-01-05,12/28/2023,Vetting,2024-01-05
481,,,3-7,3,abdullah.i@turing.com,85,2024-01-07,12/22/2023,Vetting,2024-01-07
488,,,1-4,4,abdullah.i@turing.com,65,2024-01-07,12/22/2023,Vetting,2024-01-07
498,,query,,4,aman.s@turing.com,116,2024-01-05,12/28/2023,Vetting,2024-01-05
523,,query,,4,elsadek.a@turing.com,83,2024-01-05,12/22/2023,Vetting,2024-01-05
...,...,...,...,...,...,...,...,...,...,...
2122,,,6-10,5,abdullah.i@turing.com,95,2024-01-05,12/22/2023,Vetting,2024-01-05
2132,,,3-7,4,abdullah.i@turing.com,100,2024-01-05,12/22/2023,Vetting,2024-01-05
2140,algorithms > by_topic > sorting,query,1-4,1,ritesh.r@turing.com,15,2024-01-05,,Anthropic,2024-01-05
2159,,,1-4,3,abdullah.i@turing.com,45,2024-01-05,12/22/2023,Vetting,2024-01-05


In [25]:
# Process the duration_mins column to make into int

def process_duration_mins(duration_mins):
    if duration_mins == "" or duration_mins is None:
        return 15
    elif isinstance(duration_mins, int):
        return duration_mins
    else:
        return int(duration_mins.split(" ")[0])
    
metadata_only_df["duration_mins"] = metadata_only_df["duration_mins"].apply(process_duration_mins)

trainer_avg_turn_duration = metadata_only_df.groupby("assigned_to_email").agg({"duration_mins": "mean", "actual_turns": "mean"}).reset_index()
trainer_avg_turn_duration["avg_turn_duration"] = trainer_avg_turn_duration["duration_mins"] / trainer_avg_turn_duration["actual_turns"]
trainer_avg_turn_duration = trainer_avg_turn_duration.sort_values("avg_turn_duration", ascending=True)
trainer_avg_turn_duration = trainer_avg_turn_duration.rename(columns={
    "duration_mins": "avg_mins_per_convo",
    "actual_turns": "avg_turns_per_convo",
    "avg_turn_duration": "avg_mins_per_turn"
})
trainer_avg_turn_duration

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_only_df["duration_mins"] = metadata_only_df["duration_mins"].apply(process_duration_mins)


Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn
28,sudharchith.s@turing.com,27.115385,4.0,6.778846
26,shaharyar.t@turing.com,30.0,3.666667,8.181818
13,jha.r@turing.com,22.142857,2.285714,9.6875
15,kumbar.r@turing.com,96.666667,9.75,9.91453
1,abdul.r@turing.com,41.0,3.6,11.388889
27,singh.r@turing.com,48.125,4.125,11.666667
24,safi.u@turing.com,43.4375,3.6875,11.779661
6,archit.k@turing.com,50.769231,4.307692,11.785714
14,khalid.s@turing.com,47.5,3.916667,12.12766
7,armas.j@turing.com,46.666667,3.666667,12.727273


In [26]:
reviews = reviews_df[reviews_df["Author Email"].isin(trainer_avg_turn_duration["assigned_to_email"].tolist())]
reviews = reviews.astype({"Code Quality": "int32", "Language Quality": "int32"})
reviews["avg_quality_score"] = (reviews["Code Quality"] + reviews["Language Quality"]) / 2

trainer_avg_quality = reviews.groupby("Author Email").agg({"avg_quality_score": "mean", "Timestamp":"count"}).reset_index()
trainer_avg_quality = trainer_avg_quality.rename(columns={"Timestamp": "total_reviews"})
trainer_avg_quality = trainer_avg_quality.sort_values("avg_quality_score", ascending=False)
trainer_avg_quality

Unnamed: 0,Author Email,avg_quality_score,total_reviews
21,rimsha.s@turing.com,5.0,1
7,armas.j@turing.com,4.9,5
12,ishwar.b@turing.com,4.833333,6
29,toh.y@turing.com,4.833333,9
11,freitas.g@turing.com,4.75,6
19,paulo.c@turing.com,4.714286,7
17,marcus.a@turing.com,4.7,5
5,andranik.g@turing.com,4.666667,6
9,caram.v@turing.com,4.642857,7
13,jha.r@turing.com,4.642857,7


In [27]:
trainer_throughput = metadata_only_df.groupby("assigned_to_email").agg({"topic": "count", "duration_mins":"sum", "actual_turns":"sum"}).reset_index()
trainer_throughput = trainer_throughput.sort_values("topic", ascending=False)
trainer_throughput = trainer_throughput.rename(columns={
    "topic": "total_convos",
    "duration_mins": "total_mins",
    "actual_turns": "total_turns"
})
trainer_throughput.sort_values("total_turns", ascending=False)

Unnamed: 0,assigned_to_email,total_convos,total_mins,total_turns
15,kumbar.r@turing.com,0,1160,117
28,sudharchith.s@turing.com,2,705,104
24,safi.u@turing.com,0,695,59
6,archit.k@turing.com,0,660,56
10,elsadek.a@turing.com,4,763,55
14,khalid.s@turing.com,4,570,47
9,caram.v@turing.com,1,620,43
1,abdul.r@turing.com,0,410,36
27,singh.r@turing.com,3,385,33
2,abdullah.i@turing.com,0,675,33


In [28]:
import numpy as np
import pandas as pd


def extract_top_level_topic(topic):
    if isinstance(topic, str):
        return topic.split(" > ")[0]
    else:
        return topic


def calculate_diversity(df, group_by="assigned_to_email", count_column="count", entropy_column="diversity"):
    """
    Calculate the diversity of a dataframe.

    :param df: The dataframe to calculate diversity on.
    :param group_by: The column to group by.
    :param count_column: The column to count.
    :param entropy_column: The column to store the entropy in.
    """
    # Step 1: Normalize counts
    total_counts = df.groupby(group_by)[count_column].transform('sum')
    df['normalized_count'] = df[count_column] / total_counts

    # Step 2: Calculate entropy
    df['entropy_component'] = -df['normalized_count'] * np.log2(df['normalized_count'])
    entropy = df.groupby(group_by)['entropy_component'].sum().reset_index()

    entropy.columns = [group_by, entropy_column]
    return entropy


metadata_only_df["top_level_topic"] = metadata_only_df["topic"].apply(extract_top_level_topic)

# Aggregation of Count of tasks per trainer per top level topic in a new variable trainer_diversity
trainer_diversity = metadata_only_df.groupby(["assigned_to_email", "top_level_topic"]).agg({"topic": "count"}).reset_index()
trainer_diversity = trainer_diversity.rename(columns={"topic": "count"})
trainer_diversity = trainer_diversity.sort_values("count", ascending=False)

# Calculate entropy
topleveltopic_diversity = calculate_diversity(trainer_diversity, entropy_column="topic_diversity")
topleveltopic_diversity

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_only_df["top_level_topic"] = metadata_only_df["topic"].apply(extract_top_level_topic)


Unnamed: 0,assigned_to_email,topic_diversity
0,aarunik.g@turing.com,1.584963
1,aman.s@turing.com,0.0
2,armas.j@turing.com,0.0
3,caram.v@turing.com,0.0
4,elsadek.a@turing.com,0.811278
5,jha.r@turing.com,0.918296
6,khalid.s@turing.com,2.0
7,marcus.a@turing.com,0.0
8,pawan.s@turing.com,0.0
9,ritesh.r@turing.com,0.0


In [29]:
# Hour Tracking Utilization



In [30]:
dpi_df = trainer_avg_turn_duration.merge(trainer_avg_quality, left_on="assigned_to_email", right_on="Author Email", how="inner")
dpi_df = dpi_df.merge(trainer_throughput, on="assigned_to_email", how="inner")
dpi_df = dpi_df.merge(topleveltopic_diversity, on="assigned_to_email", how="inner")
dpi_df = dpi_df.drop(columns=["Author Email"])
dpi_df

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity
0,sudharchith.s@turing.com,27.115385,4.0,6.778846,4.1875,8,2,705,104,1.0
1,shaharyar.t@turing.com,30.0,3.666667,8.181818,2.833333,3,1,90,11,0.0
2,jha.r@turing.com,22.142857,2.285714,9.6875,4.642857,7,3,155,16,0.918296
3,singh.r@turing.com,48.125,4.125,11.666667,4.5,5,3,385,33,1.584963
4,khalid.s@turing.com,47.5,3.916667,12.12766,4.6,5,4,570,47,2.0
5,armas.j@turing.com,46.666667,3.666667,12.727273,4.9,5,1,280,22,0.0
6,toh.y@turing.com,41.5,3.1,13.387097,4.833333,9,4,415,31,1.0
7,elsadek.a@turing.com,54.5,3.928571,13.872727,4.5,6,4,763,55,0.811278
8,caram.v@turing.com,56.363636,3.909091,14.418605,4.642857,7,1,620,43,0.0
9,aarunik.g@turing.com,57.142857,3.714286,15.384615,4.142857,7,3,400,26,1.584963


In [31]:
def transform_to_zscore(sequence):
    """
    Calculate the z-score of a column.

    :param df: The dataframe to calculate z-score on.
    :param column: The column to calculate z-score on.
    """
    return (sequence - sequence.mean()) / sequence.std(ddof=0)


dpi_normalized_df = dpi_df.copy()

# Loop on all numerical columns and transform to z-score
for column in dpi_normalized_df.columns:
    if column != "assigned_to_email":
        dpi_normalized_df[column] = transform_to_zscore(dpi_normalized_df[column])

dpi_normalized_df

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity
0,sudharchith.s@turing.com,-1.039424,0.492545,-1.618884,-0.304302,1.337749,-0.246183,1.536394,2.938218,0.52014
1,shaharyar.t@turing.com,-0.900016,0.165149,-1.316996,-2.957467,-1.682974,-1.107823,-1.392824,-0.808657,-0.907555
2,jha.r@turing.com,-1.279736,-1.19121,-0.993008,0.587861,0.733604,0.615457,-1.083232,-0.607212,0.403491
3,singh.r@turing.com,-0.024071,0.615319,-0.567137,0.307967,-0.474685,0.615457,0.012248,0.0777,1.355287
4,khalid.s@turing.com,-0.054276,0.410696,-0.467942,0.503893,-0.474685,1.477098,0.893395,0.641746,1.947834
5,armas.j@turing.com,-0.094549,0.165149,-0.338919,1.091671,-0.474685,-1.107823,-0.487863,-0.365479,-0.907555
6,toh.y@turing.com,-0.344244,-0.391426,-0.19694,0.961054,1.941893,1.477098,0.155136,-0.002878,0.52014
7,elsadek.a@turing.com,0.28402,0.422389,-0.092444,0.307967,0.12946,1.477098,1.812645,0.964058,0.250703
8,caram.v@turing.com,0.374086,0.403255,0.025017,0.587861,0.733604,-1.107823,1.131542,0.48059,-0.907555
9,aarunik.g@turing.com,0.411744,0.21192,0.23288,-0.391769,0.733604,0.615457,0.083692,-0.204323,1.355287


In [32]:
# Create a final score column as a weighted average of all the columns

weights = {
    # Utilization = 1
    "avg_tracked_mins_per_convo": 0.5,
    "avg_tracked_mins_per_turn": 0.5,

    # Efficiency = 1
    "avg_mins_per_convo": 0.3,
    "avg_mins_per_turn": 0.7  ,

    # Quality = 2
    "avg_quality_score": 2,

    # Throughput = 2
    "total_convos": 0.6,
    "total_turns": 1.4,

    # Diversity = 1
    "topic_diversity": 0.7,
    "turns_diversity": 0.3
}


greater_is_better = {
    # Utilization = 1
    "avg_tracked_mins_per_convo": False,
    "avg_tracked_mins_per_turn": False,

    # Efficiency = 1
    "avg_mins_per_convo": False,
    "avg_mins_per_turn": False,

    # Quality = 2
    "avg_quality_score": True,

    # Throughput = 2
    "total_convos": True,
    "total_turns": True,

    # Diversity = 1
    "topic_diversity": True,
    "turns_diversity": True
}


dpi_normalized_df["final_score"] = 0
count_cols = 0
for column in weights.keys():
    try:
        if greater_is_better[column]:
            dpi_normalized_df["final_score"] += dpi_normalized_df[column] * weights[column]
        else:
            dpi_normalized_df["final_score"] += (dpi_normalized_df[column]*(-1)) * weights[column]
        count_cols += 1
    except KeyError:
        print("KeyError for column:", column)
        continue


dpi_normalized_df["final_score"] = dpi_normalized_df["final_score"] / count_cols
dpi_normalized_df = dpi_normalized_df.sort_values("final_score", ascending=False)
dpi_normalized_df

KeyError for column: avg_tracked_mins_per_convo
KeyError for column: avg_tracked_mins_per_turn
KeyError for column: turns_diversity


Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity,final_score
0,sudharchith.s@turing.com,-1.039424,0.492545,-1.618884,-0.304302,1.337749,-0.246183,1.536394,2.938218,0.52014,0.861056
4,khalid.s@turing.com,-0.054276,0.410696,-0.467942,0.503893,-0.474685,1.477098,0.893395,0.641746,1.947834,0.749969
6,toh.y@turing.com,-0.344244,-0.391426,-0.19694,0.961054,1.941893,1.477098,0.155136,-0.002878,0.52014,0.568261
7,elsadek.a@turing.com,0.28402,0.422389,-0.092444,0.307967,0.12946,1.477098,1.812645,0.964058,0.250703,0.501145
3,singh.r@turing.com,-0.024071,0.615319,-0.567137,0.307967,-0.474685,0.615457,0.012248,0.0777,1.355287,0.407818
2,jha.r@turing.com,-1.279736,-1.19121,-0.993008,0.587861,0.733604,0.615457,-1.083232,-0.607212,0.403491,0.342728
5,armas.j@turing.com,-0.094549,0.165149,-0.338919,1.091671,-0.474685,-1.107823,-0.487863,-0.365479,-0.907555,0.106216
8,caram.v@turing.com,0.374086,0.403255,0.025017,0.587861,0.733604,-1.107823,1.131542,0.48059,-0.907555,0.069805
9,aarunik.g@turing.com,0.411744,0.21192,0.23288,-0.391769,0.733604,0.615457,0.083692,-0.204323,1.355287,-0.006359
10,marcus.a@turing.com,2.241304,1.474736,1.010828,0.699819,-0.474685,-1.107823,-0.464048,-0.647501,-0.907555,-0.364469


In [33]:
# Add final score to the dpi_df
dpi_df = dpi_df.merge(dpi_normalized_df[["assigned_to_email", "final_score"]], on="assigned_to_email", how="inner")
dpi_df = dpi_df.sort_values("final_score", ascending=False)
dpi_df

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity,final_score
0,sudharchith.s@turing.com,27.115385,4.0,6.778846,4.1875,8,2,705,104,1.0,0.861056
4,khalid.s@turing.com,47.5,3.916667,12.12766,4.6,5,4,570,47,2.0,0.749969
6,toh.y@turing.com,41.5,3.1,13.387097,4.833333,9,4,415,31,1.0,0.568261
7,elsadek.a@turing.com,54.5,3.928571,13.872727,4.5,6,4,763,55,0.811278,0.501145
3,singh.r@turing.com,48.125,4.125,11.666667,4.5,5,3,385,33,1.584963,0.407818
2,jha.r@turing.com,22.142857,2.285714,9.6875,4.642857,7,3,155,16,0.918296,0.342728
5,armas.j@turing.com,46.666667,3.666667,12.727273,4.9,5,1,280,22,0.0,0.106216
8,caram.v@turing.com,56.363636,3.909091,14.418605,4.642857,7,1,620,43,0.0,0.069805
9,aarunik.g@turing.com,57.142857,3.714286,15.384615,4.142857,7,3,400,26,1.584963,-0.006359
10,marcus.a@turing.com,95.0,5.0,19.0,4.7,5,1,285,15,0.0,-0.364469


In [92]:
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

load_dotenv(find_dotenv())
opanai = OpenAI()


def fix_missing_roles(messages):
    """
    Fix missing roles in a list of messages.

    :param messages: The list of messages.
    """
    def predict_role(messages_subsequence):
        try:
            response = opanai.chat.completions.create(
                model="gpt-4-1106-preview",
                messages=[
                    {"role":"system", "content": "Your task is to accurately predict whether the empty role is a User or an Assistant. You are only allowed to reply with a single word: 'User' or 'Assistant'."},
                    {"role":"user", "content": f"Here's a part of the conversation including an empty role:\n\n{messages_subsequence}"}
                ],
                temperature=0,
                seed=42
            )
            print(response.choices[0])
            missing_role = response.choices[0].message.content
            assert missing_role in ["User", "Assistant"]
            return missing_role, None
        except Exception as e:
            return None, e

    errors = []
    for i in range(len(messages)):
        if messages[i]["role"] == "":
            subsequence = messages[max(0, i-2):min(len(messages), i+3)]
            messages[i]["role"], error = predict_role(subsequence)
            if error is not None:
                errors.append(error)
    return messages, errors



test = [
    {'role': 'user', 'content': 'Hello'},
    {'role': '', 'content': 'How can I help you?'},
    {'role': 'user', 'content': 'I have a question'},
    {'role': 'assistant', 'content': 'Sure, what is it?'},
    {'role': '', 'content': 'Can you explain this concept to me?'},
    {'role': 'assistant', 'content': 'Of course, here is a brief explanation'},
    {"role": "assistant", "content": "print('Hello World')"},
    {"role": "", "content": "This print should explain the concept for you"},
    {'role': 'user', 'content': 'Thank you!'}
]

fix_missing_roles(test)

Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='User', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)


([{'role': 'user', 'content': 'Hello'},
  {'role': 'Assistant', 'content': 'How can I help you?'},
  {'role': 'user', 'content': 'I have a question'},
  {'role': 'assistant', 'content': 'Sure, what is it?'},
  {'role': 'User', 'content': 'Can you explain this concept to me?'},
  {'role': 'assistant', 'content': 'Of course, here is a brief explanation'},
  {'role': 'assistant', 'content': "print('Hello World')"},
  {'role': 'Assistant',
   'content': 'This print should explain the concept for you'},
  {'role': 'user', 'content': 'Thank you!'}],
 [])