In [1]:
import Constants
from  openai import OpenAI
import os
import numpy as np
import pandas as pd
import time

import gspread
import gspread_dataframe as gd
import gspread_formatting as gf
from gspread_formatting import cellFormat, color, textFormat


## Sample OpenAI Prompt Completion

In [2]:
client = OpenAI(
    api_key = Constants.API_KEY_OPENAI,
)

def get_completion(prompt, model="gpt-4o-mini", temperature=0):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response

## Test Google Sheet API

In [3]:
# No need to provide path for service account.  Json file is in the default directory for gspread at %APPDATA%/gspread/service_account.json
sa = gspread.service_account()
sh = sa.open("New Chinese Words")

sheet_name = "Words"
wks2 = sh.worksheet(sheet_name)
print(f'Sheet {sheet_name} Num Rows: {wks2.row_count}')
print(f'Sheet {sheet_name} Num Columns: {wks2.col_count}')


Sheet Words Num Rows: 99
Sheet Words Num Columns: 8


In [4]:
current_data = pd.DataFrame(wks2.get_all_values())
current_data.columns = current_data.iloc[0]
current_data = current_data.iloc[1:]
current_data.head()

Unnamed: 0,Word,Pinyin,Type,Meaning,Sentence,Sentence Pinyin,Sentence Meaning
1,大概,da4 gai4,Adjective,Approximately,他大概已经出发了,Tā dàgài yǐjīng chūfā le.,He probably already left
2,被,bei4,Particle,Particle word to turn into passive voice. (.i...,甜甜圈被吃掉了,tian2 tian2 quan1 bei4 chi1 diao4 le,The donut was eaten
3,往前,wang3 qian2,Adjective,Go forward,往前走,wang3 qian2 zou3,
4,觉得,jue2 de,Verb,Think,你觉得她怎么样,ni3 jue2 de ta1 zen3 me yang,What do you think about her
5,已经,yi3 jing1,Adverb,Already,我已经吃过晚饭了,Wǒ yǐjīng chī guò wǎnfàn le.,I have already had dinner


## Chinese Language Translation

In [5]:
from main.translation import *

In [6]:
temp = 0.7
overwrite_mode = False

dict_path = "ChineseWords/ChineseWordList.csv"
dict_sheet_name = "Tua_List"
gsheet_name = "New Chinese Words"


df = load_dict(gsheet_mode=True, gsheet_name=gsheet_name, worksheet_name=dict_sheet_name)
cat = df['Word Category'].drop_duplicates().sort_values().to_list()

In [11]:
word_list = """
            空当, 
            见， 
            看
            """

In [None]:
run_translation_pipeline(
    word_list = word_list, 
    gsheet_name = gsheet_name, 
    worksheet_name = dict_sheet_name, 
    overwrite_mode = True,
    translation_model = "gpt-4o", 
    rarity_model = "gpt-4o-mini",
    temp = 0.7,
    )

## Step by Step Pipeline

In [34]:
sample_response_translation = (
    get_completion(
        prompt=get_prompt_for_chinese_translation(word_list), model="gpt-4o-mini" , temperature=temp))
content = sample_response_translation.choices[0].message.content

newwords_df = (
    parse_translation_response(
        content,
        ffill_cols = ['Word', 'Pinyin', 'Pinyin Simplified', 'Type'],
        date_col = ['Added Date']
        )
      )

new_words = newwords_df['Word'].drop_duplicates().values
newwords_df

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,Word,Pinyin,Pinyin Simplified,Type,Word Category,Meaning,Sentence,Sentence Pinyin,Sentence Meaning,Added Date
1,地图,dì tú,di4 tu2,Noun,Geography,Map,我需要一张地图来找到那家餐厅。,Wǒ xūyào yī zhāng dìtú lái zhǎodào nà jiā cānt...,I need a map to find that restaurant.,2024-10-23
2,晚点,wǎn diǎn,wan3 dian3,Verb,Time,To be delayed; typically used for transportati...,飞机晚点了一个小时。,Fēijī wǎndiǎn le yī gè xiǎoshí.,The flight was delayed by an hour.,2024-10-23
3,起飞,qǐ fēi,qi3 fei1,Verb,Travel,To take off; used for airplanes.,飞机将在下午三点起飞。,Fēijī jiāng zài xiàwǔ sān diǎn qǐfēi.,The plane will take off at 3 PM.,2024-10-23
4,降落,jiàng luò,jiang4 luo4,Verb,Travel,To land; used for airplanes.,飞机已经安全降落在机场。,Fēijī yǐjīng ānquán jiàngluò zài jīchǎng.,The plane has landed safely at the airport.,2024-10-23


In [35]:
sample_response_translation = (
    get_completion(
        prompt=get_prompt_for_rarity_classification(word_list), model="gpt-4o-mini" , temperature=temp))
content = sample_response_translation.choices[0].message.content

word_rarity_df = parse_translation_response(content)
word_rarity_df

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,Word,Word Rarity
1,地图,Common
2,晚点,Common
3,起飞,Common
4,降落,Common


In [36]:
save_new_words_to_dict(
    newwords_df = newwords_df.merge(word_rarity_df, on='Word', how='left'),
    gsheet_mode= True,
    overwrite_mode = True,
    gsheet_name = gsheet_name,
    worksheet_name = dict_sheet_name)

Overwrite mode enabled.  Replacing 0 words and 4 new words added.


## Batch Update

In [11]:
df = load_dict(gsheet_mode=True, gsheet_name=gsheet_name, worksheet_name=dict_sheet_name)
word_list = df['Word'].drop_duplicates().to_list()

In [14]:
import time
from tqdm import tqdm

all_dfs = []
max_retries = 3  # Set the number of retries per item
attempt = 0
batch_size = 5 

for i in tqdm(range(0, len(word_list), batch_size)):
    if attempt >= max_retries:
        break
    attempt = 0  # Reset the attempt counter if the operation succeeds
    for _ in range(max_retries):
        attempt += 1
        try:
            chunk = word_list[i:i+batch_size]
            rarity_prompt = get_prompt_for_rarity_classification(chunk)
            sample_response_translation = get_completion( prompt=rarity_prompt, model="gpt-4o-mini" , temperature=temp)
            content = sample_response_translation.choices[0].message.content
            word_rarity_df = parse_translation_response(content)
            all_dfs.append(word_rarity_df) 
            break  # If the task succeeds, move on to the next item
        except Exception as e:
            print(f"Attempt {attempt} failed for {chunk}: {e}")
            if attempt < max_retries :
                time.sleep(1)  # Optional: Wait for 1 second before retrying
            else:
                print(f"Exceed Maximum Retries for {chunk}")


100%|██████████| 108/108 [01:36<00:00,  1.12it/s]


In [15]:
df_result = pd.concat(all_dfs)
df_result['Word Rarity'].value_counts()

df_merge = df.merge(df_result.rename(columns={'Word Rarity': 'Rarity2'}), on='Word', how='left')
df_merge['Word Rarity'] = np.where(df_merge['Word Rarity'] == "", df_merge['Rarity2'], df_merge['Word Rarity'])

Common    509
Rare       27
Name: Word Rarity, dtype: int64

In [30]:
#save_df_to_gsheet(gsheet_name, dict_sheet_name, df_merge, overwrite_mode=True)

## Prompt Mentoring

In [69]:
client = OpenAI(api_key="<Insert yours>")

equality_statement = "Interprete this 'Edge_PB_Share_Avg > 0.82', 'EnterpriseMobilityCoreE3Rev > 1907.40', '5921.00 < AADPAllUp_MAU <= 15701.50',"

def interpret_equality_statement(equality_statement):
    # Initialize variables for tracking attempts
    attempt_count = 0
    max_attempts = 5
    highest_score = 0
    feedback_dict = {}
 
    while attempt_count < max_attempts:
        # MLS Simulation: Generate interpretation
        interpretation = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": """You are a Machine Learning Scientist who converts equality statements into proper English.
                        An equality statement has the pattern "<measure> <equality> <value>" or "<value> <equality> <measure> <equality> <value>".

                        <measure> has a mapping to proper English:
                        Edge_PB_Share_Avg : Edge Primary Browser Share
                        SPO_MAU : SharePoint Online MAU
                        EnterpriseMobilityCoreE3Rev : EMS E3 Revenue
                        AADPAllUp_MAU : Entra ID MAU

                        if <value> is less than 1, it is a percentage.
                        if <value> is greater than 999, it must have comma(s).
                        if <value> is greater than 1, it must be rounded to the nearest whole number.
                        if <measure> has MAU, round <value> to the nearest hundred.

                        You always start an English sentence with the phrase "We recommend this product for this tenant because..."

                    """
                },
                {
                    "role": "user",
                    "content": equality_statement
                }
            ]
        )
        generated_interpretation = interpretation.choices[0].message.content

        # Mentor Simulation: Generate mentor feedback
        mentor_feedback = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": """You are a senior leader in an organization who evaluates the output of the Machine Learning Scientist. You will give them a score of their work from 1 to 10, and provide reasons and comments.

                        Evaluation criteria:
                        1. The equality statement should have the pattern "<measure> <equality> <value>" or "<value> <equality> <measure> <equality> <value>".
                        2. The statement needs to be mapped to proper English:
                            - Edge_PB_Share_Avg: Edge Primary Browser Share
                            - SPO_MAU: SharePoint Online MAU
                            - EnterpriseMobilityCoreE3Rev: EMS E3 Revenue
                            - AADPAllUp_MAU: Entra ID MAU
                        3. If <value> is less than 1, it's a percentage.
                        4. If <value> is greater than 999, it should have commas.
                        5. If <value> is greater than 1, round to the nearest whole number.
                        6. If the <measure> has MAU, round the <value> to the nearest hundred.
                        7. The English sentence should always start with the phrase "We recommend this product for this tenant because...".
                        8. The response should be in professional English.
                        9. Your mentee should not say the result is rounded or anything related to how they finish the process in the response, they should stick with the precise interpretation

                        # Good Examples:
                        Sharepoint Online usage is above 80%
                        Azure Revenue is more than $1000
                        the Entra ID MAU is between 5,900 and 15,700

                        # Bad Examples:
                        the Entrta ID MAU is between 5921 and 15701, rounded to the nearest hundred

                     Sample output: Score: 8.5; Comments: This is good. You missed the SPO_MAU : SharePoint Online MAU converter
                    This is just a sample output, you don't have to follow 100%, but think and give your feedback, if there is no mistake, you can give a 10 as well!
                    But you need to strictly follow the sample output format with Score:<your score, one number or float>; Comments:< Your comments>  
                    """
                },
                {
                    "role": "user",
                    "content": f"Evaluate this work and provide a score and comment.\nOriginal equality statement: {equality_statement}\nWork: {generated_interpretation}"
                }
            ]
        )

        # Parse mentor's feedback
        mentor_response = mentor_feedback.choices[0].message.content
        score = float(mentor_response.split('Score: ')[1].split(';')[0])
        comment = mentor_response.split('Comments: ')[1]
 
        # Check if the score is acceptable
        if score >= 8.5:
            return f"Interpretation Approved: {generated_interpretation}"
 
        # If score is below 8.5, store the feedback
        feedback_dict[attempt_count] = {'score': score, 'comment': comment}
        highest_score = max(highest_score, score)

        # Output feedback and ask user to revise with mentor's comment attached
        print(f"Attempt {attempt_count + 1}: Score: {score}, Feedback: {comment}")
        equality_statement = input(f"Please revise your equality statement based on the mentor's feedback:\n{comment}\nYour revised statement: ")

        attempt_count += 1
 
    # If the loop ends without a passing score, output the highest score
    return f"Max attempts reached. Highest score: {highest_score}, Feedback from last attempt: {feedback_dict[attempt_count-1]['comment']}"


# Example usage
result = interpret_equality_statement("Interprete this 'Edge_PB_Share_Avg > 0.82', 'EnterpriseMobilityCoreE3Rev > 1907.40', '5921.00 < AADPAllUp_MAU <= 15701.50' to english")
print(result)

             

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: <Insert **urs>. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}