In [6]:
import main.Constants as Constants
from  openai import OpenAI
import os
import numpy as np
import pandas as pd
import time

import gspread
import gspread_dataframe as gd
import gspread_formatting as gf
from gspread_formatting import cellFormat, color, textFormat


## Sample OpenAI Prompt Completion

In [7]:
client = OpenAI(
    api_key = Constants.API_KEY_OPENAI,
)

def get_completion(prompt, model="gpt-4o-mini", temperature=0):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response

## Test Google Sheet API

In [10]:
# No need to provide path for service account.  Json file is in the default directory for gspread at %APPDATA%/gspread/service_account.json
sa = gspread.service_account()
sh = sa.open("New Chinese Words")

sheet_name = "Tua_List"
wks2 = sh.worksheet(sheet_name)
print(f'Sheet {sheet_name} Num Rows: {wks2.row_count}')
print(f'Sheet {sheet_name} Num Columns: {wks2.col_count}')


Sheet Tua_List Num Rows: 1859
Sheet Tua_List Num Columns: 17


In [24]:
current_data = pd.DataFrame(wks2.get_all_values())
current_data.columns = current_data.iloc[0]
current_data = current_data.iloc[1:]
current_data.head()

Unnamed: 0,Word Id,Word,Pinyin,Pinyin Simplified,Type,Word Category,Word Rarity,Meaning,Sentence,Sentence Pinyin,Sentence Meaning,Added Date,Num_Quiz_Attempt,Num_Correct,Num_Wrong,Last_Quiz
1,1,帮助,bang1 zhu4,bang1 zhu4,Noun/Verb,Support,Common,Help/Assistance,我可以帮助你学习中文,Wǒ kěyǐ bāngzhù nǐ xuéxí Zhōngwén.,I can help you study Chinese.,2024-09-21,0,0,0,
2,2,包裹,bao1 guo3,bao1 guo3,Noun,Object,Common,Package,他昨天收到了一个包裹。,Wǒ zuótiān shōudào le bāoguǒ.,I received the package yesterday.,2024-09-21,0,0,0,
3,4,不错,bu2 cuo4,bu2 cuo4,Adjective,Opinion,Common,Good (More positive than 还好),这个电影不错，我们可以一起去看。,Zhè gè diànyǐng bùcuò wǒmen kěyǐ yìqǐ qù kàn,This movie is pretty good; we can go watch it ...,2024-09-21,1,1,0,2024-12-26
4,5,不过,bu2 guo4,bu2 guo4,Grammar,Grammar,Common,But (Similar to dan4 shi4 but less formal),我喜欢这家餐厅，不过价格有点贵。,Wǒ xǐhuān zhè jiā cāntīng búguò jiàgé yǒudiǎn ...,I like this restaurant but the price is a bit ...,2024-09-21,0,0,0,
5,6,不太,bu2 tai4,bu2 tai4,Adjuster,Degree,Common,not quite,不太好,bu2 tai4 hao3,not quite good,2024-09-21,0,0,0,


## Chinese Language Translation

In [20]:
from main.translation import *

In [None]:
dict_sheet_name = "Tua_List"
gsheet_name = "New Chinese Words"


df = load_dict(gsheet_mode=True, gsheet_name=gsheet_name, worksheet_name=dict_sheet_name)
cat = df['Word Category'].drop_duplicates().sort_values().to_list()

translator_pipe = TranslationPipeline(gsheet_name=gsheet_name, worksheet_name=dict_sheet_name)

In [66]:
word_list = """
            适应, 
            阳光, 
            分析, 
            冥想
            """

In [52]:
#translator_pipe.translation_module(word_list=word_list, temp=0.7)

In [53]:
#translator_pipe.new_words_df

In [54]:
#translator_pipe.update_module(overwrite_mode=True)

In [67]:
translator_pipe.run_translation_pipeline(word_list=word_list, translation_model="gpt-4o", temp=0.5, overwrite_mode=True)

['各自']
Word Before: 0   Word Id Word
747    1168   各自
Word After: Empty DataFrame
Columns: [Word Id, Word]
Index: []


'Overwrite mode enabled.  Replacing 1 words and 0 new words added.'

## For Debugging Duplicate Words

In [31]:
if hasattr(translator_pipe, 'new_words_df'):
    message = save_new_words_to_dict(
        newwords_df = translator_pipe.new_words_df,
        gsheet_mode= True,
        overwrite_mode = True,
        gsheet_name = translator_pipe.gsheet_name,
        worksheet_name = translator_pipe.worksheet_name
    )

    message
else:
    raise Exception("Run the translation module first before running the update module.")

Overwrite mode enabled.  Replacing 4 words and 0 new words added.


In [34]:
newwords_df = translator_pipe.new_words_df
gsheet_mode = True
gsheet_name = translator_pipe.gsheet_name
worksheet_name = translator_pipe.worksheet_name
overwrite_mode = True

new_words = newwords_df['Word'].drop_duplicates().values

if gsheet_mode:
    chinese_dict = load_dict(gsheet_mode=gsheet_mode, gsheet_name=gsheet_name, worksheet_name=worksheet_name)

max_id = pd.to_numeric(chinese_dict['Word Id'], errors='coerce').max()
newwords_df['Word Id'] = [num + max_id for num in range(1, len(newwords_df) + 1)]
newwords_df['Num_Quiz_Attempt'] = 0
newwords_df['Num_Correct'] = 0
newwords_df['Num_Wrong'] = 0
newwords_df['Last_Quiz'] = ''

missing_cols = [col for col in chinese_dict.columns if col not in newwords_df.columns]
if len(missing_cols) > 0:
    raise Exception(f"Missing columns in df to add: {missing_cols}")

newwords_df = newwords_df[chinese_dict.columns]
existing_words = chinese_dict['Word'].drop_duplicates().values

starting_words_len = len(existing_words)
new_words_len = len(new_words)

print(new_words)
if overwrite_mode:
    print(f'Word Before: {chinese_dict.loc[chinese_dict.Word.isin(new_words)][['Word Id', 'Word']]}')
    chinese_dict = chinese_dict.loc[~chinese_dict.Word.isin(new_words)]
    print(f'Word After: {chinese_dict.loc[chinese_dict.Word.isin(new_words)][['Word Id', 'Word']]}')
    if len(chinese_dict.loc[chinese_dict.Word.isin(new_words)]) == 0:

        dedup_words_len = len(chinese_dict['Word'].drop_duplicates().values)
        chinese_dict = pd.concat([chinese_dict, newwords_df])
    
    else:
        raise Exception("Some words in the new words list already exist in the dictionary.  Please disable overwrite mode to add new words.")
        
    message = f"Overwrite mode enabled.  Replacing {starting_words_len - dedup_words_len} words and {new_words_len - (starting_words_len - dedup_words_len)} new words added."

else: 
    newwords_df = newwords_df.loc[~newwords_df.Word.isin(existing_words)]
    dedup_words_len = len(newwords_df['Word'].drop_duplicates().values)
    chinese_dict = pd.concat([chinese_dict, newwords_df])
    
    message = f"Overwrite mode disabled.  {new_words_len - dedup_words_len} exists in current dictionary, adding {dedup_words_len} words."

chinese_dict = chinese_dict.loc[(chinese_dict['Word Id'].notnull()) & (chinese_dict['Word Id'] != '')]
chinese_dict['Word'] = chinese_dict['Word'].str.strip()

if gsheet_mode:
    #Save empty df first to clear the worksheet
    save_df_to_gsheet(gsheet_name, worksheet_name, pd.DataFrame(), overwrite_mode=overwrite_mode)
    save_df_to_gsheet(gsheet_name, worksheet_name, chinese_dict, overwrite_mode=overwrite_mode)

message

['匆忙' '不及' '医院' '兜']
Word Before: 0   Word Id Word
720    1096   匆忙
721    1097   不及
722    1098   医院
723    1099    兜
724    1100    兜
725    1075   不及
726    1079    兜
727    1080    兜
Word After: Empty DataFrame
Columns: [Word Id, Word]
Index: []


'Overwrite mode enabled.  Replacing 4 words and 0 new words added.'

In [37]:
chinese_dict2 = load_dict(gsheet_mode=gsheet_mode, gsheet_name=gsheet_name, worksheet_name=worksheet_name)
chinese_dict2

Unnamed: 0,Word Id,Word,Pinyin,Pinyin Simplified,Type,Word Category,Word Rarity,Meaning,Sentence,Sentence Pinyin,Sentence Meaning,Added Date,Num_Quiz_Attempt,Num_Correct,Num_Wrong,Last_Quiz
1,1,帮助,bang1 zhu4,bang1 zhu4,Noun/Verb,Support,Common,Help/Assistance,我可以帮助你学习中文,Wǒ kěyǐ bāngzhù nǐ xuéxí Zhōngwén.,I can help you study Chinese.,2024-09-21,0,0,0,
2,2,包裹,bao1 guo3,bao1 guo3,Noun,Object,Common,Package,他昨天收到了一个包裹。,Wǒ zuótiān shōudào le bāoguǒ.,I received the package yesterday.,2024-09-21,0,0,0,
3,4,不错,bu2 cuo4,bu2 cuo4,Adjective,Opinion,Common,Good (More positive than 还好),这个电影不错，我们可以一起去看。,Zhè gè diànyǐng bùcuò wǒmen kěyǐ yìqǐ qù kàn,This movie is pretty good; we can go watch it ...,2024-09-21,0,0,0,
4,5,不过,bu2 guo4,bu2 guo4,Grammar,Grammar,Common,But (Similar to dan4 shi4 but less formal),我喜欢这家餐厅，不过价格有点贵。,Wǒ xǐhuān zhè jiā cāntīng búguò jiàgé yǒudiǎn ...,I like this restaurant but the price is a bit ...,2024-09-21,0,0,0,
5,6,不太,bu2 tai4,bu2 tai4,Adjuster,Degree,Common,not quite,不太好,bu2 tai4 hao3,not quite good,2024-09-21,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,1104,兜,dōu,dou1,Noun,Object,Common,Pocket,他把钥匙放在了兜里。,Tā bǎ yàoshi fàng zài le dōu lǐ.,He put the keys in his pocket.,2024-10-26,0,0,0,
724,1105,兜,dōu,dou1,Verb,Action,Common,To go around or circle; to detour,我们兜了一圈才找到那家餐馆。,Wǒmen dōu le yī quān cái zhǎodào nà jiā cānguǎn.,We circled around before finding the restaurant.,2024-10-26,0,0,0,
725,1075,不及,bù jí,bu4 ji2,Verb,Degree,Rare,Not reach; fail to meet a standard,他的成绩不及格。,Tā de chéngjì bù jí gé.,His grades fail to meet the passing standard.,2024-10-26,0,0,0,
726,1079,兜,dōu,dou1,Noun,Object,Common,Pocket,他把钥匙放进了兜里。,Tā bǎ yàoshi fàng jìnle dōu lǐ.,He put the key into his pocket.,2024-10-26,0,0,0,


In [38]:
sa = gspread.service_account()
sh = sa.open(gsheet_name)
wks = sh.worksheet(worksheet_name)

if not overwrite_mode:
    print('Not Overwrite Mode')
    existing = gd.get_as_dataframe(wks)
    df_to_save = pd.concat([existing, df_to_save])

wks.batch_clear(["A:Q"])
gd.set_with_dataframe(wks, pd.DataFrame())

In [40]:
wks.batch_clear(["A:Q"])

{'spreadsheetId': '1GVz04WQyGWf4e6dH0Bwdb_YLNF_ZuUks0YWrEUOUF-c',
 'clearedRanges': ['Tua_List!A1:Q900']}

In [41]:
save_df_to_gsheet(gsheet_name, worksheet_name, chinese_dict, overwrite_mode=overwrite_mode)

In [35]:
chinese_dict

Unnamed: 0,Word Id,Word,Pinyin,Pinyin Simplified,Type,Word Category,Word Rarity,Meaning,Sentence,Sentence Pinyin,Sentence Meaning,Added Date,Num_Quiz_Attempt,Num_Correct,Num_Wrong,Last_Quiz
1,1,帮助,bang1 zhu4,bang1 zhu4,Noun/Verb,Support,Common,Help/Assistance,我可以帮助你学习中文,Wǒ kěyǐ bāngzhù nǐ xuéxí Zhōngwén.,I can help you study Chinese.,2024-09-21,0,0,0,
2,2,包裹,bao1 guo3,bao1 guo3,Noun,Object,Common,Package,他昨天收到了一个包裹。,Wǒ zuótiān shōudào le bāoguǒ.,I received the package yesterday.,2024-09-21,0,0,0,
3,4,不错,bu2 cuo4,bu2 cuo4,Adjective,Opinion,Common,Good (More positive than 还好),这个电影不错，我们可以一起去看。,Zhè gè diànyǐng bùcuò wǒmen kěyǐ yìqǐ qù kàn,This movie is pretty good; we can go watch it ...,2024-09-21,0,0,0,
4,5,不过,bu2 guo4,bu2 guo4,Grammar,Grammar,Common,But (Similar to dan4 shi4 but less formal),我喜欢这家餐厅，不过价格有点贵。,Wǒ xǐhuān zhè jiā cāntīng búguò jiàgé yǒudiǎn ...,I like this restaurant but the price is a bit ...,2024-09-21,0,0,0,
5,6,不太,bu2 tai4,bu2 tai4,Adjuster,Degree,Common,not quite,不太好,bu2 tai4 hao3,not quite good,2024-09-21,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1101,匆忙,cōng máng,cong1 mang2,Adjective,Time,Common,Hurried or rushed,他匆忙地赶去上班。,Tā cōngmáng de gǎn qù shàngbān.,He hurriedly rushed to work.,2024-10-26,0,0,0,
1,1102,不及,bù jí,bu4 ji2,Verb,Contrast,Rare,To not be as good as; to fall short of,他的成绩不及她。,Tā de chéngjì bù jí tā.,His grades are not as good as hers.,2024-10-26,0,0,0,
2,1103,医院,yīyuàn,yi1 yuan4,Noun,Health,Common,Hospital,她生病了，必须去医院。,"Tā shēngbìng le, bìxū qù yīyuàn.",She is sick and must go to the hospital.,2024-10-26,0,0,0,
3,1104,兜,dōu,dou1,Noun,Object,Common,Pocket,他把钥匙放在了兜里。,Tā bǎ yàoshi fàng zài le dōu lǐ.,He put the keys in his pocket.,2024-10-26,0,0,0,


## Step by Step Pipeline

In [9]:
sample_response_translation = (
    get_completion(
        prompt=get_prompt_for_chinese_translation(word_list), model="gpt-4o-mini" , temperature=temp))
content = sample_response_translation.choices[0].message.content

newwords_df = (
    parse_translation_response(
        content,
        ffill_cols = ['Word', 'Pinyin', 'Pinyin Simplified', 'Type'],
        date_col = ['Added Date']
        )
      )

new_words = newwords_df['Word'].drop_duplicates().values
newwords_df

Unnamed: 0,Word,Pinyin,Pinyin Simplified,Type,Word Category,Meaning,Sentence,Sentence Pinyin,Sentence Meaning,Added Date
1,医院,yīyuàn,yi1 yuan4,Noun,Health,Hospital,他需要去医院看医生。,Tā xūyào qù yīyuàn kàn yīshēng.,He needs to go to the hospital to see a doctor.,2024-10-25


In [35]:
sample_response_translation = (
    get_completion(
        prompt=get_prompt_for_rarity_classification(word_list), model="gpt-4o-mini" , temperature=temp))
content = sample_response_translation.choices[0].message.content

word_rarity_df = parse_translation_response(content)
word_rarity_df

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,Word,Word Rarity
1,地图,Common
2,晚点,Common
3,起飞,Common
4,降落,Common


In [36]:
save_new_words_to_dict(
    newwords_df = newwords_df.merge(word_rarity_df, on='Word', how='left'),
    gsheet_mode= True,
    overwrite_mode = True,
    gsheet_name = gsheet_name,
    worksheet_name = dict_sheet_name)

Overwrite mode enabled.  Replacing 0 words and 4 new words added.


## Batch Update

In [11]:
df = load_dict(gsheet_mode=True, gsheet_name=gsheet_name, worksheet_name=dict_sheet_name)
word_list = df['Word'].drop_duplicates().to_list()

In [14]:
import time
from tqdm import tqdm

all_dfs = []
max_retries = 3  # Set the number of retries per item
attempt = 0
batch_size = 5 

for i in tqdm(range(0, len(word_list), batch_size)):
    if attempt >= max_retries:
        break
    attempt = 0  # Reset the attempt counter if the operation succeeds
    for _ in range(max_retries):
        attempt += 1
        try:
            chunk = word_list[i:i+batch_size]
            rarity_prompt = get_prompt_for_rarity_classification(chunk)
            sample_response_translation = get_completion( prompt=rarity_prompt, model="gpt-4o-mini" , temperature=temp)
            content = sample_response_translation.choices[0].message.content
            word_rarity_df = parse_translation_response(content)
            all_dfs.append(word_rarity_df) 
            break  # If the task succeeds, move on to the next item
        except Exception as e:
            print(f"Attempt {attempt} failed for {chunk}: {e}")
            if attempt < max_retries :
                time.sleep(1)  # Optional: Wait for 1 second before retrying
            else:
                print(f"Exceed Maximum Retries for {chunk}")


100%|██████████| 108/108 [01:36<00:00,  1.12it/s]


In [15]:
df_result = pd.concat(all_dfs)
df_result['Word Rarity'].value_counts()

df_merge = df.merge(df_result.rename(columns={'Word Rarity': 'Rarity2'}), on='Word', how='left')
df_merge['Word Rarity'] = np.where(df_merge['Word Rarity'] == "", df_merge['Rarity2'], df_merge['Word Rarity'])

Common    509
Rare       27
Name: Word Rarity, dtype: int64

In [30]:
#save_df_to_gsheet(gsheet_name, dict_sheet_name, df_merge, overwrite_mode=True)

## Prompt Mentoring

In [69]:
client = OpenAI(api_key="<Insert yours>")

equality_statement = "Interprete this 'Edge_PB_Share_Avg > 0.82', 'EnterpriseMobilityCoreE3Rev > 1907.40', '5921.00 < AADPAllUp_MAU <= 15701.50',"

def interpret_equality_statement(equality_statement):
    # Initialize variables for tracking attempts
    attempt_count = 0
    max_attempts = 5
    highest_score = 0
    feedback_dict = {}
 
    while attempt_count < max_attempts:
        # MLS Simulation: Generate interpretation
        interpretation = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": """You are a Machine Learning Scientist who converts equality statements into proper English.
                        An equality statement has the pattern "<measure> <equality> <value>" or "<value> <equality> <measure> <equality> <value>".

                        <measure> has a mapping to proper English:
                        Edge_PB_Share_Avg : Edge Primary Browser Share
                        SPO_MAU : SharePoint Online MAU
                        EnterpriseMobilityCoreE3Rev : EMS E3 Revenue
                        AADPAllUp_MAU : Entra ID MAU

                        if <value> is less than 1, it is a percentage.
                        if <value> is greater than 999, it must have comma(s).
                        if <value> is greater than 1, it must be rounded to the nearest whole number.
                        if <measure> has MAU, round <value> to the nearest hundred.

                        You always start an English sentence with the phrase "We recommend this product for this tenant because..."

                    """
                },
                {
                    "role": "user",
                    "content": equality_statement
                }
            ]
        )
        generated_interpretation = interpretation.choices[0].message.content

        # Mentor Simulation: Generate mentor feedback
        mentor_feedback = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": """You are a senior leader in an organization who evaluates the output of the Machine Learning Scientist. You will give them a score of their work from 1 to 10, and provide reasons and comments.

                        Evaluation criteria:
                        1. The equality statement should have the pattern "<measure> <equality> <value>" or "<value> <equality> <measure> <equality> <value>".
                        2. The statement needs to be mapped to proper English:
                            - Edge_PB_Share_Avg: Edge Primary Browser Share
                            - SPO_MAU: SharePoint Online MAU
                            - EnterpriseMobilityCoreE3Rev: EMS E3 Revenue
                            - AADPAllUp_MAU: Entra ID MAU
                        3. If <value> is less than 1, it's a percentage.
                        4. If <value> is greater than 999, it should have commas.
                        5. If <value> is greater than 1, round to the nearest whole number.
                        6. If the <measure> has MAU, round the <value> to the nearest hundred.
                        7. The English sentence should always start with the phrase "We recommend this product for this tenant because...".
                        8. The response should be in professional English.
                        9. Your mentee should not say the result is rounded or anything related to how they finish the process in the response, they should stick with the precise interpretation

                        # Good Examples:
                        Sharepoint Online usage is above 80%
                        Azure Revenue is more than $1000
                        the Entra ID MAU is between 5,900 and 15,700

                        # Bad Examples:
                        the Entrta ID MAU is between 5921 and 15701, rounded to the nearest hundred

                     Sample output: Score: 8.5; Comments: This is good. You missed the SPO_MAU : SharePoint Online MAU converter
                    This is just a sample output, you don't have to follow 100%, but think and give your feedback, if there is no mistake, you can give a 10 as well!
                    But you need to strictly follow the sample output format with Score:<your score, one number or float>; Comments:< Your comments>  
                    """
                },
                {
                    "role": "user",
                    "content": f"Evaluate this work and provide a score and comment.\nOriginal equality statement: {equality_statement}\nWork: {generated_interpretation}"
                }
            ]
        )

        # Parse mentor's feedback
        mentor_response = mentor_feedback.choices[0].message.content
        score = float(mentor_response.split('Score: ')[1].split(';')[0])
        comment = mentor_response.split('Comments: ')[1]
 
        # Check if the score is acceptable
        if score >= 8.5:
            return f"Interpretation Approved: {generated_interpretation}"
 
        # If score is below 8.5, store the feedback
        feedback_dict[attempt_count] = {'score': score, 'comment': comment}
        highest_score = max(highest_score, score)

        # Output feedback and ask user to revise with mentor's comment attached
        print(f"Attempt {attempt_count + 1}: Score: {score}, Feedback: {comment}")
        equality_statement = input(f"Please revise your equality statement based on the mentor's feedback:\n{comment}\nYour revised statement: ")

        attempt_count += 1
 
    # If the loop ends without a passing score, output the highest score
    return f"Max attempts reached. Highest score: {highest_score}, Feedback from last attempt: {feedback_dict[attempt_count-1]['comment']}"


# Example usage
result = interpret_equality_statement("Interprete this 'Edge_PB_Share_Avg > 0.82', 'EnterpriseMobilityCoreE3Rev > 1907.40', '5921.00 < AADPAllUp_MAU <= 15701.50' to english")
print(result)

             

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: <Insert **urs>. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}