In [2]:
import Constants
from  openai import OpenAI
import os
import pandas as pd
import time

import gspread
import gspread_dataframe as gd
import gspread_formatting as gf
from gspread_formatting import cellFormat, color, textFormat


## Sample OpenAI Prompt Completion

In [3]:
client = OpenAI(
    api_key = Constants.API_KEY_OPENAI,
)

def get_completion(prompt, model="gpt-4o-mini", temperature=0):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response

In [4]:
prompt = """
Can you generate generate ten pairs of text between two to twenty words long each and label with emotion that most strongly associate with the text.   There are six possible emotions for labeling ("sadness", "joy", "love", "anger", "fear", "surprise").  Make sure there's roughly equal number of each emotion.   

An example would look like this

1) 'i didnt feel humiliated' = 'sadness'
 2) 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake' = 'sadness'
 3)  'im grabbing a minute to post i feel greedy wrong' = 'anger'
 4) 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property' = 'love'
 5)  'i am feeling grouchy' = 'anger'

 Can you generate fifty pairs?
"""

In [5]:
sample_response_high_temp = get_completion(prompt=prompt, temperature=1.5)

In [6]:
print(sample_response_high_temp.choices[0].message.content)

Certainly! Here are ten pairs labeled with their associated emotions. Each pair addresses distinct characteristics of the emotional spectrum you've provided. Here they range in words from two to twenty:

1) "I lost something I cherished." = 'sadness'
2) "Nothing beats the feeling of warm sunshine." = 'joy'
3) "I would do anything to keep you safe." = 'love'
4) "It's infuriating to see you waste your talent!" = 'anger'
5) "I can't handle the uncertainty anymore." = 'fear'
6) "Surprisingly, everything turned out better than expected!" = 'surprise'
7) "Remembering how lonely I felt last year." = 'sadness'
8) "There's so much to celebrate this weekend!" = 'joy'
9) "Your compassion warms my heart softly." = 'love'
10) "It's hard not to explode with frustration." = 'anger'

11) "My heart scares me sometimes." = 'fear'
12) "I was shocked to see her at the event." = 'surprise'
13) "A farewell also means closure— I wish we had more time." = 'sadness'
14) "Laughing with friends fuels my spirit!"

In [7]:
sample_response = get_completion(prompt=prompt)

In [8]:
print(sample_response.choices[0].message.content)

Sure! Here are fifty pairs of text labeled with the associated emotions:

1) 'I can't believe they left me behind' = 'sadness'  
2) 'The sun is shining, and I feel alive!' = 'joy'  
3) 'I never knew love could feel this deep' = 'love'  
4) 'Why did you betray my trust?' = 'anger'  
5) 'I can't shake this feeling of dread' = 'fear'  

6) 'I miss the days when we were together' = 'sadness'  
7) 'Every moment with you is a treasure' = 'love'  
8) 'I just won the lottery!' = 'joy'  
9) 'How could you say that to me?' = 'anger'  
10) 'What if I fail this time?' = 'fear'  

11) 'I feel so alone in this crowded room' = 'sadness'  
12) 'Your smile brightens my day' = 'joy'  
13) 'I cherish every memory we made' = 'love'  
14) 'This is completely unacceptable!' = 'anger'  
15) 'I heard a strange noise outside' = 'fear'  

16) 'I can't believe it's over' = 'sadness'  
17) 'Today was the best day ever!' = 'joy'  
18) 'You mean the world to me' = 'love'  
19) 'I can't stand how they treated you' =

## Test Google Sheet API

In [9]:
# No need to provide path for service account.  Json file is in the default directory for gspread at %APPDATA%/gspread/service_account.json
sa = gspread.service_account()
sh = sa.open("New Chinese Words")

sheet_name = "Words"
wks2 = sh.worksheet(sheet_name)
print(f'Sheet {sheet_name} Num Rows: {wks2.row_count}')
print(f'Sheet {sheet_name} Num Columns: {wks2.col_count}')


Sheet Words Num Rows: 99
Sheet Words Num Columns: 8


In [10]:
current_data = pd.DataFrame(wks2.get_all_values())
current_data.columns = current_data.iloc[0]
current_data = current_data.iloc[1:]
current_data.head()

Unnamed: 0,Word,Pinyin,Type,Meaning,Sentence,Sentence Pinyin,Sentence Meaning
1,大概,da4 gai4,Adjective,Approximately,他大概已经出发了,Tā dàgài yǐjīng chūfā le.,He probably already left
2,被,bei4,Particle,Particle word to turn into passive voice. (.i...,甜甜圈被吃掉了,tian2 tian2 quan1 bei4 chi1 diao4 le,The donut was eaten
3,往前,wang3 qian2,Adjective,Go forward,往前走,wang3 qian2 zou3,
4,觉得,jue2 de,Verb,Think,你觉得她怎么样,ni3 jue2 de ta1 zen3 me yang,What do you think about her
5,已经,yi3 jing1,Adverb,Already,我已经吃过晚饭了,Wǒ yǐjīng chī guò wǎnfàn le.,I have already had dinner


## Chinese Language Prompt

In [11]:
temp = 0.7
overwrite_mode = False

dict_path = "ChineseWords/ChineseWordList.csv"
dict_sheet_name = "AutoWordList"
gsheet_name = "New Chinese Words"

In [12]:
def load_dict(
        dict_path: str = None, 
        gsheet_mode=False, 
        gsheet_name = None, 
        worksheet_name = None
        ):
    if gsheet_mode:
        sa = gspread.service_account()
        sh = sa.open(gsheet_name)
        wks = sh.worksheet(worksheet_name)
        current_data = pd.DataFrame(wks.get_all_values())
        current_data.columns = current_data.iloc[0]
        current_data = current_data.iloc[1:]
        return current_data
    
    else:
        if os.path.exists(dict_path):
            return pd.read_csv(dict_path)
        else:
            return pd.DataFrame(columns=["Chinese", "English", "Pinyin"])

     
def save_df_to_gsheet(
        gsheet_name, 
        wks_name,
        df_to_save,
        overwrite_mode = False
    ):
    sa = gspread.service_account()
    sh = sa.open(gsheet_name)
    wks = sh.worksheet(wks_name)

    if not overwrite_mode:
        existing = gd.get_as_dataframe(wks)
        df_to_save = pd.concat([existing, df_to_save])
    
    gd.set_with_dataframe(wks, df_to_save)


def format_gsheet(
        gsheet_name, 
        wks_name
    ):
    sa = gspread.service_account()
    sh = sa.open(gsheet_name)
    wks = sh.wks(wks_name)

    fmt = cellFormat(
        backgroundColor=color(0.6, 0.8, 0.9),
        textFormat=textFormat(bold=True, fontSize=15, foregroundColor=color(0, 0, 0.6)),
        horizontalAlignment='CENTER'
        )

    gf.format_cell_range(wks, 'A1:H1', fmt)

    fmt = cellFormat(
        textFormat=textFormat(fontSize=15),
        )

    gf.format_cell_range(wks, 'A2:A500', fmt)
    gf.format_cell_range(wks, 'E2:E500', fmt)

In [13]:
#cat = pd.DataFrame(sa.open("New Chinese Words").worksheet(dict_sheet_name).get_all_records())['Word Category'].unique()
cat = ['General', 'Grammar', 'Direction', 'Opinion', 'Time',
       'Description', 'Organization', 'Travel', 'Social', 'Technology',
       'Health', 'Object', 'Work', 'Intent', 'Geography', 'Agriculture',
       'Weather', 'Action', 'Problem Solving', 'Necessity', 'Support',
       'Business', 'Information', 'Emotion', 'Assurance', 'Economics',
       'Degree', 'Frequency', 'Question', 'Location', 'Sequence',
       'Contrast', 'Thought', 'Relationship']

def get_prompt_for_chinese_translation(chinese_words, existing_categories=cat):
    chinese_prompt =  f"""
    For each of the input Chinese words, please output the following as a one row in a table.  There should be the following columns in table related to the word.  

    Generate output similar to the following example: 
    1. Word:  农场
    2. Pinyin:  nong2 chang3
    3. Type:   Noun (This should be adjusted whether the meaning of the word is noun/adjective/verb based on the meaning and example sentence)
    4. Meaning:  Farm (This is could be a longer description of the meaning of the word if no exact translation exists in English)
    5. Sentence:  我暑假打算去爷爷的农场帮忙  
    6. Sentence Pinyin:  Wǒ shǔjià dǎsuàn qù yéye de nóngchǎng bāngmáng. 
    7. Sentence Meaning:  I plan to go to my grandfather's farm to help during the summer vacation.
    8. Word Category: Agriculture (This should be a general category that the word belongs to)

    For each word with multiple meanings, add more rows to the the table with alternate meaning and example sentence.  Each row should have a unique meaning.
    If there is only one meaning, then keep only one row for each word.  Do not add rows for alternate meanings if there is only one meaning.  If the two meanings are sufficiently similar then they can be included in the same row.
    If the meanings are different, then the second meaning should be in a new row with the same word, pinyin, word type, and sentence.  Don't omit any values in any row even if they are the same as the row above.
    
    All input words should be included in one table.  Only return the table with no other text.

    Input Chinese Word = {chinese_words}
    Do not include any word that is not in the list {chinese_words} in the Word column of the ouput table
    """


    if existing_categories:
        chinese_prompt = f"""
            {chinese_prompt}

            Existing Categories: {','.join(existing_categories)}"
            The categories above already exist in the database.  Only add new category if the word does not fit into any of the existing categories.
            """

    return chinese_prompt


In [14]:
from io import StringIO
from datetime import datetime

def parse_table_response(content: str) -> pd.DataFrame:
    '''
    Parse the table response from OpenAI into a pandas DataFrame
    '''
    # Using StringIO to treat the text as a file-like object for pandas
    data = StringIO(content)

    # Read the table into a pandas DataFrame
    df = pd.read_csv(data, delimiter='|',  engine='python')

    # Cleaning the DataFrame by stripping leading/trailing whitespaces from column names and data
    df.columns = df.columns.str.strip()
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df = df.loc[~df.Word.str.contains('--')]

    col_to_keep = [col for col in df if 'Unnamed' not in col]
    df = df[col_to_keep]
    df['Word'] = df.Word.replace('', pd.NA).ffill()
    df['Pinyin'] = df.Pinyin.replace('', pd.NA).ffill()
    df['Type'] = df.Type.replace('', pd.NA).ffill()
    df['Added Date'] = datetime.now().strftime("%Y-%m-%d")

    return df


def save_new_words_to_dict(
        newwords_df : pd.DataFrame, 
        gsheet_mode = False, 
        gsheet_name = None, 
        worksheet_name = None,
        dict_path: str = None, 
        overwrite_mode: bool =False
        ) -> None:
    '''
    Add new words to the Chinese dictionary and save to disk. 
    If overwrite_mode is enabled, then the new words will replace any existing words in the dictionary.  
    Otherwise, only new words will be added to the dictionary.
    '''
    new_words = newwords_df['Word'].drop_duplicates().values

    if gsheet_mode:
        chinese_dict = load_dict(gsheet_mode=gsheet_mode, gsheet_name=gsheet_name, worksheet_name=worksheet_name)
    else:
        chinese_dict = pd.read_csv(dict_path) 
        
    existing_words = chinese_dict['Word'].drop_duplicates().values

    starting_words_len = len(existing_words)
    new_words_len = len(new_words)

    if overwrite_mode:
        chinese_dict = chinese_dict.loc[~chinese_dict.Word.isin(new_words)]
        dedup_words_len = len(chinese_dict['Word'].drop_duplicates().values)
        chinese_dict = pd.concat([chinese_dict, newwords_df])
        
        print(f"Overwrite mode enabled.  Replacing {starting_words_len - dedup_words_len} words and {new_words_len - (starting_words_len - dedup_words_len)} new words added.")

    else: 
        newwords_df = newwords_df.loc[~newwords_df.Word.isin(existing_words)]
        dedup_words_len = len(newwords_df['Word'].drop_duplicates().values)
        chinese_dict = pd.concat([chinese_dict, newwords_df])
        
        print(f"Overwrite mode disabled.  {new_words_len - dedup_words_len} exists in current dictionary, adding {dedup_words_len} words.")

    if gsheet_mode:
        save_df_to_gsheet(gsheet_name, worksheet_name, chinese_dict, overwrite_mode=True)
    else:
        chinese_dict.to_csv(dict_path, index=False)

In [63]:
sample_response_translation = get_completion(prompt=get_prompt_for_chinese_translation("沟通, 正在, 与, 原因, 面临, 竞争, 对手"), temperature=temp)

In [64]:
content = sample_response_translation.choices[0].message.content
print(content)

| Word   | Pinyin      | Type   | Meaning                                                            | Sentence                                       | Sentence Pinyin                                   | Sentence Meaning                                           | Word Category  |
|--------|-------------|--------|--------------------------------------------------------------------|------------------------------------------------|----------------------------------------------------|-----------------------------------------------------------|----------------|
| 太太   | tàitài     | Noun   | Mrs.; a term of respect for a married woman                        | 她是我的太太。                                 | Tā shì wǒ de tàitài.                              | She is my wife.                                          | Social         |
| 关系   | guānxì     | Noun   | Relationship; connection between people or things                   | 他们之间有很好的关系。                       | Tāmen zhījiān yǒu hěn hǎo de 

In [65]:
newwords_df = parse_table_response(content)
new_words = newwords_df['Word'].drop_duplicates().values
newwords_df

Unnamed: 0,Word,Pinyin,Type,Meaning,Sentence,Sentence Pinyin,Sentence Meaning,Word Category,Added Date
1,太太,tàitài,Noun,Mrs.; a term of respect for a married woman,她是我的太太。,Tā shì wǒ de tàitài.,She is my wife.,Social,2024-09-22
2,关系,guānxì,Noun,Relationship; connection between people or things,他们之间有很好的关系。,Tāmen zhījiān yǒu hěn hǎo de guānxì.,There is a good relationship between them.,Relationship,2024-09-22
3,关系,guānxì,Verb,To relate; to concern,这个问题与我们的项目有关系。,Zhège wèntí yǔ wǒmen de xiàngmù yǒu guānxì.,This issue relates to our project.,Relationship,2024-09-22
4,同事,tóngshì,Noun,Colleague; a person who works with you,我的同事非常友好。,Wǒ de tóngshì fēicháng yǒuhǎo.,My colleague is very friendly.,Work,2024-09-22


In [66]:
save_new_words_to_dict(
    newwords_df = newwords_df,
    gsheet_mode= True,
    overwrite_mode = True,
    gsheet_name = gsheet_name,
    worksheet_name = dict_sheet_name)

Overwrite mode enabled.  Replacing 0 words and 3 new words added.


## Mandarin Word Quiz

In [44]:
def get_prompt_generate_word_quiz(
    word_dict: pd.DataFrame,
    startfrom_date_filter: str = None,
    category_filter: str = None
    ) -> str:
        
    if startfrom_date_filter:
        word_dict = word_dict.loc[word_dict['Added Date'] >= startfrom_date_filter]

    if category_filter:
            word_dict = word_dict.loc[word_dict['Word Category'] == category_filter]

    word_list = word_dict['Word'].drop_duplicates().sample(10).values
    prompt = f''' 
    Given this word list:
    {word_list}

    Can you create an Mandarin exercise where you choose 10 non-duplicated words and an example sentence using them.   
    Leaving 2 blank columns where user can input the pinyin and the meaning in English.

    The output should only be a 10x4 table with no other written text.  The table should have the following columns:
    1) Word
    2) Sentence
    3) Pinyin (Leave blank)
    4) Meaning (Leave blank)
    '''

    return prompt

# 4O Maybe Necessary to Evaluate Quiz.  Mini seems to be halluciating quite a bit. 
def get_prompt_evaluate_quiz(
        pinyin, 
        meaning
    ) -> str:
    prompt = f'''
    Here's the answer provided.  Please check them for correctness and mark any incorrect answer and provide the correct one. 

    Please add the answers to column 3 and 4 in the original table.  Add four more columns at the end with the following values:
    Pinyin Correct (Column #5):  Just yes/no response of whether the pinyin is correct
    Pinyin Explanation (Column #6):  Should contain the correct pinyin if the answer is incorrect.  Blank otherwise.
    Meaning Correct (Column #7):  Just yes/no response of whether the meaning is correct
    Meaning Explanation (Column #8):  Short explanation of why the answer is incorrect and provide correct answer.  Blank otherwise.

    The tone for the pinyin will be provided with number 1, 2, 3, 4, 5
    The special character ü⁠ can be replaced by v in the answer

    Pinyin: {pinyin}
    Meaning: {meaning}

    No other response should be given except the table
    '''
    return prompt

In [21]:
word_dict = load_dict(gsheet_mode=True, gsheet_name=gsheet_name, worksheet_name=dict_sheet_name)

In [49]:
quiz_prompt = get_prompt_generate_word_quiz(
    word_dict=word_dict,
    startfrom_date_filter='2024-09-22',
)

sample_response_translation = get_completion(prompt=quiz_prompt, temperature=temp)

content = sample_response_translation.choices[0].message.content
data = StringIO(content)

# Read the table into a pandas DataFrame
df = pd.read_csv(data, delimiter='|',  engine='python')
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
col_to_keep = [col for col in df if 'Unnamed' not in col]
df = df[col_to_keep]
df.columns = df.columns.str.strip()

df = df.loc[~df.Word.str.contains('--')]

Unnamed: 0,Word,Sentence,Pinyin,Meaning
1,糊涂,他在这个问题上很糊涂，不知道该怎么选择。,,
2,决定,她终于决定去旅行，放松一下自己。,,
3,误会,我们之间产生了误会，需要好好沟通。,,
4,并,我们并不需要同时完成所有的任务。,,
5,敌人,在比赛中，敌人是我们必须克服的挑战。,,
6,讨论,我们可以在会议上讨论这个提议。,,
7,认真,他对学习非常认真，总是提前完成作业。,,
8,建议,我建议你多读书，丰富自己的知识。,,
9,其实,其实这件事并没有那么复杂。,,
10,答案,他的答案是正确的，得到了老师的表扬。,,


## Prompt Mentoring

In [None]:
client = OpenAI(api_key="<Insert yours>")

equality_statement = "Interprete this 'Edge_PB_Share_Avg > 0.82', 'EnterpriseMobilityCoreE3Rev > 1907.40', '5921.00 < AADPAllUp_MAU <= 15701.50',"

def interpret_equality_statement(equality_statement):
    # Initialize variables for tracking attempts
    attempt_count = 0
    max_attempts = 5
    highest_score = 0
    feedback_dict = {}
 
    while attempt_count < max_attempts:
        # MLS Simulation: Generate interpretation
        interpretation = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": """You are a Machine Learning Scientist who converts equality statements into proper English.
                        An equality statement has the pattern "<measure> <equality> <value>" or "<value> <equality> <measure> <equality> <value>".

                        <measure> has a mapping to proper English:
                        Edge_PB_Share_Avg : Edge Primary Browser Share
                        SPO_MAU : SharePoint Online MAU
                        EnterpriseMobilityCoreE3Rev : EMS E3 Revenue
                        AADPAllUp_MAU : Entra ID MAU

                        if <value> is less than 1, it is a percentage.
                        if <value> is greater than 999, it must have comma(s).
                        if <value> is greater than 1, it must be rounded to the nearest whole number.
                        if <measure> has MAU, round <value> to the nearest hundred.

                        You always start an English sentence with the phrase "We recommend this product for this tenant because..."

                    """
                },
                {
                    "role": "user",
                    "content": equality_statement
                }
            ]
        )
        generated_interpretation = interpretation.choices[0].message.content

        # Mentor Simulation: Generate mentor feedback
        mentor_feedback = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": """You are a senior leader in an organization who evaluates the output of the Machine Learning Scientist. You will give them a score of their work from 1 to 10, and provide reasons and comments.

                        Evaluation criteria:
                        1. The equality statement should have the pattern "<measure> <equality> <value>" or "<value> <equality> <measure> <equality> <value>".
                        2. The statement needs to be mapped to proper English:
                            - Edge_PB_Share_Avg: Edge Primary Browser Share
                            - SPO_MAU: SharePoint Online MAU
                            - EnterpriseMobilityCoreE3Rev: EMS E3 Revenue
                            - AADPAllUp_MAU: Entra ID MAU
                        3. If <value> is less than 1, it's a percentage.
                        4. If <value> is greater than 999, it should have commas.
                        5. If <value> is greater than 1, round to the nearest whole number.
                        6. If the <measure> has MAU, round the <value> to the nearest hundred.
                        7. The English sentence should always start with the phrase "We recommend this product for this tenant because...".
                        8. The response should be in professional English.
                        9. Your mentee should not say the result is rounded or anything related to how they finish the process in the response, they should stick with the precise interpretation

                        # Good Examples:
                        Sharepoint Online usage is above 80%
                        Azure Revenue is more than $1000
                        the Entra ID MAU is between 5,900 and 15,700

                        # Bad Examples:
                        the Entrta ID MAU is between 5921 and 15701, rounded to the nearest hundred

                     Sample output: Score: 8.5; Comments: This is good. You missed the SPO_MAU : SharePoint Online MAU converter
                    This is just a sample output, you don't have to follow 100%, but think and give your feedback, if there is no mistake, you can give a 10 as well!
                    But you need to strictly follow the sample output format with Score:<your score, one number or float>; Comments:< Your comments>  
                    """
                },
                {
                    "role": "user",
                    "content": f"Evaluate this work and provide a score and comment.\nOriginal equality statement: {equality_statement}\nWork: {generated_interpretation}"
                }
            ]
        )

        # Parse mentor's feedback
        mentor_response = mentor_feedback.choices[0].message.content
        score = float(mentor_response.split('Score: ')[1].split(';')[0])
        comment = mentor_response.split('Comments: ')[1]
 
        # Check if the score is acceptable
        if score >= 8.5:
            return f"Interpretation Approved: {generated_interpretation}"
 
        # If score is below 8.5, store the feedback
        feedback_dict[attempt_count] = {'score': score, 'comment': comment}
        highest_score = max(highest_score, score)

        # Output feedback and ask user to revise with mentor's comment attached
        print(f"Attempt {attempt_count + 1}: Score: {score}, Feedback: {comment}")
        equality_statement = input(f"Please revise your equality statement based on the mentor's feedback:\n{comment}\nYour revised statement: ")

        attempt_count += 1
 
    # If the loop ends without a passing score, output the highest score
    return f"Max attempts reached. Highest score: {highest_score}, Feedback from last attempt: {feedback_dict[attempt_count-1]['comment']}"


# Example usage
result = interpret_equality_statement("Interprete this 'Edge_PB_Share_Avg > 0.82', 'EnterpriseMobilityCoreE3Rev > 1907.40', '5921.00 < AADPAllUp_MAU <= 15701.50' to english")
print(result)

             