In [1]:
from openai import OpenAI
import pandas as pd
import time
import random 

In [25]:
## Prompt definitions (conversation prompt first, evaluation prompt after it).
## CONVERSATION_PROMPT is sent as the system message that makes the model play the
## English teacher. Its wording was slightly changed from the original conversation
## prompt for testing purposes.
CONVERSATION_PROMPT = """
                This written conversation is between you, a skilled English teacher, and a user eager to learn English. Your role is to facilitate a dynamic and educational dialogue, focusing on improving the user's English proficiency. Make sure to keep each interaction very short, and ask only one question in each of your response.

                Start the conversation by asking user's name, self-identified English proficiency level, with choices of "Beginner", "Elementary", "Intermediate", "Advanced", "Fluent", "Proficient" and "Expert", and the topics they are interested to talk about. Keep asking the same question until you get the answer as it's needed for the following conversations in a polite way. Be sure to ask these information one by one.

                Tailor your responses and the complexity of the conversation to match their proficieny level. Introduce new topics or aspects of the English language in each interaction, ensuring they are appropriate for the user's proficiency.

                For beginners and elementary learners, use very simple vocabulary, basic sentence structures, and avoid idioms or complex grammar. As the proficiency level increases, progressively incorporate more advanced vocabulary, idiomatic expressions, and complex grammatical structures.

                If the user don't know what to talk about, start the conversation with a clear, achievable learning objective. For example, starting a random topic to discuss about, engaging them in practicing fundamental English scenarios like greetings, or focus on applying a specific grammatical structure or tense.

                Try to structure the conversation so that the user speaks more than the teacher. Ask open-ended questions that require more than yes/no answers. Don't correct any errors in user's response. If possible, include simple images or emojis to support the learning process and make it more engaging. Additionally, please include interactive elements such as role-playing scenarios, describing pictures, or reacting to simple short stories or situations.

                Additionally, keep track of the topics covered and the user's progress, so you can gradually increase the complexity of conversations and introduce new topics based on what has already been learned.
                """

# System prompt for the evaluator call: given a conversation transcript, the model is
# asked to assign a new proficiency level and to score each listed criterion out of 100,
# citing examples from the student's replies.
EVALUATION_PROMPT = """
            Please review the attached conversation history focusing solely on the student's responses to the AI English tutor. The student's proficiency level ranges from 'Beginner' to 'Expert.' Based on their interactions, I seek a detailed qualitative assessment of the student's English skills.

            First assign a new English proficieny level for the student.

            Then, in your analysis, please consider the following aspects, please provide some specific examples from the conversation to support your observations and suggestions and give each individual criteria a score out of 100.

            Vocabulary Usage: Assess the range and appropriateness of vocabulary used by the student. Note any recurring errors or misused words.

            Grammar Syntax and Spelling: Evaluate the student's grasp of English grammar, spelling and sentence structure. Highlight both strengths and areas needing improvement.

            Reading Comprehension: Gauge the student's ability to understand and respond accurately to the conversation prompts.

            Writing Skills: Assess the student's ability to express ideas clearly and coherently.

            Potential Areas for Improvement: Assess overall English skills and suggest areas for future focus.
            """

# Prompts added specifically for testing: they make the model play the *student*.
# Each template has two str.format placeholders:
#   {proficiency} - the simulated learner's level (e.g. "Beginner")
#   {question}    - the teacher message the simulated student must answer

# Variant 1: the student answers with grammatically correct sentences only.
ANSWER_QUESTION_PROMPT_CORRECT = """
    Please answer the following question concisely as an English learner at the {proficiency} level. Limit in one very short paragraph and make sure all the sentences are gramatically correct:
    {question}
    """

# Variant 2: the student may (but is not required to) include 1 or 2 grammar mistakes.
ANSWER_QUESTION_PROMPT_INCORRECT = """
    Please answer the following question concisely as an English learner at the {proficiency} level, you could choose to answer with correct grammar, or introduce 1 or 2 grammar mistakes in your response. Limit your response in a very short paragraph:
    {question}
    """

# Variant 3: the student misunderstands the question, producing an irrelevant answer.
ANSWER_QUESTION_PROMPT_IRRELEVANT = """
    Please answer the following question concisely as an English learner at the {proficiency} level, Limit in one very short paragraph and act as an English learner who misunderstands the question:
    {question}
    """

In [26]:
# Shared client used by every request in this notebook. No credentials are passed
# here, so the SDK has to obtain them from its own configuration.
client = OpenAI()


def send_openai_request(messages, max_tokens=400, model="gpt-4", temperature=1):
    """Send one chat-completion request and return the text of the first choice.

    Args:
        messages: List of chat messages, each a dict with "role" and "content" keys.
        max_tokens: Upper bound on the number of tokens generated for the reply.
        model: Name of the chat model to query. Defaults to "gpt-4", the value
            that was previously hard-coded.
        temperature: Sampling temperature. Defaults to 1, the value that was
            previously hard-coded.

    Returns:
        The ``content`` of the first choice's message.
    """
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        # The remaining sampling parameters are kept fixed for every call.
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content

In [13]:
# Simulate 3 conversations for each proficiency level.
# Ideally: 1 with all grammatically correct responses, 1 with mostly grammatically
# incorrect responses, and 1 with a mix.
levels = ["Beginner", "Elementary", "Intermediate", "Advanced", "Fluent", "Proficient", "Expert"]
conversations = []

# Pass 1: all grammatically correct conversations.
# The teacher model opens each conversation (only the system prompt is present at first).
for level in levels:
    conversation = [{"role": "system", "content": CONVERSATION_PROMPT}]
    conversation_history = "Below is the conversation history: \n\n"
    for _ in range(8):
        # Teacher turn: generated from the running chat history.
        message = send_openai_request(conversation, max_tokens=200)
        # Student turn: a separate, stateless request that only sees the teacher's last message.
        student_prompt = ANSWER_QUESTION_PROMPT_CORRECT.format(proficiency=level, question=message)
        response = send_openai_request([{"role": "user", "content": student_prompt}], max_tokens=150)
        # The teacher's own reply is recorded as an "assistant" message so that the
        # only "system" message in the history is the instruction prompt.
        conversation.append({"role": "assistant", "content": message})
        conversation.append({"role": "user", "content": response})
        conversation_history += "Teacher: " + message + "\n\n"
        conversation_history += "Student: " + response + "\n\n"
    conversations.append(conversation_history)
    # Pause between levels to space out the API requests.
    time.sleep(30)

In [16]:
# Pass 2: conversations in which the simulated student may make grammar mistakes.
for level in levels:
    conversation = [{"role": "system", "content": CONVERSATION_PROMPT}]
    conversation_history = "Below is the conversation history: \n\n"
    for _ in range(8):
        # Teacher turn: generated from the running chat history.
        message = send_openai_request(conversation, max_tokens=200)
        # Student turn: a separate, stateless request that only sees the teacher's last message.
        student_prompt = ANSWER_QUESTION_PROMPT_INCORRECT.format(proficiency=level, question=message)
        response = send_openai_request([{"role": "user", "content": student_prompt}], max_tokens=150)
        # The teacher's own reply is recorded as an "assistant" message so that the
        # only "system" message in the history is the instruction prompt.
        conversation.append({"role": "assistant", "content": message})
        conversation.append({"role": "user", "content": response})
        conversation_history += "Teacher: " + message + "\n\n"
        conversation_history += "Student: " + response + "\n\n"
    conversations.append(conversation_history)
    # Pause between levels to space out the API requests.
    time.sleep(30)

In [20]:
# Pass 3: a mix of grammatically incorrect, correct and irrelevant student responses.
# One of the three answer templates is drawn uniformly at random for every turn.
answer_templates = [
    ANSWER_QUESTION_PROMPT_INCORRECT,
    ANSWER_QUESTION_PROMPT_CORRECT,
    ANSWER_QUESTION_PROMPT_IRRELEVANT,
]
for level in levels:
    conversation = [{"role": "system", "content": CONVERSATION_PROMPT}]
    conversation_history = "Below is the conversation history: \n\n"
    for _ in range(8):
        # Teacher turn: generated from the running chat history.
        message = send_openai_request(conversation, max_tokens=200)
        # Student turn: a separate, stateless request using the randomly chosen template.
        template = random.choice(answer_templates)
        student_prompt = template.format(proficiency=level, question=message)
        response = send_openai_request([{"role": "user", "content": student_prompt}], max_tokens=150)
        # The teacher's own reply is recorded as an "assistant" message so that the
        # only "system" message in the history is the instruction prompt.
        conversation.append({"role": "assistant", "content": message})
        conversation.append({"role": "user", "content": response})
        conversation_history += "Teacher: " + message + "\n\n"
        conversation_history += "Student: " + response + "\n\n"
    conversations.append(conversation_history)
    # Pause between levels to space out the API requests.
    time.sleep(30)

In [26]:
# Ask the evaluator for feedback on every simulated conversation transcript.
feedback_list = []
for history in conversations:
    evaluation_messages = [
        {"role": "system", "content": EVALUATION_PROMPT},
        {"role": "user", "content": history},
    ]
    feedback_list.append(send_openai_request(messages=evaluation_messages, max_tokens=700))
    # Wait a minute between evaluation requests.
    time.sleep(60)

# Persist transcripts and feedback side by side in a CSV file.
# NOTE(review): a later cell reads Feedback.csv back with additional annotation
# columns; re-running this cell overwrites that file.
df = pd.DataFrame(data={"Conversation": conversations, "Feedback": feedback_list})
df.to_csv('Feedback.csv', index=False)
df

Unnamed: 0,Conversation,Feedback
0,Below is the conversation history: \n\nTeacher...,New English Proficiency Level for the Student:...
1,Below is the conversation history: \n\nTeacher...,English Proficiency Level: Intermediate \n\nVo...
2,Below is the conversation history: \n\nTeacher...,English Proficiency Level: Intermediate\n\nVoc...
3,Below is the conversation history: \n\nTeacher...,New Proficiency Level for the Student: Expert\...
4,Below is the conversation history: \n\nTeacher...,New Proficiency Level: Expert \n\nVocabulary U...
5,Below is the conversation history: \n\nTeacher...,English Proficiency Level: Expert\n\nVocabular...
6,Below is the conversation history: \n\nTeacher...,New English Proficiency Level: Expert\n\nThe s...
7,Below is the conversation history: \n\nTeacher...,"Based on the conversation history, the student..."
8,Below is the conversation history: \n\nTeacher...,"Based on the conversation history, I assign th..."
9,Below is the conversation history: \n\nTeacher...,English Proficiency Level: Advanced\n\nVocabul...


In [43]:
# Human evaluation of the generated feedback.
# Considering that students receive grammar corrections throughout the conversation,
# the feedback is evaluated on two things: whether it identifies the errors and
# irrelevant answers that exist in the conversation (if any), and how accurate its advice is.
# The scoring rubric for advice accuracy is recorded in the string below
# (a bare string expression used as a block comment; it has no runtime effect).
'''
Score 0: Mostly irrelevant or frequently provides incorrect advice (more than 4 instances).
Score 1: Partially irrelevant or includes some incorrect pieces of advice (3-4 instances).
Score 2: Mostly relevant or includes minor inaccuracies (1-2).
Score 3: Highly relevant and accurate, consistently offers correct advice. (0 mistakes)
'''
# Reload the feedback file. The metrics cell expects it to contain the annotation columns
# ('Grammar Mistake in Conversation', 'Irrelevance in Conversation', 'Grammar Mistake Detected',
# 'Irrelevance Detected', 'Evaluation Score (out of 3)') - presumably added by hand
# after the export; verify before re-running.
feedback_df = pd.read_csv('Feedback.csv')
feedback_df

Unnamed: 0,Conversation,Feedback,Grammar Mistake in Conversation,Irrelevance in Conversation,Grammar Mistake Detected,Irrelevance Detected,Evaluation Score (out of 3)
0,Below is the conversation history: \n\nTeacher...,New English Proficiency Level for the Student:...,0,0,0,0,3
1,Below is the conversation history: \n\nTeacher...,English Proficiency Level: Intermediate \n\nVo...,0,0,0,0,3
2,Below is the conversation history: \n\nTeacher...,English Proficiency Level: Intermediate\n\nVoc...,0,1,0,1,3
3,Below is the conversation history: \n\nTeacher...,New English proficiency level for the student:...,0,0,1,0,2
4,Below is the conversation history: \n\nTeacher...,New English proficiency level for the student:...,1,0,1,0,3
5,Below is the conversation history: \n\nTeacher...,English Proficiency Level: Expert\n\nVocabular...,0,0,0,0,3
6,Below is the conversation history: \n\nTeacher...,New English Proficiency Level: Expert\n\nThe s...,0,0,1,0,2
7,Below is the conversation history: \n\nTeacher...,"Based on the conversation history, the student...",1,0,1,0,3
8,Below is the conversation history: \n\nTeacher...,"Based on the conversation history, I assign th...",1,0,1,0,1
9,Below is the conversation history: \n\nTeacher...,English Proficiency Level: Advanced\n\nVocabul...,1,0,1,0,2


In [49]:
def _confusion_counts(frame, truth_col, detected_col):
    """Return (TP, TN, FP, FN) counts for two 0/1-coded columns of ``frame``.

    ``truth_col`` holds the annotated ground truth and ``detected_col`` holds
    whether the feedback flagged the issue. Rows whose values are neither 0 nor 1
    are not counted in any cell.
    """
    truth = frame[truth_col]
    detected = frame[detected_col]
    true_pos = ((truth == 1) & (detected == 1)).sum()
    true_neg = ((truth == 0) & (detected == 0)).sum()
    false_pos = ((truth == 0) & (detected == 1)).sum()
    false_neg = ((truth == 1) & (detected == 0)).sum()
    return true_pos, true_neg, false_pos, false_neg


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Calculating metrics for Grammar Mistakes
TP_grammar, TN_grammar, FP_grammar, FN_grammar = _confusion_counts(
    feedback_df, 'Grammar Mistake in Conversation', 'Grammar Mistake Detected'
)

accuracy_grammar = _safe_ratio(TP_grammar + TN_grammar, len(feedback_df))
recall_grammar = _safe_ratio(TP_grammar, TP_grammar + FN_grammar)
precision_grammar = _safe_ratio(TP_grammar, TP_grammar + FP_grammar)

print(f"Feedback accuracy in detecting grammar mistakes: {accuracy_grammar}")
print(f"Feedback recall in detecting grammar mistakes: {recall_grammar}")
print(f"Feedback precision in detecting grammar mistakes: {precision_grammar}")

# Calculating metrics for Reading Comprehension (relevancy mistakes)
TP_relevancy, TN_relevancy, FP_relevancy, FN_relevancy = _confusion_counts(
    feedback_df, 'Irrelevance in Conversation', 'Irrelevance Detected'
)

accuracy_relevancy = _safe_ratio(TP_relevancy + TN_relevancy, len(feedback_df))
recall_relevancy = _safe_ratio(TP_relevancy, TP_relevancy + FN_relevancy)
precision_relevancy = _safe_ratio(TP_relevancy, TP_relevancy + FP_relevancy)

print(f"Feedback accuracy in detecting response irrelevancy: {accuracy_relevancy}")
print(f"Feedback recall in detecting response irrelevancy: {recall_relevancy}")
print(f"Feedback precision in detecting response irrelevancy: {precision_relevancy}")

# Average Evaluation Score for advice accuracy
average_evaluation_score = feedback_df['Evaluation Score (out of 3)'].mean()
print(f"Overall evaluation score for advice accuracy out of 3: {average_evaluation_score}")
# Same score rescaled to the 0-1 range so it is comparable with the ratios above.
print(f"Overall evaluation score out of 1: {average_evaluation_score/3}")

Feedback accuracy in detecting grammar mistakes: 0.8095238095238095
Feedback recall in detecting grammar mistakes: 0.9
Feedback precision in detecting grammar mistakes: 0.75
Feedback accuracy in detecting response irrelevancy: 0.8095238095238095
Feedback recall in detecting response irrelevancy: 0.625
Feedback precision in detecting response irrelevancy: 0.8333333333333334
Overall evaluation score for advice accuracy out of 3: 2.4285714285714284
Overall evaluation score out of 1: 0.8095238095238094
