In [None]:
!pip install llama-index llama-index-embeddings-huggingface openai

In [2]:
# Import necessary dependencies.
import os
import zipfile
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    VectorStoreIndex,
    TreeIndex,
    KnowledgeGraphIndex,
    load_index_from_storage,
    StorageContext,
    PromptTemplate
)
from llama_index.core.graph_stores import SimpleGraphStore
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.llms.openai import OpenAI as OpenAI_LlamaIndex
from llama_index.core.prompts.prompt_type import PromptType
import pandas as pd
from IPython.display import Markdown, display
from openai import OpenAI
from enum import Enum
# from sentence_transformers import CrossEncoder
import numpy as np

In [3]:
# Show every column and the full content of each cell when a DataFrame is displayed.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Setup

In [4]:
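# Define OPENAI_API_KEY here, e.g. OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] (do not hardcode the key in the notebook).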

In [None]:
# Register the LLM and the embedding model on LlamaIndex's global Settings object.
Settings.llm = llm = OpenAI_LlamaIndex(model="gpt-4-0125-preview")
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [6]:
# Define root_path (directory that holds processed_subtitles.zip) and documents_path (directory of the documents to index).

In [7]:
# Unpack the subtitle archive into root_path, but only while documents_path does not exist yet
# (presumably the archive creates documents_path -- confirm against the zip layout).
documents_already_extracted = os.path.exists(documents_path)
if not documents_already_extracted:
    with zipfile.ZipFile(f'{root_path}/processed_subtitles.zip', mode='r') as subtitles_archive:
        subtitles_archive.extractall(path=root_path)

In [None]:
# Count the entries directly inside documents_path (os.listdir does not recurse into subfolders).
files_in_directory = os.listdir(documents_path)
number_of_files = len(files_in_directory)
number_of_files

In [None]:
# Read the contents of documents_path into LlamaIndex documents and show how many were produced.
documents = SimpleDirectoryReader(documents_path).load_data()
len(documents)

In [None]:
%%time

# Build the knowledge-graph index on the first run; on later runs reload it from disk.
GRAPH_INDEX_PERSIST_DIR = f'{root_path}/graph_index_storage'

# Strip all metadata from the documents before they are indexed.
for document in documents:
    document.metadata = {}

graph_store = SimpleGraphStore()
graph_index_storage_context = StorageContext.from_defaults(graph_store=graph_store)

if os.path.exists(GRAPH_INDEX_PERSIST_DIR):
    # A persisted index exists: replace the fresh storage context with one backed by it.
    print('Loading graph index...')
    graph_index_storage_context = StorageContext.from_defaults(persist_dir=GRAPH_INDEX_PERSIST_DIR)
    graph_index = load_index_from_storage(graph_index_storage_context)
else:
    print('Creating graph index...')
    graph_index = KnowledgeGraphIndex.from_documents(
        documents,
        storage_context=graph_index_storage_context,
        include_embeddings=True,
    )
    # Persist right away so the next run takes the loading branch.
    graph_index.storage_context.persist(persist_dir=GRAPH_INDEX_PERSIST_DIR)

In [11]:
# Create retriever.
# retriever_mode='hybrid' is forwarded to the index's retriever (presumably it combines
# keyword and embedding lookup -- confirm against the LlamaIndex documentation).
graph_retriever = graph_index.as_retriever(retriever_mode='hybrid')

# Query

In [12]:
# Client for direct OpenAI chat-completion calls (used by generate_response).
# OPENAI_API_KEY must already have been defined in the setup section.
client = OpenAI(api_key=OPENAI_API_KEY)

In [13]:
def generate_response(prompt, model="gpt-4-turbo"):
    """
    Send a single-turn prompt to the OpenAI chat API and return the reply text.

    Parameters:
    - prompt: The complete prompt text; it is sent as one "user" message.
    - model: Name of the chat model to query. Defaults to "gpt-4-turbo", the model
      this notebook previously hard-coded, so existing calls behave the same.

    Returns:
    - The message content of the first choice in the completion.
    """
    conversation = [
        {
            "role": "user",
            "content": prompt,
        }
    ]
    # `client` is the module-level OpenAI client created in the "Query" section.
    chat_completion = client.chat.completions.create(
        messages=conversation,
        model=model,
    )
    return chat_completion.choices[0].message.content

In [14]:
# Define prompts.
# Three variants of the same tutor prompt. `{query_str}` is replaced by the student's question;
# `{context_str}` (low/high guidance only) is replaced by the retrieved context.

# No guidance: the prompt contains no retrieved context at all.
template_no_guidance = (
    "You are going to act as a mathematics tutor for a 13 year old student who is in grade 8 or 9 and lives in the United States.\n"
    "You will be encouraging and factual.\n"
    "Prefer simple, short responses.\n"
    "If the student says something inappropriate or off topic you will say you can only focus on mathematics and ask them if they have any math-related follow-up questions.\n"
    "Student: {query_str}\n"
    "You:"
)

# Low guidance: the context is offered as optional help ("Only if it is relevant ...").
template_low_guidance = (
    "You are going to act as a mathematics tutor for a 13 year old student who is in grade 8 or 9 and lives in the United States.\n"
    "You will be encouraging and factual.\n"
    "Only if it is relevant, examples and language from the section below may be helpful to format your response:\n"
    "===\n"
    "{context_str}\n"
    "===\n"
    "Prefer simple, short responses.\n"
    "If the student says something inappropriate or off topic you will say you can only focus on mathematics and ask them if they have any math-related follow-up questions.\n"
    "Student: {query_str}\n"
    "You:"
)

# High guidance: the model is told outright to use the context when forming its response.
template_high_guidance = (
    "You are going to act as a mathematics tutor for a 13 year old student who is in grade 8 or 9 and lives in the United States.\n"
    "You will be encouraging and factual.\n"
    "Use examples and language from the section below to format your response:\n"
    "===\n"
    "{context_str}\n"
    "===\n"
    "Prefer simple, short responses.\n"
    "If the student says something inappropriate or off topic you will say you can only focus on mathematics and ask them if they have any math-related follow-up questions.\n"
    "Student: {query_str}\n"
    "You:"
)

In [15]:
# Wrap each raw template string in a LlamaIndex PromptTemplate so it can be filled via .format().
qa_template_no_guidance, qa_template_low_guidance, qa_template_high_guidance = (
    PromptTemplate(template_text)
    for template_text in (
        template_no_guidance,
        template_low_guidance,
        template_high_guidance,
    )
)

In [16]:
class GuidanceLevel(Enum):
    """How strongly the tutor prompt steers the model toward the retrieved context."""
    # Prompt built from the no-guidance template (no context in the prompt).
    NO_GUIDANCE = "no_guidance"
    # Prompt built from the low-guidance template (context offered as optional help).
    LOW_GUIDANCE = "low_guidance"
    # Prompt built from the high-guidance template (context must be used).
    HIGH_GUIDANCE = "high_guidance"

In [17]:
def generate_context(nodes, n=3):
    """
    Build the context string from the first `n` retrieved nodes.

    Parameters:
    - nodes: Sequence of retrieved nodes; each one must expose a `text` attribute.
    - n: Maximum number of leading nodes to include (default 3).

    Returns:
    - The selected node texts joined by newlines, with surrounding whitespace stripped.
    """
    # Slicing already copes with sequences shorter than n, so no explicit length check is needed.
    leading_texts = [retrieved.text for retrieved in nodes[:n]]
    return "\n".join(leading_texts).strip()

In [18]:
def generate_response_with_guidance_level(question, retriever):
    """
    Answer `question` once per guidance level, using the same retrieved context each time.

    Parameters:
    - question: The student's question text.
    - retriever: Object whose `retrieve(question)` returns the nodes used as context.

    Returns:
    - A 6-tuple (response_no_guidance, context_no_guidance, response_low_guidance,
      context_low_guidance, response_high_guidance, context_high_guidance). The context
      entry is the same retrieved string for all three levels, including no-guidance,
      whose prompt does not contain it.
    """
    context_str = generate_context(retriever.retrieve(question))

    # (template, format arguments) per level, in the order the responses are requested:
    # no guidance, low guidance, high guidance.
    prompt_plan = (
        (qa_template_no_guidance, {"query_str": question}),
        (qa_template_low_guidance, {"context_str": context_str, "query_str": question}),
        (qa_template_high_guidance, {"context_str": context_str, "query_str": question}),
    )

    outputs = []
    for template, format_arguments in prompt_plan:
        prompt = template.format(**format_arguments)
        outputs.extend((generate_response(prompt), context_str))

    return tuple(outputs)

# Data

In [None]:
# Define data_path_posts
# Posts table. Later cells read the columns 'id', 'useraccount_id', 'subject_id',
# 'is_parent_post', 'reply_to_post_id', 'ts_created' and 'comment_text'.
df_posts = pd.read_csv(data_path_posts, low_memory=False)
df_posts.head()

In [None]:
df_posts.shape

In [None]:
# Define data_path_students
# User accounts table. Later cells read the columns 'id.1' (joined to the posts'
# 'useraccount_id') and 'permissions' (bitmask used to derive 'is_student').
df_students = pd.read_csv(data_path_students, low_memory=False)
df_students.head()

In [None]:
df_students.shape

In [23]:
# An account is flagged as a student when bit 20 of its permissions bitmask is set.
STUDENT_PERMISSION_FLAG = 1 << 20  # == 1048576
df_students['is_student'] = (df_students['permissions'] & STUDENT_PERMISSION_FLAG).gt(0)

In [None]:
df_students.head()

In [None]:
# Attach each author's is_student flag to their posts. The inner join drops posts whose
# useraccount_id has no match in df_students; the join key 'id.1' is removed afterwards.
df_posts_is_student = (
    df_posts
    .merge(
        df_students[['id.1', 'is_student']],
        left_on='useraccount_id',
        right_on='id.1',
        how='inner',
    )
    .drop(columns=['id.1'])
)
df_posts_is_student.head()

In [None]:
df_posts_is_student.shape

In [None]:
# Keep only parent posts (is_parent_post == 1) that were written by students.
df_parent_posts_students = df_posts_is_student.loc[
    lambda posts: (posts['is_parent_post'] == 1) & (posts['is_student'] == True)
]
df_parent_posts_students.head()

In [None]:
df_parent_posts_students.shape

In [None]:
# Load algebra related subject_ids.
# Define data_path_algebra_subject_ids
# Only the 'id' and 'name' columns of this table are used (in the join with the students' parent posts).
df_algebra_subject_ids = pd.read_csv(data_path_algebra_subject_ids, low_memory=False)
df_algebra_subject_ids

In [30]:
# Restrict the students' parent posts to algebra subjects via an inner join on subject_id.
# The right-hand key 'id' clashes with the posts' own 'id', so it comes back as
# 'id_algebra_subject'; it merely duplicates subject_id and is dropped straight away.
joined_df = (
    df_parent_posts_students
    .merge(
        df_algebra_subject_ids[['id', 'name']],
        left_on='subject_id',
        right_on='id',
        how='inner',
        suffixes=('', '_algebra_subject'),
    )
    .drop(columns=['id_algebra_subject'])
)

In [None]:
joined_df.head()

In [None]:
joined_df.shape

In [33]:
def find_replies_by_id(df, id):
    """
    Find all rows in the DataFrame whose 'reply_to_post_id' is numerically equal to the input id,
    and return them ordered by 'ts_created' in ascending order. The row with the matching 'id' is
    placed as the first row of the returned DataFrame.

    'reply_to_post_id' values that are missing or cannot be parsed as a number never match, and
    fractional values are not truncated (5.7 does not match id 5).

    Parameters:
    - df: The input DataFrame; must contain the columns 'id', 'reply_to_post_id' and 'ts_created'.
    - id: The integer id to match in 'reply_to_post_id'.

    Returns:
    - A DataFrame containing the matching rows, ordered by 'ts_created', with the matching 'id' row first.
      When the original post is present the index is reset; otherwise the replies keep their
      original index labels.
    """
    # Find the original post row and set aside
    original_post_df = df[df['id'] == id]

    # Compare numerically. Unparseable or missing values become NaN, and NaN never equals any id,
    # so no sentinel value (which could collide with a real id such as -1) is needed.
    reply_targets = pd.to_numeric(df['reply_to_post_id'], errors='coerce')

    # Order the replies by 'ts_created' in ascending order
    ordered_replies_df = df[reply_targets == id].sort_values(by='ts_created', ascending=True)

    # Put the original post row at the top if it exists
    if original_post_df.empty:
        return ordered_replies_df
    return pd.concat([original_post_df, ordered_replies_df], ignore_index=True)

In [34]:
def generate_teacher_answer_from_replies(result_df):
    """
    Build the teacher's answer for a thread by joining all non-student comments.

    Parameters:
    - result_df: Thread DataFrame (as returned by find_replies_by_id) with the columns
      'is_parent_post', 'is_student' and 'comment_text'.

    Returns:
    - The non-student comments joined by newlines and stripped, or np.nan when the thread has
      no parent post, a parent post is not from a student, or there is no non-student comment
      with text.
    """
    # The thread only counts when its parent post was written by a student. A thread without a
    # parent row (e.g. the post id was not found) yields NaN instead of raising.
    parent_is_student = result_df.loc[result_df['is_parent_post'] == 1, 'is_student']
    if parent_is_student.empty or not parent_is_student.eq(True).all():
        return np.nan

    # Comments from non-students; missing texts are skipped so they do not end up as the
    # literal string "nan" in the answer.
    teacher_comments = result_df.loc[result_df['is_student'] == False, 'comment_text'].dropna()

    # If there is no teacher reply with text, return NaN
    if teacher_comments.empty:
        return np.nan

    # Concatenate the teacher comments, separated by newlines
    return "\n".join(str(text) for text in teacher_comments).strip()

In [None]:
# Define data_path_sampled_5000
# Sampled questions. The main loop reads 'id' (post id) and 'comment_text' (question text) from it.
df_sampled_5000 = pd.read_csv(data_path_sampled_5000, low_memory=False)
df_sampled_5000.head()

In [None]:
# Despite the name, no rows are filtered here: this only gives the sample a fresh 0..n-1 index.
filtered_df = df_sampled_5000.reset_index(drop=True)
filtered_df.head()

In [None]:
filtered_df.shape

In [38]:
# Main loop: for every sampled question, collect the teacher's answer and the graph-retrieval
# responses for the three guidance levels into one record.

# Column names for the 6-tuple returned by generate_response_with_guidance_level, in tuple order.
GRAPH_RESULT_COLUMNS = (
    'response_graph_no_guidance',
    'context_graph_no_guidance',
    'response_graph_low_guidance',
    'context_graph_low_guidance',
    'response_graph_high_guidance',
    'context_graph_high_guidance',
)

data = []

for _, row in filtered_df.iterrows():
    post_id = row['id']
    question = row['comment_text']

    thread_df = find_replies_by_id(df_posts_is_student, post_id)
    record = {
        'id': post_id,
        'question': question,
        'teacher_answer': generate_teacher_answer_from_replies(thread_df),
    }

    # Graph.
    graph_outputs = generate_response_with_guidance_level(question, graph_retriever)
    record.update(zip(GRAPH_RESULT_COLUMNS, graph_outputs))

    data.append(record)

# Convert the data list to a pandas DataFrame
final_df = pd.DataFrame(data)

# Write the DataFrame to a CSV file
# Save final_df