In [2]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain_openai import OpenAI
import pandas as pd
import json, re, os, openai, boto3

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [66]:
# Cut-off compared against the scores returned by the FAISS similarity search
# in get_simplified_explanation.
SIMILARITY_THRESH = 0.3
# System message sent with every chat completion request.
INTRODUCTION = "You explain complex terms regarding Individualized Education Plans in extremely basic language to help low literacy parents fully understand their meaning."
# Style rules appended to the end of every user prompt.
INSTRUCTIONS = "Please keep your language simplistic and basic. Make sure the answer is accurate and well structured. Do not use any complex words above 5th grade reading level."
# User-message template with two placeholders: {term} and {relevant_data}.
# The question, the context data and the trailing instructions are kept on
# separate lines so the instructions are not glued onto the end of the
# retrieved context.
TEMPLATE = (
    "Please answer this question: {term}\n"
    "To assist in your response, here is some relevant context data:\n"
    "{relevant_data}\n\n"
) + INSTRUCTIONS
# PDF file name -> public URL of the same document (used to build source links).
SRC_URL_DICT = {
    'state.pdf': 'https://seisprodtableswest.blob.core.windows.net/trainingmanual-storage/2416f43c-e7b7-4e28-a796-6b9789828c18.pdf',
    'riverside.pdf': 'https://rcselpa.org/uploads/files/files/IEP%20Manual%2007-22.pdf',
    'sfusd.pdf': 'https://drive.google.com/file/d/1au51uKAY-6t1Sabx47W3eCJkBBDn10ns/view',
}
# English section title -> single-letter section id.
SECTION_KEY_DICT = {
    'Section A: Information/Eligibility': 'A',
    'Section B: Present Levels of Academic Achievement and Functional Performance': 'B',
    'Section C: Special Factors': 'C',
    'Section D: Statewide Assessments': 'D',
    'Section E: Annual Goals and Objectives': 'E',
    'Section F: Offer of FAPE - Service': 'F',
    'Section G: Offer of FAPE - Educational Setting': 'G',
    'Section H: Emergency Circumstances Program': 'H',
    'Section I: Signature and Parent Consent': 'I',
    'Section J: IEP Team Meeting Notes': 'J',
    'Section K: Assessment Plan': 'K',
    'Section L: IEP at a Glance': 'L',
}
# Single-letter section id -> Spanish section title.
SECTION_KEY_DICT_REVERSE_ES = {
    'A': 'Sección A: Información/Elegibilidad',
    'B': 'Sección B: Niveles actuales de logro académico y desempeño funcional',
    'C': 'Sección C: Factores especiales',
    'D': 'Sección D: Evaluaciones estatales',
    'E': 'Sección E: Metas y objetivos anuales',
    'F': 'Sección F: Oferta de FAPE - Servicio',
    'G': 'Sección G: Oferta de FAPE - Entorno educativo',
    'H': 'Sección H: Programa de Circunstancias de Emergencia',
    'I': 'Sección I: Firma y consentimiento de los padres',
    'J': 'Sección J: Notas de la reunión del equipo del IEP',
    'K': 'Sección K: Plan de evaluación',
    'L': 'Sección L: IEP a primera vista',
}
# Service clients shared by the cells below.
# Embedding model handle, constructed with library defaults.
embeddings = OpenAIEmbeddings()
# OpenAI client for chat completions; the key is read from the environment
# (None when OPENAI_API_KEY is unset).
client = openai.Client(
    api_key=os.environ.get("OPENAI_API_KEY"),
)
# AWS Translate client pinned to us-east-1 over SSL.
translate_client = boto3.client(
    service_name='translate',
    region_name='us-east-1',
    use_ssl=True,
)

In [None]:
# Load the three IEP manuals, split each into chunks, and embed every chunk
# into a single FAISS index used for retrieval.
loader1 = PyPDFLoader("riverside.pdf")
loader2 = PyPDFLoader("sfusd.pdf")
loader3 = PyPDFLoader("state.pdf")
entries = []
for pdf_loader in (loader1, loader2, loader3):
    entries.extend(pdf_loader.load_and_split())
FAISS_INDEX = FAISS.from_documents(entries, OpenAIEmbeddings())

In [100]:
def get_translation(text):
    """Translate ``text`` from English ('en') to Spanish ('es').

    Sends the text through the module-level ``translate_client`` and returns
    the 'TranslatedText' entry of the response.
    """
    response = translate_client.translate_text(
        Text=text,
        SourceLanguageCode='en',
        TargetLanguageCode='es',
    )
    return response['TranslatedText']
def get_header_id(text):
    """Convert a header string into a lowercase, underscore-separated id.

    Steps: lowercase the text, drop every parenthesised part, trim
    surrounding whitespace, then replace each space or slash with an
    underscore.

    Args:
        text: The header text, e.g. "English Language Arts (ELA)".

    Returns:
        The identifier string, e.g. "english_language_arts".
    """
    lowered = text.lower()
    # Remove content within parentheses
    without_parens = re.sub(r'\(.*?\)', '', lowered)
    # Trim BEFORE substituting: removing a trailing "(...)" leaves a space
    # behind, and once that space has become "_" strip() can no longer
    # remove it, which would yield ids such as "english_language_arts_".
    trimmed = without_parens.strip()
    # Replace every space and slash with an underscore
    return re.sub(r'[ /]', '_', trimmed)
def get_section_id(text):
    """Return the section id that SECTION_KEY_DICT holds for ``text``.

    Falls back to the string 'Unknown' when ``text`` is not a key.
    """
    return SECTION_KEY_DICT.get(text, 'Unknown')
def get_section_es(id):
    """Return the Spanish section title stored for section id ``id``.

    Falls back to the string 'Unknown' when ``id`` is not a key of
    SECTION_KEY_DICT_REVERSE_ES.
    """
    return SECTION_KEY_DICT_REVERSE_ES.get(id, 'Unknown')
def get_kb_url(row):
    """Build the knowledge-base page URLs for one row.

    Args:
        row: Mapping-like object exposing 'section_id' and 'header_id'.

    Returns:
        A 2-tuple ``(english_url, spanish_url)``; the Spanish URL carries an
        extra ``/es`` path prefix.
    """
    section_id = row['section_id']
    header_id = row['header_id']
    url_en = f"https://iep-kb-24839890c1f5.herokuapp.com/kb/{section_id}/{header_id}"
    url_es = f"https://iep-kb-24839890c1f5.herokuapp.com/es/kb/{section_id}/{header_id}"
    return url_en, url_es
def get_simplified_explanation(term):
    """Ask the chat model for a plain-language explanation of ``term``.

    Retrieves the 5 nearest chunks from FAISS_INDEX for the query
    "What is {term}?", passes the sufficiently close ones to the model as
    context, and links to the page of the closest chunk as the source.

    Args:
        term: The IEP term (header text) to explain.

    Returns:
        A 2-tuple ``(explanation_text, src_url)``. ``src_url`` is '' when the
        search returns no hits.

    Raises:
        KeyError: if a hit's 'source' metadata is not a key of SRC_URL_DICT,
            or the hit lacks 'source'/'page' metadata.
    """
    # Search once and reuse the hits; every call embeds the query again, so a
    # repeated identical search only costs an extra API request.
    matches = FAISS_INDEX.similarity_search_with_score(f'What is {term}?', k=5)

    # The index is built with FAISS.from_documents defaults, for which
    # LangChain documents the score as an L2 distance: LOWER means MORE
    # similar. Keep the chunks closer than the threshold.
    # NOTE(review): the threshold used to be applied with the opposite
    # comparison; re-check SIMILARITY_THRESH against real distances.
    context_chunks = [
        f'Content: {doc.page_content} MetaData: {doc.metadata}'
        for doc, score in matches
        if score < SIMILARITY_THRESH
    ]
    # Send plain text rather than a Python list repr padded with empty strings.
    relevant_data = '\n\n'.join(context_chunks)

    src_url = ''
    if matches:
        # Cite the closest chunk (smallest distance), not the farthest one.
        best_doc, _ = min(matches, key=lambda pair: pair[1])
        # PyPDFLoader numbers pages from 0, while the "#page=" URL fragment
        # counts from 1.
        page_number = best_doc.metadata['page'] + 1
        src_url = f"{SRC_URL_DICT[best_doc.metadata['source']]}#page={page_number}"

    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[{"role": "system", "content": INTRODUCTION},
                  {"role": "user", "content": TEMPLATE.format(term=term, relevant_data=relevant_data)}])
    return response.choices[0].message.content, src_url

In [64]:
# Build the full knowledge-base table from kb.csv and save it as kb_full.csv.
raw_df = pd.read_csv('kb.csv')
# keep=False drops EVERY row whose header occurs more than once (no
# representative is kept).
# NOTE(review): presumably intentional for ambiguous headers — confirm.
df = raw_df.drop_duplicates(subset=['header'], keep=False).copy()
df['header_id'] = df['header'].apply(get_header_id)
df['section_id'] = df['section'].apply(get_section_id)
# Copy after filtering so the column assignments below write to an
# independent frame rather than a slice of the previous one.
df = df[df['section_id'] != 'Unknown'].copy()
# Series.apply has no axis/result_type parameters: extra keywords are
# forwarded to the applied function, which raises TypeError here. Apply the
# function plainly and unpack the returned (content, src_url) tuples instead.
explained = df['header'].apply(get_simplified_explanation)
df['content'] = explained.map(lambda pair: pair[0])
df['src_url'] = explained.map(lambda pair: pair[1])
df['section_es'] = df['section_id'].apply(get_section_es)
df['content_es'] = df['content'].apply(get_translation)
df['header_es'] = df['header'].apply(get_translation)
# DataFrame.apply does support axis/result_type; 'expand' turns each returned
# tuple into columns 0 and 1.
result = df.apply(get_kb_url, axis=1, result_type='expand')
df['kb_url'], df['kb_url_es'] = result[0], result[1]
df.to_csv('kb_full.csv', index=False)

Unnamed: 0,header,section,header_id,section_id,content
0,Student Name,Section A: Information/Eligibility,student_name,A,Individualized Education Plans (IEPs) are plan...
1,Date of Birth,Section A: Information/Eligibility,date_of_birth,A,"Date of Birth is the exact day, month, and yea..."
2,Original SpEd Entry Date,Section A: Information/Eligibility,original_sped_entry_date,A,The Original SpEd Entry Date is the date when ...
3,Next Annual IEP,Section A: Information/Eligibility,next_annual_iep,A,"The ""Next Annual IEP"" refers to a meeting that..."
4,Last Eval,Section A: Information/Eligibility,last_eval,A,"The ""Last Eval"" field in an Individualized Edu..."
...,...,...,...,...,...
322,English Language Arts (ELA),Section L: IEP at a Glance,english_language_arts,L,Sure! You asked about English Language Arts (E...
323,Other State-Wide/District-Wide Assessment(s),Section L: IEP at a Glance,other_state-wide/district-wide_assessment,L,State-Wide/District-Wide Assessments are tests...
324,Supplementary Aids & Services and Other Supports,Section L: IEP at a Glance,supplementary_aids_&_services_and_other_supports,L,Supplementary Aids & Services and Other Suppor...
325,"Other Supports for School Personnel, or for St...",Section L: IEP at a Glance,"other_supports_for_school_personnel,_or_for_st...",L,"Other Supports for School Personnel, or for St..."


In [3]:
# Export kb_full.csv as one Markdown page per term and language, plus a
# _meta.<lang>.json file per section directory and one at the top level.
df = pd.read_csv('kb_full.csv')
terms_dir = '../pages/kb/terms/'
meta_en, meta_es = {}, {}
for section in df['section_id'].unique():
    section_df = df[df['section_id'] == section]
    section_dir = os.path.join(terms_dir, section)
    os.makedirs(section_dir, exist_ok=True)

    # header_id -> display title, one mapping per language.
    section_meta_en, section_meta_es = {}, {}
    for _, row in section_df.iterrows():
        header_id = row['header_id']
        section_meta_en[header_id] = row['header']
        section_meta_es[header_id] = row['header_es']

        page_en = f"# {row['header']}\n{row['header']}: {row['content']}\n\nYou can find more about this at: [{row['src_url']}]({row['src_url']})\n"
        page_es = f"# {row['header_es']}\n{row['header_es']}: {row['content_es']}\n\nPuedes encontrar más sobre esto en: [{row['src_url']}]({row['src_url']})\n"
        for lang_code, page_text in (('en', page_en), ('es', page_es)):
            page_path = os.path.join(section_dir, f"{header_id}.{lang_code}.md")
            with open(page_path, 'w', encoding='utf-8') as page_file:
                page_file.write(page_text)

    for lang_code, section_meta in (('en', section_meta_en), ('es', section_meta_es)):
        meta_path = os.path.join(section_dir, f'_meta.{lang_code}.json')
        with open(meta_path, 'w', encoding='utf-8') as meta_file:
            json.dump(section_meta, meta_file, ensure_ascii=False, indent=4)

    # The section titles are taken from the first row of the section.
    first_row = section_df.iloc[0]
    meta_en[section] = first_row['section']
    meta_es[section] = first_row['section_es']

# Top-level index: section id -> section title, per language.
for lang_code, top_meta in (('en', meta_en), ('es', meta_es)):
    meta_path = os.path.join(terms_dir, f'_meta.{lang_code}.json')
    with open(meta_path, 'w', encoding='utf-8') as meta_file:
        json.dump(top_meta, meta_file, ensure_ascii=False, indent=4)