In [1]:
import os
import time

import openai
import openpyxl
import pandas as pd
from tqdm import tqdm

# Topic Generation

In [161]:
def return_prompt(ipt, document):
    """Fill the ``{DOCUMENT}`` placeholder of a prompt template.

    Args:
        ipt: Template text containing zero or more ``{DOCUMENT}`` markers.
        document: Value to insert. It is converted with ``str()`` so that
            non-string cells (e.g. a NaN read from a CSV) do not raise
            ``TypeError`` inside ``str.replace``.

    Returns:
        The template with every ``{DOCUMENT}`` occurrence replaced.
    """
    return ipt.replace("{DOCUMENT}", str(document))

In [None]:
# Load the corpus used for topic generation.
data = 'data/topic_modeling_data.csv'
df = pd.read_csv(data, encoding = 'latin-1', sep=',')

# Read the topic-generation template; the context manager closes the file
# handle instead of leaving it open for the lifetime of the kernel.
with open(r"prompts/topic_generation_prompt.txt", 'r') as prompt_file:
    prompt = prompt_file.read()

In [163]:
# Concatenate the 'Text' of rows 0..10 into one string, one numbered
# "Document k: ..." entry per line (k starts at 1).
entries = []
for i in range(11):
    doc = df.loc[i, 'Text']
    entries.append(f'Document {i + 1}: {doc} \n')
document = ''.join(entries)

In [164]:
# Substitute the concatenated documents into the generation template.
full_prompt = return_prompt(ipt=prompt, document=document)

In [165]:
# full_prompt

In [None]:
# Take the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret never has to be typed into (or saved with) the notebook. Falls back
# to an empty string when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# One chat-completion request carrying the full topic-generation prompt.
request_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": full_prompt},
]
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=request_messages,
    temperature=0.01,
    max_tokens=5000,
)


In [168]:
# Keep only the message text of the first returned choice.
first_choice = response.choices[0]
topic_list = first_choice.message.content

In [169]:
# Display the current value of topic_list.
topic_list

'Based on the documents provided, the following top-level topics have been identified:\n\n1. **Education**\n   - Description: This topic encompasses various aspects of the educational process, including instructional strategies, curriculum development, assessment methods, professional development for educators, student performance tracking, and educational objectives and goals alignment with standards.\n\n2. **Student Assessment and Achievement**\n   - Description: This topic covers the processes and methodologies involved in evaluating student performance, including standardized testing, reading assessments, and other forms of academic evaluation. It also includes strategies for improving student achievement levels in core subjects like math, ELA, and science.\n\n3. **Professional Development**\n   - Description: This topic involves the continuous education and skill development of teachers and educational staff, including the implementation of best teaching practices, collaboration a

In [128]:
# Count the whitespace-separated tokens in the assembled prompt and show it.
prompt_words = full_prompt.split()
word_count = len(prompt_words)
word_count

4283

In [170]:
# Persist the generated topic list. The encoding is pinned to UTF-8 because
# the platform default may be unable to encode every character the model
# returns, which would raise UnicodeEncodeError part-way through the write.
with open('prompts/Stage-1_LLM_topic_list00.txt', 'w', encoding='utf-8') as file:
    file.write(topic_list)

## Topic generation for individual documents

In [53]:
# Send each row's 'Text' through the topic-generation prompt on its own.
# The reply text is stored in the row's 'Topic' column and also collected,
# in row order, in topic_list.
topic_list = []
n_documents = len(df)
for i in range(n_documents):
    doc = df.loc[i, 'Text']
    full_prompt = return_prompt(prompt, doc)
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": full_prompt},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0.2,
        max_tokens=4000,
    )
    topic = response.choices[0].message.content
    df.loc[i, 'Topic'] = topic
    topic_list.append(topic)
    

In [54]:
# Display the list of per-document replies collected in topic_list.
topic_list

['1. **Education Performance Metrics**\n   - Description: This topic covers the measurement and analysis of student performance in various academic areas, including math, ELA (English Language Arts), and science. It involves tracking proficiency levels, graduation rates, and comparative rankings among districts.\n\n2. **Student Support Programs**\n   - Description: This topic encompasses programs and objectives aimed at improving the educational outcomes of specific student groups, such as those eligible for free or reduced lunch and students with IEPs (Individualized Education Programs). It includes strategies for increasing proficiency in core academic areas.\n\n3. **Academic Improvement Strategies**\n   - Description: Strategies and objectives focused on enhancing the academic performance of students across various subjects. This includes setting specific targets for improvement in state testing scores and rankings compared to other districts.\n\n4. **Data Analysis and Reporting**\n

In [55]:
# Write every collected reply to one text file, each followed by a newline.
with open('prompts/Topic_list_indiv_0shot.txt', 'w') as file:
    file.writelines(topic + '\n' for topic in topic_list)

In [56]:
# Save the frame (including the new 'Topic' column) without the index.
# NOTE(review): latin-1 cannot represent every Unicode character; confirm the
# model replies always fit, otherwise this write raises UnicodeEncodeError.
df.to_csv(
    path_or_buf='prompts/detailed_topic_list_0shot.csv',
    index=False,
    encoding='latin-1',
)

# Merging Prompt

In [77]:
def return_prompt2(ipt, document):
    """Return ``ipt`` with each ``{topic_list}`` marker replaced by ``document``."""
    return ipt.replace("{topic_list}", document)

In [78]:
# Read the merging template and the per-document topic list. Context managers
# close both file handles as soon as the text has been read.
with open(r"prompts/merging_prompt.txt", 'r') as prompt_file:
    prompt = prompt_file.read()

with open(r"prompts/Topic_list_indiv_0shot.txt", 'r') as topic_file:
    topic_l = topic_file.read()

# Insert the topic list into the template's {topic_list} placeholder.
full_prompt = return_prompt2(prompt, topic_l)

In [79]:
# One chat-completion request carrying the merging prompt.
request_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": full_prompt},
]
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=request_messages,
    temperature=0.2,
    max_tokens=4000,
)

In [80]:
# Display the message text of the first choice in `response`.
response.choices[0].message.content

"[1] Education\n   - Description: Topics related to the process of teaching and learning, including specific programs, goals, metrics for success in academic environments, and the improvement of educational outcomes. This encompasses planning and implementation of educational programs, curriculum development, assessment of educational outcomes, goals related to improving educational outcomes such as student proficiency in various content areas, the reduction of negative indicators such as absenteeism and low grades, processes and tools used to evaluate student performance including Common Formative Assessments and I-Ready Testing, specific metrics used to measure student performance and school effectiveness including attendance rates, office referrals, and grade distribution, and strategies implemented to improve student outcomes and address areas of concern, aiming to increase proficiency and reduce negative indicators. It also includes the process of gathering and analyzing data to i

In [None]:
# Persist the merged topic list. The encoding is pinned to UTF-8 because the
# platform default may be unable to encode every character the model returns,
# which would raise UnicodeEncodeError part-way through the write.
with open('prompts/merged_topic_list.txt', 'w', encoding='utf-8') as file:
    file.write(response.choices[0].message.content)

# Topic Assignment

In [5]:
def return_prompt2(ipt, topics, document):
    """Fill the ``{TOPIC LIST}`` and ``{DOCUMENT}`` markers of a template.

    ``{TOPIC LIST}`` is substituted first, then ``{DOCUMENT}`` (after
    converting ``document`` with ``str()``), so a ``{DOCUMENT}`` marker that
    appears inside ``topics`` is substituted as well.
    """
    return ipt.replace("{TOPIC LIST}", topics).replace("{DOCUMENT}", str(document))

In [None]:
# Read the assignment template and the final topic list. Context managers
# close both file handles as soon as the text has been read.
with open(r"prompts/topic_assignment_prompt.txt", 'r') as prompt_file:
    prompt = prompt_file.read()

with open(r"final-topic-list.txt", 'r') as topics_file:
    topics = topics_file.read()

# Load the documents that will receive topic assignments.
data = 'data/all_bip_TM.csv'
df = pd.read_csv(data, encoding = 'utf-8', sep=',')

# Take the API key from the environment rather than a literal in the notebook;
# falls back to an empty string when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
len(df)

2817

In [None]:
# Ask the model to assign topics to every row's 'Text', one request per row.
# Each reply is written to the row's 'Assigned Topics by LLM' column and also
# appended to topic_list; tqdm shows progress over the rows.
topic_list = []

n_documents = len(df)
for i in tqdm(range(n_documents)):
    doc = df.loc[i, 'Text']
    full_prompt = return_prompt2(prompt, topics, doc)
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": full_prompt},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0.2,
        max_tokens=4000,
    )
    topic = response.choices[0].message.content
    df.loc[i, 'Assigned Topics by LLM'] = topic
    topic_list.append(topic)



100%|██████████| 2817/2817 [3:11:21<00:00,  4.08s/it]  


In [None]:
# Save the frame with the assigned topics, omitting the index column.
df.to_csv(path_or_buf='data/topic_assigned_data.csv', index=False)

Unnamed: 0,PersonId,BuildingID,DistrictID,Text,SchoolYear,Assigned Topics by LLM
3,00A2FBE6-BC16-E511-80C0-005056B067F1,7CF88B4E-CC34-4BAB-8EDD-AC6A00EC88D3,E3A2EA3F-5E13-E511-80C0-005056B067F1,1. Staff will create a common assessment to gi...,2020-2021,1. Academic Assessments \n2. Academic Goals ...
4,00A2FBE6-BC16-E511-80C0-005056B067F1,7CF88B4E-CC34-4BAB-8EDD-AC6A00EC88D3,E3A2EA3F-5E13-E511-80C0-005056B067F1,1. Increase Student to Student Feedback in the...,2021-2022,1. Instruction\n2. Academic Goals\n\nExplanati...
12,01FCCBE6-BC16-E511-80C0-005056B067F1,BBA6EA3F-5E13-E511-80C0-005056B067F1,82A2EA3F-5E13-E511-80C0-005056B067F1,Objective 1: Maintain or exceed the new attend...,2020-2021,1. Attendance\n2. Academic Assessments\n3. Aca...
13,01FCCBE6-BC16-E511-80C0-005056B067F1,BBA6EA3F-5E13-E511-80C0-005056B067F1,82A2EA3F-5E13-E511-80C0-005056B067F1,Objective 1: Maintain or exceed the new attend...,2021-2022,1. Attendance \n2. Academic Assessments \n3....
22,02A2FBE6-BC16-E511-80C0-005056B067F1,F6A5EA3F-5E13-E511-80C0-005056B067F1,C2A1EA3F-5E13-E511-80C0-005056B067F1,Building level goals were written and revised ...,2020-2021,1. Academic Goals\n2. Behavioral Goals\n3. Cur...
...,...,...,...,...,...,...
2795,FEC4DCE6-BC16-E511-80C0-005056B067F1,CEA9EA3F-5E13-E511-80C0-005056B067F1,CAA0EA3F-5E13-E511-80C0-005056B067F1,,2021-2022,1. Academic Assessments\n2. Academic Goals\n3....
2801,FFACDCE6-BC16-E511-80C0-005056B067F1,FAA5EA3F-5E13-E511-80C0-005056B067F1,D1A1EA3F-5E13-E511-80C0-005056B067F1,,2020-2021,1. Academic Assessments \n2. Academic Goals ...
2802,FFACDCE6-BC16-E511-80C0-005056B067F1,FAA5EA3F-5E13-E511-80C0-005056B067F1,D1A1EA3F-5E13-E511-80C0-005056B067F1,Our 2021-2022 Building Goals: 1. Increase crit...,2021-2022,1. Academic Goals\n2. Behavioral Goals\n3. Ins...
2806,FFBDDCE6-BC16-E511-80C0-005056B067F1,4AA8EA3F-5E13-E511-80C0-005056B067F1,F0A1EA3F-5E13-E511-80C0-005056B067F1,Academic Goal: 100% of Hawk Point Elementary g...,2020-2021,1. Academic Goals\n2. Attendance\n3. Behaviora...


In [93]:
# Reload the topic-analysis results; this rebinds `df` to the new frame.
df = pd.read_csv(
    filepath_or_buffer='data/Topic Analysis Results/All-Data-Topic Analysis.csv',
    sep=',',
)

In [101]:
# Split each reply into a 'Topics' part and an 'Explanation' part at the
# literal marker 'Explanation:'.
c = 0  # not used by this cell; kept so any later reader of `c` still finds it
haha = []  # row labels whose reply did not split into exactly two parts
for i in range(len(df)):
    raw_reply = df.loc[i, 'Assigned Topics by LLM']
    # A missing cell (NaN after a CSV round trip) is not a string and has no
    # .split(); treat it like any other unsplittable reply instead of crashing.
    line = raw_reply.split('Explanation:') if isinstance(raw_reply, str) else []
    if len(line) == 2:
        df.loc[i, 'Topics'] = line[0]
        df.loc[i, 'Explanation'] = line[1]
    else:
        # Zero or several markers: record the row and store a fixed
        # placeholder explanation; 'Topics' is not set for this row.
        haha.append(i)
        df.loc[i, 'Explanation'] = 'I\'m unable to access external documents, including Google Docs.'



In [108]:
# Save the frame with the separate 'Topics' / 'Explanation' columns, no index.
df.to_csv(
    path_or_buf='data/Topic-Analysis-Column_divided/All-Data-Topic Analysis.csv',
    index=False,
)

In [109]:
# Write one CSV per period: keep the rows whose 'SchoolYear' is in the listed
# years and save them without the index. The pre-pandemic file is written
# first, then the post-pandemic one.
for school_years_to_keep, out_path in (
    (['2017-2018', '2018-2019'],
     'data/Topic-Analysis-Column_divided/pre-pandemic-topic-analysis.csv'),
    (['2020-2021', '2021-2022'],
     'data/Topic-Analysis-Column_divided/post-pandemic-topic-analysis.csv'),
):
    filtered_df = df[df['SchoolYear'].isin(school_years_to_keep)]
    filtered_df.to_csv(out_path, index=False)


In [103]:
# Inspect the row labelled 1264.
df.loc[1264]

PersonId                               72FACBE6-BC16-E511-80C0-005056B067F1
BuildingID                             BCA7EA3F-5E13-E511-80C0-005056B067F1
DistrictID                             33A1EA3F-5E13-E511-80C0-005056B067F1
Text                               See attachedSee attached\n\nSee Attached
SchoolYear                                                        2021-2022
Assigned Topics by LLM    It seems that there is no document attached fo...
Topics                                                                  NaN
Explanation               I'm unable to access external documents, inclu...
Name: 1264, dtype: object

In [107]:
# Inspect the row labelled 2730.
df.loc[2730]

PersonId                               F7AC5738-956C-E711-80E1-005056B067F1
BuildingID                             E2AAEA3F-5E13-E511-80C0-005056B067F1
DistrictID                             33A1EA3F-5E13-E511-80C0-005056B067F1
Text                                           See LinkSee Link\n\nSee Link
SchoolYear                                                        2021-2022
Assigned Topics by LLM    I'm sorry, but it seems that the document you ...
Topics                                                                  NaN
Explanation               I'm unable to access external documents, inclu...
Name: 2730, dtype: object