### 1. Combine the 15 parts into one large file per attempt

In [1]:
import os
import pandas as pd


base_dir = 'Policy text'

# For each of the 5 attempts (j), stack the per-part CSVs found under
# "<base_dir>/part 1" ... "<base_dir>/part 15" into one combined CSV.
for j in range(1, 6):
    all_dfs = []
    for i in range(1, 16):
        folder_name = f'part {i}'
        csv_path = os.path.join(base_dir, folder_name, f'syllabus_summary_{j}.csv')

        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            all_dfs.append(df)
        else:
            # A missing part is reported but tolerated: the combined file is
            # then built from the parts that do exist.
            print(f"CSV not found: {csv_path}")

    # pd.concat([]) fails with an opaque "No objects to concatenate"; raise an
    # error that names the attempt and the folder that was searched instead.
    if not all_dfs:
        raise FileNotFoundError(
            f"No syllabus_summary_{j}.csv found in any 'part <n>' folder under {base_dir!r}"
        )

    combined_df = pd.concat(all_dfs, ignore_index=True)

    output_path = os.path.join(base_dir, f'all_syllabus_summary_combined_{j}.csv')
    # utf-8-sig prepends a BOM to the written file.
    combined_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    print(f"✅ Combined CSV saved to: {output_path}")

# NOTE: once the loop finishes, `combined_df` holds the last attempt (j == 5).
# Cells further down read `combined_df` and therefore depend on this.

✅ Combined CSV saved to: Policy text\all_syllabus_summary_combined_1.csv
✅ Combined CSV saved to: Policy text\all_syllabus_summary_combined_2.csv
✅ Combined CSV saved to: Policy text\all_syllabus_summary_combined_3.csv
✅ Combined CSV saved to: Policy text\all_syllabus_summary_combined_4.csv
✅ Combined CSV saved to: Policy text\all_syllabus_summary_combined_5.csv


### 2. Create the final file by keeping, for each syllabus, the most frequent AI-policy text across the 5 attempts

In [2]:
base_dir = 'Policy text'


def _load_attempt(version):
    """Read one attempt's combined CSV and tag each row with its position and attempt number."""
    frame = pd.read_csv(os.path.join(base_dir, f'all_syllabus_summary_combined_{version}.csv'))
    frame['OriginalIndex'] = range(len(frame))
    frame['Version'] = version
    return frame


def _most_frequent_row(group):
    """Return the first row of `group` whose 'AI Policy' equals the group's most frequent value.

    Series.mode() ignores NaN by default, so a group whose 'AI Policy' is NaN in
    every attempt yields an empty result and falls back to the group's first row.
    When several texts tie, the first value returned by mode() is used.
    """
    candidates = group['AI Policy'].mode()
    if candidates.empty:
        return group.iloc[0]
    return group[group['AI Policy'] == candidates.iloc[0]].iloc[0]


# One frame per attempt (1-5), then all attempts stacked on top of each other.
dfs = [_load_attempt(version) for version in range(1, 6)]
all_df = pd.concat(dfs, ignore_index=True)

# File name -> row position in attempt 1; used below to restore that ordering.
sort_key = dfs[0].set_index('File')['OriginalIndex'].to_dict()

# Pick one representative row per file, visiting files in order of first appearance.
grouped = all_df.groupby('File', sort=False)
final_rows = [_most_frequent_row(group) for _, group in grouped]

final_df = pd.DataFrame(final_rows)

# Re-order by attempt-1 position, then drop the bookkeeping columns.
# Files absent from attempt 1 map to NaN and end up last.
final_df['OriginalIndex'] = final_df['File'].map(sort_key)
final_df = final_df.sort_values('OriginalIndex')
final_df = final_df.drop(columns=['OriginalIndex', 'Version'])
final_df = final_df.reset_index(drop=True)

final_path = os.path.join(base_dir, 'final_syllabus_summary.csv')
final_df.to_csv(final_path, index=False, encoding='utf-8-sig')

print(f"✅ Clean final CSV saved to: {final_path} ")


✅ Clean final CSV saved to: Policy text\final_syllabus_summary.csv 


### 3. Rough analysis and estimate of the manual review workload

In [3]:
print('the amount of files that LLM fails: ')
# Count the rows whose 'LLM Failure' flag equals 1; int() keeps the displayed
# value a plain Python integer.
int((final_df['LLM Failure'] == 1).sum())

the amount of files that LLM fails: 


30

In [4]:
# Count the rows without a course code in `combined_df`, i.e. the frame written
# to 'all_syllabus_summary_combined_5.csv'.
# Attempts 1, 3, 4 and 5 share one rule for finding the course code;
# attempt 2 uses a different method.

print('The missing course code by the rule-based method: ')
print(int(combined_df['Course Code'].isna().sum()))

The missing course code by the rule-based method: 
136


In [5]:
# Policies with at least one but fewer than 50 words need a manual check.
# A missing policy is treated as an empty string, i.e. zero words, and is excluded.
policy_words = final_df['AI Policy'].fillna('').str.split()
final_df['word_count'] = policy_words.str.len()

word_count = final_df['word_count']
is_short = (word_count > 0) & (word_count < 50)
final_df.loc[is_short]

Unnamed: 0,File,Course Code,Department,Knowledge Area,AI Policy,Multiple Clusters,LLM Failure,word_count
6,0CBWL9I53UwogXerQPA4BcTiQgsYcVe4Xw2R5rkY.pdf,FREN 0600,French Studies,Humanities,"For assignments written in French, students mu...",0,0,25
8,0DLS8eklpSbyHbkZaAufNWlB0AQVFSdVkAFj8Aow.docx,,Unknown,Unknown,"""An assignment copied or barely reworded from ...",0,0,26
13,0LSAzakbpaLaUfLtCtUChU585hVxizgRmfnEQXWH.pdf,PHP 1501,Public Health,Life Sciences,Students may not use ChatGPT tools to complete...,0,0,30
22,0qzo0y6addAXK6uOmXpRgE9SInxWjyHtsfuspKbA.pdf,POLS 1225,Political Science,Social Sciences,Plagiarism violates the academic policies of B...,0,0,38
46,1wfQsKPNkMjjw649KJk7Acf4nnKO72QOaBmYbzah.pdf,LITR 0100A,Literary Arts,Humanities,"Additionally, no AI-generated work of any sort...",0,0,28
...,...,...,...,...,...,...,...,...
1457,zaOs0k980PB8UdwnQyKLmkM0O3OnveyIZeG2t7y3.pdf,MPA 2229,Watson Institute,Social Sciences,and,1,0,1
1464,zCw7quGJK5H1ioLe8zWbZOL0ppUgwS2XXpMSdKC1.pdf,TAPS 1500S,Theatre Arts and Performance Studies,Humanities,Any information gathered from AI tools should ...,1,0,17
1468,zh2CCBO6bcV6UNGbIQ1OryyI8VjOnHcestbLDkW1.docx,ARTS 2003,Brown Arts Institute,Arts,"With the advent of AI, a host of questions evo...",0,0,34
1471,zKluYL45YMqrZOkBoUtxQh6DvaWOjv9oU9sJvfGO.pdf,POLS 2000,Political Science,Social Sciences,and,1,0,1


In [6]:
# Compare the course codes produced by the two extraction methods:
# `combined_df` (suffix _df1) against the attempt-2 parts loaded here (suffix _df2).
all_dfs = []
for i in range(1, 16):
    folder_name = f'part {i}'
    csv_path = os.path.join(base_dir, folder_name, 'syllabus_summary_2.csv')

    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        all_dfs.append(df)
    else:
        print(f"CSV not found: {csv_path}")

# pd.concat([]) fails with an opaque "No objects to concatenate"; raise an
# error that names the file and the folder that was searched instead.
if not all_dfs:
    raise FileNotFoundError(
        f"No syllabus_summary_2.csv found in any 'part <n>' folder under {base_dir!r}"
    )

combined_df_2 = pd.concat(all_dfs, ignore_index=True)

# Inner join on the file name; every other shared column gets a _df1/_df2 suffix.
merged = pd.merge(combined_df, combined_df_2, on='File', suffixes=('_df1', '_df2'))

# NaN != NaN is True, so rows where both codes are missing also land in
# `mismatches`; the notna() filter below removes them. Rows where only the
# _df2 code is missing are kept and counted as different.
mismatches = merged[merged['Course Code_df1'] != merged['Course Code_df2']]
mismatches_code = mismatches[mismatches['Course Code_df1'].notna()][['Course Code_df1', 'Course Code_df2']]
print('The number of different codes obtained by the two methods: ', len(mismatches_code) )
mismatches_code

The number of different codes obtained by the two methods:  43


Unnamed: 0,Course Code_df1,Course Code_df2
25,PHP 121,PHP 0850
28,HIST 199,HI 1956A
42,PHYS 2340,PHYS 2030
63,PHYS 1530,PHYS 2050
102,ECON 136,POLS 1440
199,BIOL 2000X,BIOL 2000C
201,HISP 232,HS 2030B
231,PHP 121,PHP 2950
271,NEUR 2021,BIOL 1865
324,EDUC 164,EDUC 2380
