### 1. Combine the 15 parts into one large file per attempt

In [None]:
import os
import pandas as pd


base_dir = 'Policy text'

# For each of the 5 attempts (j), gather that attempt's CSV from the 15
# "part N" folders and write one combined CSV per attempt into base_dir.
# After the loop, combined_df refers to the last attempt processed (j == 5).
for j in range(1, 6):
    all_dfs = []
    for i in range(1, 16):
        folder_name = f'part {i}'
        csv_path = os.path.join(base_dir, folder_name, f'syllabus_summary_{j}.csv')

        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            all_dfs.append(df)
        else:
            # Best effort: a missing part is reported, not fatal.
            print(f"CSV not found: {csv_path}")

    if not all_dfs:
        # pd.concat([]) raises "ValueError: No objects to concatenate".
        # Report the missing attempt instead, and reset combined_df so that
        # later cells cannot silently reuse a previous attempt's frame.
        combined_df = None
        print(f"⚠️ No CSV files found for attempt {j}; nothing was written.")
        continue

    combined_df = pd.concat(all_dfs, ignore_index=True)

    output_path = os.path.join(base_dir, f'all_syllabus_summary_combined_{j}.csv')
    # utf-8-sig writes a BOM at the start of the file.
    combined_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    print(f"✅ Combined CSV saved to: {output_path}")

### 2. Create the final file by keeping, for each file, the most frequent AI-policy text across the 5 attempts

In [None]:
base_dir = 'Policy text'

# Load the combined CSV of every attempt, tagging each row with its position
# inside its own file and with the attempt number it came from.
dfs = []
for i in range(1, 6):
    df = pd.read_csv(os.path.join(base_dir, f'all_syllabus_summary_combined_{i}.csv'))
    df['OriginalIndex'] = range(len(df))
    df['Version'] = i
    dfs.append(df)

all_df = pd.concat(dfs, ignore_index=True)

# File -> row position in attempt 1; used below to restore that ordering.
sort_key = dfs[0].set_index('File')['OriginalIndex'].to_dict()

# For every file, keep the first row whose 'AI Policy' text equals the most
# frequent text among the attempts (the first value returned by mode()).
final_rows = []
grouped = all_df.groupby('File', sort=False)
for file_name, group in grouped:
    mode_values = group['AI Policy'].mode()
    if mode_values.empty:
        # mode() returned nothing to vote on: fall back to the group's first row.
        final_rows.append(group.iloc[0])
        continue
    is_winner = group['AI Policy'] == mode_values.iloc[0]
    final_rows.append(group[is_winner].iloc[0])

final_df = pd.DataFrame(final_rows)

# Order the rows by their position in attempt 1, then drop the helper columns.
final_df['OriginalIndex'] = final_df['File'].map(sort_key)
final_df = final_df.sort_values('OriginalIndex')
final_df = final_df.drop(columns=['OriginalIndex', 'Version'])
final_df = final_df.reset_index(drop=True)

final_path = os.path.join(base_dir, 'final_syllabus_summary.csv')
final_df.to_csv(final_path, index=False, encoding='utf-8-sig')

print(f"✅ Clean final CSV saved to: {final_path} ")


### 3. Rough analysis and an estimate of how much manual review is needed

In [None]:
# Count the rows of final_df whose 'LLM Failure' column equals 1.
# The bare expression on the last line is what the notebook cell displays.
print('the amount of files that LLM fails: ')
sum(final_df['LLM Failure'].eq(1))

In [None]:
# Number of rows with no course code in 'all_syllabus_summary_combined_5.csv'.
# NOTE: combined_df is expected to be the attempt-5 frame left over from the
# combining step above.
# Attempts 1, 3, 4 and 5 share the same rule for finding the course code,
# while attempt 2 uses the other method.

print('The missing course code by the rule-based method: ')
print(combined_df['Course Code'].isna().sum())

In [None]:
# Rows whose 'AI Policy' text is non-empty but shorter than 50 words; these
# need to be checked. Missing text is treated as an empty string (0 words).
final_df['word_count'] = final_df['AI Policy'].fillna('').str.split().str.len()
word_count = final_df['word_count']
final_df[word_count.gt(0) & word_count.lt(50)]

In [None]:
# Compare the course codes produced by the two extraction methods:
# combined_df (from the combining step) against attempt 2, rebuilt here from
# the 15 part folders.
part_paths = [
    os.path.join(base_dir, f'part {i}', 'syllabus_summary_2.csv')
    for i in range(1, 16)
]

all_dfs = []
for csv_path in part_paths:
    if not os.path.exists(csv_path):
        print(f"CSV not found: {csv_path}")
        continue
    all_dfs.append(pd.read_csv(csv_path))

combined_df_2 = pd.concat(all_dfs, ignore_index=True)

# Pair up the rows of both frames by file name; columns present in both
# frames get the _df1 / _df2 suffixes.
merged = combined_df.merge(combined_df_2, on='File', suffixes=('_df1', '_df2'))

# Keep the rows where the two codes differ and the first method found a code.
mismatches = merged[merged['Course Code_df1'].ne(merged['Course Code_df2'])]
mismatches_code = mismatches.loc[
    mismatches['Course Code_df1'].notna(),
    ['Course Code_df1', 'Course Code_df2'],
]
print('The number of different codes obtained by the two methods: ', len(mismatches_code) )
mismatches_code