In [1]:
import pandas as pd


In [2]:
import re
# Load Policy_2024.csv into the working frame.
df = pd.read_csv('Policy_2024.csv')

# Boolean mask: True where the course code contains "/"; missing codes count as False.
is_complex = df['Course Code'].str.contains('/', na=False)
# Keep only the slash-containing rows for separate inspection.
complex_codes_df = df.loc[is_complex]

In [3]:
def analyze_code_parts(row):
    """Measure the letter runs and digit runs found in a row's 'Course Code'.

    Args:
        row: Mapping-like object (for example a DataFrame row) that exposes
            a 'Course Code' entry.

    Returns:
        pd.Series indexed by 'dept_len', 'num_len' and 'max_num_val':
        the length of the longest alphabetic run, the length of the longest
        digit run, and the largest digit run read as an integer. Every value
        is 0 when the code is not a non-empty string, and an individual value
        is 0 when no run of that kind is present.
    """
    labels = ['dept_len', 'num_len', 'max_num_val']
    raw_code = row['Course Code']

    # Missing values (NaN/None) and empty strings carry nothing to measure.
    if not (isinstance(raw_code, str) and raw_code):
        return pd.Series([0, 0, 0], index=labels)

    # Letter runs are matched case-insensitively; a trailing section letter
    # such as the "A" in "2402A" therefore also counts as a letter run.
    letter_runs = re.findall(r'[A-Z]+', raw_code, re.IGNORECASE)
    digit_runs = re.findall(r'\d+', raw_code)

    # default=0 covers codes that have no letters or no digits at all.
    longest_letters = max(map(len, letter_runs), default=0)
    longest_digits = max(map(len, digit_runs), default=0)
    largest_number = max(map(int, digit_runs), default=0)

    return pd.Series([longest_letters, longest_digits, largest_number], index=labels)

# Run analyze_code_parts on every row and store its three results as new columns.
df[['dept_len', 'num_len', 'max_num_val']] = df.apply(
    analyze_code_parts,
    axis='columns',
)
# Character count of the raw course-code string.
df['total_len'] = df['Course Code'].str.len()

In [4]:
# Course codes whose longest letter run is only 1 or 2 characters long.
short_dept = df.loc[df['dept_len'].gt(0) & df['dept_len'].lt(3), 'Course Code']
short_dept

Series([], Name: Course Code, dtype: object)

In [5]:
# Course codes whose longest letter run exceeds 4 characters.
long_dept = df.loc[df['dept_len'].gt(4), 'Course Code']
long_dept

Series([], Name: Course Code, dtype: object)

In [6]:
# Course codes whose longest digit run is only 1 or 2 digits long.
short_num = df.loc[df['num_len'].gt(0) & df['num_len'].lt(3), 'Course Code']
short_num

Series([], Name: Course Code, dtype: object)

In [7]:
# Course codes whose longest digit run exceeds 4 digits.
long_num = df.loc[df['num_len'].gt(4), 'Course Code']
long_num

45    HMN 2402A/MCM 25100/POBS 2400
Name: Course Code, dtype: object

In [8]:
# Course codes containing a number greater than 3000.
large_num = df.loc[df['max_num_val'].gt(3000), 'Course Code']
large_num

45    HMN 2402A/MCM 25100/POBS 2400
Name: Course Code, dtype: object

In [9]:
# Course codes whose full string is shorter than 8 characters.
short_code = df.loc[df['total_len'].gt(0) & df['total_len'].lt(8), 'Course Code']
short_code

948    PHP 310
Name: Course Code, dtype: object

In [10]:
# Course codes whose full string is longer than 10 characters.
long_code = df.loc[df['total_len'].gt(10), 'Course Code']
long_code

3                            ENVS/EEPS 1615
45            HMN 2402A/MCM 25100/POBS 2400
47                           PHYS 1600/2600
133                          PHYS 1100/2100
134                   COLT 2650Y/ENGL 2561Z
154                          BIOL 1260/2260
217                          CSCI  410/1411
254                          APMA 1740/2610
287                          BIOL 2040/1040
368                          BIOL 1260/2260
374                     ENGN/PHYS 1590/2590
397                          PHYS 1610/2630
408                          PHYS 1600/2600
409                          CSCI 1550/2540
411                        ENGN 0930L/1930L
423                          BIOL 1515/2015
454                     ENGN/PHYS 1590/2590
482                          ANTH 1976/2401
488                          PHYS 0720/1720
492                          HIAA 1721/2721
515                          ENGN 1735/2735
526                    JUDS 0060/RELS 0060C
540                          COS

In [11]:
# Do the "long" codes coincide exactly with the slash-containing codes?
# Series.equals compares index labels and values together and returns a plain
# bool; an element-wise `==` raises ValueError as soon as the two selections
# are not identically labelled, so it cannot report a mismatch.
long_code.equals(complex_codes_df['Course Code'])

True

In [12]:
# Rows that have a course code but whose department is the literal 'Unknown'.
unk = df.loc[df['Course Code'].notna() & df['Department'].eq('Unknown')]
unk

Unnamed: 0,File,Course Code,Department,Knowledge Area,AI Policy,Multiple Clusters,LLM Failure,dept_len,num_len,max_num_val,total_len
18,0KNlHq1TXUCS4yGz2hSMsTZD2WQUOBF6Eo2SZqWm.pdf,GREK 1810,Unknown,Unknown,,0,0,4,4,1810,9
45,1oSdGXXX4qTQbXBwU13zCq9bfgBjg4b2d6CHl4dT.pdf,HMN 2402A/MCM 25100/POBS 2400,Unknown,Unknown,,0,0,4,5,25100,29
85,36DDUbawS1vrklHltmNWlCj4eYRLBTou5G1aHWds.pdf,GREK 1150,Unknown,Unknown,,0,0,4,4,1150,9
92,3GZAZtL71jArylKwiVq1sq5OealwWRLmWdjzQI2N.pdf,GREK 1060,Unknown,Unknown,,0,0,4,4,1060,9
118,4Xqw4yle5ItntFel5L3GRXuV2vJORUedkjrRUDkw.pdf,LACA 1900,Unknown,Unknown,Submitting work completed using artificial int...,1,0,4,4,1900,9
122,56UmS471oPGZK54JywMkjFzKzvfNxAnLFmJtoYvQ.pdf,UNIV 1110,Unknown,Unknown,"As such, it is important that the work you pre...",0,0,4,4,1110,9
179,7nfVKqylRoyRkvKXEA4VW0tEPsWxkqr1Xnh0IKYf.pdf,LACA 1504Q,Unknown,Unknown,,0,0,4,4,1504,10
245,ALvvW29UUkapGdqn64rJg13TBHyBckmpREf0qyXx.docx,LACA 1504R,Unknown,Unknown,,0,0,4,4,1504,10
277,BWV7KtnYqi78PgAKV1frgFnq7vnj3xxeJpKqr5oh.pdf,UNIV 1221,Unknown,Unknown,,0,0,4,4,1221,9
458,JIrBKvxQM99w6tqbciEvGiOdN164aYS0G6jMzvxf.pdf,UNIV 1211,Unknown,Unknown,,0,0,4,4,1211,9


In [13]:
def find_union_and_sort(list_of_number_sets):
    """Merge several collections of values into one sorted, duplicate-free list.

    Args:
        list_of_number_sets: Sequence of iterables holding hashable, mutually
            comparable values. A falsy argument (empty list, None) is accepted.

    Returns:
        list: Every distinct value found in any of the iterables, in ascending
        order; an empty list when the argument is falsy.
    """
    if not list_of_number_sets:
        return []

    # Accumulate members one collection at a time; the set drops duplicates.
    merged = set()
    for members in list_of_number_sets:
        merged.update(members)

    return sorted(merged)


# Gather the row labels flagged by each check, then merge them into one sorted list.
sets = [flagged.index for flagged in (short_dept, short_code, unk)]
idx = find_union_and_sort(sets)
print(f" Index that need be checked: {idx}")

 Index that need be checked: [18, 45, 85, 92, 118, 122, 179, 245, 277, 458, 506, 549, 550, 551, 579, 617, 633, 646, 655, 699, 852, 948, 1132, 1154, 1170, 1434]


#### 3. Summarize the course codes that may still need to be checked

In [14]:
# Raise the row-display cap so the flagged rows are not truncated.
pd.set_option('display.max_rows', 150)
# idx holds index *labels* collected from `.index`, so select by label with .loc.
# Positional .iloc only returns the same rows while df keeps its default RangeIndex.
df.loc[idx]
# final_unknown = df.loc[idx]
# final_unknown.to_csv('final_unknown.csv')

Unnamed: 0,File,Course Code,Department,Knowledge Area,AI Policy,Multiple Clusters,LLM Failure,dept_len,num_len,max_num_val,total_len
18,0KNlHq1TXUCS4yGz2hSMsTZD2WQUOBF6Eo2SZqWm.pdf,GREK 1810,Unknown,Unknown,,0,0,4,4,1810,9
45,1oSdGXXX4qTQbXBwU13zCq9bfgBjg4b2d6CHl4dT.pdf,HMN 2402A/MCM 25100/POBS 2400,Unknown,Unknown,,0,0,4,5,25100,29
85,36DDUbawS1vrklHltmNWlCj4eYRLBTou5G1aHWds.pdf,GREK 1150,Unknown,Unknown,,0,0,4,4,1150,9
92,3GZAZtL71jArylKwiVq1sq5OealwWRLmWdjzQI2N.pdf,GREK 1060,Unknown,Unknown,,0,0,4,4,1060,9
118,4Xqw4yle5ItntFel5L3GRXuV2vJORUedkjrRUDkw.pdf,LACA 1900,Unknown,Unknown,Submitting work completed using artificial int...,1,0,4,4,1900,9
122,56UmS471oPGZK54JywMkjFzKzvfNxAnLFmJtoYvQ.pdf,UNIV 1110,Unknown,Unknown,"As such, it is important that the work you pre...",0,0,4,4,1110,9
179,7nfVKqylRoyRkvKXEA4VW0tEPsWxkqr1Xnh0IKYf.pdf,LACA 1504Q,Unknown,Unknown,,0,0,4,4,1504,10
245,ALvvW29UUkapGdqn64rJg13TBHyBckmpREf0qyXx.docx,LACA 1504R,Unknown,Unknown,,0,0,4,4,1504,10
277,BWV7KtnYqi78PgAKV1frgFnq7vnj3xxeJpKqr5oh.pdf,UNIV 1221,Unknown,Unknown,,0,0,4,4,1221,9
458,JIrBKvxQM99w6tqbciEvGiOdN164aYS0G6jMzvxf.pdf,UNIV 1211,Unknown,Unknown,,0,0,4,4,1211,9


In [None]:
# Display the rows whose 'Course Code' contains "/" (the selection stored in complex_codes_df).
complex_codes_df

# Optional export of those rows, left disabled:
# complex_codes_df.to_csv('complex_code.csv')

In [16]:
# Share of rows that have a non-missing 'AI Policy' value.
int(df['AI Policy'].notna().sum()) / len(df)

0.3406813627254509

In [17]:
# Number of rows per 'Department' value, most frequent first.
df['Department'].value_counts()

Department
Engineering                                          99
Public Health                                        92
English                                              82
Economics                                            73
Political Science                                    64
History                                              63
Biology                                              57
Watson Institute                                     52
Physics                                              47
Applied Mathematics                                  47
Theatre Arts and Performance Studies                 42
East Asian Studies                                   41
Center for Language Studies                          38
Mathematics                                          38
Anthropology                                         36
Classics                                             36
Music                                                31
Cognitive, Linguistic, and Psychologi

In [18]:
# Number of rows per 'Knowledge Area' value, most frequent first.
df['Knowledge Area'].value_counts()

Knowledge Area
Humanities           568
Social Sciences      427
Physical Sciences    317
Life Sciences        157
Unknown               25
Arts                   3
Name: count, dtype: int64

In [19]:

# Whitespace-separated word count of each policy text; a missing policy counts as 0 words.
df['word_count'] = df['AI Policy'].fillna('').str.split().str.len()
# Rows whose policy text is present but shorter than 50 words.
df.loc[df['word_count'].gt(0) & df['word_count'].lt(50)]

Unnamed: 0,File,Course Code,Department,Knowledge Area,AI Policy,Multiple Clusters,LLM Failure,dept_len,num_len,max_num_val,total_len,word_count
1,02132Fz4IaY2bIfLT6mFVqih10XNgBKNsfCDqz9H.pdf,MATH 0750,Mathematics,Physical Sciences,Use of AI programs such as ChatGPT is not allo...,0,0,4,4,750,9,17
8,0DfIT6QWVbaGMhjrpgtBYWMynKuxEvcuymDWmfWf.pdf,PHP 2601,Public Health,Life Sciences,It is expected that students taking this cours...,0,0,3,4,2601,8,27
12,0g94gZaFW5zOCKRB1Yyow0IkmXOHNfu6b6PvnnqT.pdf,FREN 1410,French Studies,Humanities,Use of A-I (Chat-GPT or similar) is not author...,0,0,4,4,1410,9,12
23,0qeJIBcKNAKTasYeyYPTxYKjz0SWmuBm8IQcSfoS.pdf,CLPS 1255,"Cognitive, Linguistic, and Psychological Sciences",Social Sciences,"Generally speaking, you are not authorized to ...",1,0,4,4,1255,9,27
30,0ZIn6gxgaNz4oSLh3hnXj6AOs6Wu5ErVC1VLhYK8.pdf,NEUR 1530,Neuroscience,Life Sciences,"AI This course is intended to build research, ...",0,0,4,4,1530,9,25
...,...,...,...,...,...,...,...,...,...,...,...,...
1442,xyRhHQhZAIiwEbkoO8f8B5L14DoT5ZtyEdSDzd4V.docx,HIST 1961F,History,Social Sciences,Please note that use of ChatGPT or other AI pr...,1,0,4,4,1961,10,20
1455,yIiwpJwhLamf1PHgtJg74u2uxAPZ0iKJDp1XIkfS.docx,PHIL 1593,Philosophy,Humanities,My Addendum: Use of AI/Chatbots is not allowed...,0,0,4,4,1593,9,15
1468,yy16Jl8X8OOchCIfmErBdPHZplHsXkauGKjL7Z4E.pdf,PHP 0320,Public Health,Life Sciences,The use of any AI in this course is considered...,0,0,3,4,320,8,46
1486,zgut86QgNelZN8m5Ua3vN6KgPjswleYfSLFfv1fS.doc,KREA 500,East Asian Studies,Humanities,Youre strongly encouraged to exercise honesty ...,0,0,4,3,500,8,19
