In [76]:
#Import necessary libraries, pandas, numpy and mlxtend
import pandas as pd
import numpy as np
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [77]:
# Export the fatality sheet to a temporary CSV, then read it back so the
# offsets and the string dtype can be applied by read_csv.
pd.read_excel(
    "bitre_fatalities_dec2024.xlsx",
    sheet_name="BITRE_Fatality",
).to_csv("temp1.csv", index=False)

# NOTE(review): skiprows=2 / header=2 presumably step over title rows that
# sit above the real header in the workbook -- confirm against the file.
df2 = pd.read_csv(
    "temp1.csv",
    skiprows=2,
    header=2,
    dtype=str,  # read every column as a string
    low_memory=False,
)

# Drop the columns left out of the analysis, then relabel both the '-9' code
# and any missing cell as 'Undetermined'.
# `new_df` and `cleaned_df` deliberately refer to the same cleaned frame.
cleaned_df = new_df = (
    df2.drop(columns=['Crash ID', 'Age', 'SA4 Name 2021', 'National LGA Name 2021' ])
    .replace('-9', 'Undetermined')
    .fillna('Undetermined')
)

In [78]:
# Count the missing values in each column of the cleaned frame.
# Missing cells were filled with 'Undetermined' when the frame was built, so
# every count is expected to be 0.
# (The earlier bare `cleaned_df.isna()` was dropped: only the last expression
# of a cell is displayed, so that full boolean frame was built and discarded.)
cleaned_df.isna().sum()

State                            0
Month                            0
Year                             0
Dayweek                          0
Time                             0
Crash Type                       0
Bus Involvement                  0
Heavy Rigid Truck Involvement    0
Articulated Truck Involvement    0
Speed Limit                      0
Road User                        0
Gender                           0
National Remoteness Areas        0
National Road Type               0
Christmas Period                 0
Easter Period                    0
Age Group                        0
Day of week                      0
Time of day                      0
dtype: int64

In [79]:
# Plain-text preview of the first five cleaned records.
print(cleaned_df.head(n=5))

  State Month  Year Dayweek      Time Crash Type Bus Involvement  \
0   NSW    12  2024  Friday  04:00:00     Single              No   
1   NSW    12  2024  Friday  06:15:00     Single              No   
2   Tas    12  2024  Friday  09:43:00   Multiple              No   
3   NSW    12  2024  Friday  10:35:00   Multiple              No   
4   Vic    12  2024  Friday  11:30:00   Multiple    Undetermined   

  Heavy Rigid Truck Involvement Articulated Truck Involvement   Speed Limit  \
0                            No                            No           100   
1                            No                            No            80   
2                            No                            No            50   
3                            No                            No           100   
4                  Undetermined                  Undetermined  Undetermined   

   Road User  Gender National Remoteness Areas         National Road Type  \
0     Driver    Male  Inner Regional Au

In [80]:
import string

# Alphabet used to build the per-column prefixes: A, B, C, ...
prefixes = list(string.ascii_uppercase)


def _column_prefix(position):
    """Return a spreadsheet-style label for a 0-based column position.

    0 -> 'A', 25 -> 'Z', 26 -> 'AA', 27 -> 'AB', ...

    Positions 0-25 give the same single letters as indexing `prefixes`
    directly. Past 26 columns the labels grow instead of wrapping around,
    so no two columns ever share a prefix.
    """
    label = ''
    remaining = position + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, len(prefixes))
        label = prefixes[offset] + label
    return label


# Tag every value with a prefix unique to its column, so equal raw values
# from different columns (for example 'No') become distinct items.
# NOTE: this rewrites cleaned_df in place; running the cell twice stacks the
# prefixes (e.g. 'A_A_NSW'), so re-run from the loading cell first.
for idx, col in enumerate(cleaned_df.columns):
    prefix = _column_prefix(idx)
    cleaned_df[col] = prefix + '_' + cleaned_df[col].astype(str)


In [81]:

# List the column labels of the cleaned frame.
print(cleaned_df.columns)

Index(['State', 'Month', 'Year', 'Dayweek', 'Time', 'Crash Type',
       'Bus Involvement', 'Heavy Rigid Truck Involvement',
       'Articulated Truck Involvement', 'Speed Limit', 'Road User', 'Gender',
       'National Remoteness Areas', 'National Road Type', 'Christmas Period',
       'Easter Period', 'Age Group', 'Day of week', 'Time of day'],
      dtype='object')


In [82]:
# Make sure every cell is a string before building the transactions.
cleaned_df = cleaned_df.astype(str)

# One transaction per row: the list of that row's (prefixed) values.
my_list = cleaned_df.values.tolist()

# Encode the transactions as an array with one column per distinct item.
# `array_te` and `te.columns_` can be inspected in a separate cell; as bare
# mid-cell expressions they were evaluated and never displayed.
te = TransactionEncoder()
array_te = te.fit(my_list).transform(my_list)

# apriori() works on a DataFrame, so convert the encoded array into one whose
# column labels are the item names.
arm_df = pd.DataFrame(array_te, columns=te.columns_)

In [83]:
# Thresholds used by this cell, gathered in one place.
MIN_SUPPORT = 0.2            # minimum support for a frequent itemset
PAIR_MIN_SUPPORT = 0.3       # stricter support for the 2-item inspection view
MIN_CONFIDENCE = 0.5         # minimum confidence when generating rules
MIN_LIFT = 1                 # minimum lift when generating rules
REPORT_MIN_CONFIDENCE = 0.6  # confidence cut-off for the exported rules

# Mine the frequent itemsets from the encoded transactions.
frequent_itemsets = apriori(arm_df, min_support=MIN_SUPPORT, use_colnames=True)

# Number of items in each frequent itemset.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# 2-item sets with support >= PAIR_MIN_SUPPORT. This used to be a bare
# mid-cell expression whose result was thrown away; it is now named so it can
# be inspected in its own cell.
frequent_pairs = frequent_itemsets[
    (frequent_itemsets['length'] == 2)
    & (frequent_itemsets['support'] >= PAIR_MIN_SUPPORT)
]

# Rules generated with a minimum confidence of MIN_CONFIDENCE.
rules_con = association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)

# Rules generated with a minimum lift of MIN_LIFT.
rules_lift = association_rules(frequent_itemsets, metric="lift", min_threshold=MIN_LIFT)

# From the confidence-based rules keep only the columns of interest:
# antecedents, consequents, support, confidence and lift.
result_arm = rules_con[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

# Keep the rules whose confidence is >= REPORT_MIN_CONFIDENCE (0.6).
# The previous comment said 0.7, but the filter has always used 0.6.
new_result_arm = result_arm[result_arm['confidence'] >= REPORT_MIN_CONFIDENCE]

# Save the result to CSV and display it.
new_result_arm.to_csv("association_rules.csv", index=False)
new_result_arm

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(A_NSW),(G_No),0.296638,0.973739,0.993104
1,(A_NSW),(I_No),0.269297,0.883989,0.982913
2,(A_NSW),(L_Male),0.217903,0.715283,0.996254
3,(A_NSW),(M_Unknown),0.245930,0.807284,1.008644
4,(A_NSW),(N_Undetermined),0.245947,0.807342,0.995248
...,...,...,...,...,...
39123,"(S_Day, I_No, N_Undetermined)","(P_No, M_Unknown, G_No, R_Weekday, O_No)",0.252435,0.621569,1.416197
39125,"(S_Day, N_Undetermined, R_Weekday)","(P_No, M_Unknown, G_No, I_No, O_No)",0.252435,0.796859,1.176240
39126,"(S_Day, R_Weekday, O_No)","(P_No, M_Unknown, G_No, I_No, N_Undetermined)",0.252435,0.659667,0.942543
39128,"(I_No, N_Undetermined, R_Weekday)","(P_No, M_Unknown, G_No, S_Day, O_No)",0.252435,0.612788,1.446308


In [87]:
def _format_itemset(items):
    """Render an itemset as a comma-separated string.

    Items are joined in sorted order so the text is the same on every run
    (set iteration order is not stable across sessions). The item text is
    left untouched, unlike stripping brackets and quotes from a list repr.
    """
    return ', '.join(sorted(items))


def _has_item_with_prefix(items, prefix):
    """Return True when at least one item of the itemset starts with `prefix`."""
    return any(item.startswith(prefix) for item in items)


# 'K_' is the prefix the prefixing cell gives to the 11th column, which is
# 'Road User' in the column listing printed earlier.
TARGET_PREFIX = 'K_'

# Readable copy of the rules: itemsets rendered as plain strings.
result_str = new_result_arm.copy()
result_str['antecedents'] = new_result_arm['antecedents'].apply(_format_itemset)
result_str['consequents'] = new_result_arm['consequents'].apply(_format_itemset)

# Select the rules whose consequents contain an item with the target prefix.
# The test runs on the original itemsets, item by item, so 'K_' appearing in
# the middle of some other item cannot produce a false match.
# astype(bool) keeps the mask boolean even when there are no rules at all.
has_target = (
    new_result_arm['consequents']
    .apply(lambda items: _has_item_with_prefix(items, TARGET_PREFIX))
    .astype(bool)
)
k_rules = result_str[has_target]

# Order by confidence, highest first.
k_rules_sorted = k_rules.sort_values('confidence', ascending=False)

# Print the results.
print("包含K_的规则数量:", len(k_rules))
print("\n规则详情:")
print(k_rules_sorted)

# Optionally save the results to CSV.
k_rules_sorted.to_csv("k_rules.csv", index=False)

# Print some basic summary statistics (skipped when no rule matched).
if len(k_rules) > 0:
    print("\n统计信息:")
    print("平均置信度:", round(k_rules['confidence'].mean(), 3))
    print("平均支持度:", round(k_rules['support'].mean(), 3))
    print("平均提升度:", round(k_rules['lift'].mean(), 3))

包含K_的规则数量: 0

规则详情:
Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []
