In [3]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [4]:
file_path = 'symbipredict_2022.csv'  # Path to your dataset
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection


In [9]:
# Convert each row into a transaction of symptoms and prognosis
transactions = []
for _, row in df.iterrows():
    # Select all symptoms with value 1
    symptoms = row[row == 1].index.tolist()
    # Add the prognosis (disease) to the transaction
    symptoms.append(row['prognosis'])
    transactions.append(symptoms)

# Display a sample of transactions
# print("\nSample Transactions:\n", transactions[:5])
for i in range(100,105):
    print(transactions[i])


['headache', 'chest_pain', 'dizziness', 'loss_of_balance', 'lack_of_concentration', 'Hypertension ']
['chest_pain', 'dizziness', 'loss_of_balance', 'lack_of_concentration', 'Hypertension ']
['headache', 'dizziness', 'loss_of_balance', 'lack_of_concentration', 'Hypertension ']
['headache', 'chest_pain', 'loss_of_balance', 'lack_of_concentration', 'Hypertension ']
['headache', 'chest_pain', 'dizziness', 'lack_of_concentration', 'Hypertension ']


In [10]:
# Transform the transactions into a one-hot encoded DataFrame
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Display the one-hot encoded DataFrame
print("\nOne-Hot Encoded DataFrame:\n", df_encoded.head())



One-Hot Encoded DataFrame:
     AIDS   Acne  Alcoholic Hepatitis  Allergy  Arthritis  Bronchial Asthma  \
0  False  False                False    False      False             False   
1  False  False                False    False      False             False   
2  False  False                False    False      False             False   
3  False  False                False    False      False             False   
4  False  False                False    False      False             False   

   Cervical Spondylosis  Chickenpox  Chronic Cholestasis  Common Cold  ...  \
0                 False       False                False        False  ...   
1                 False       False                False        False  ...   
2                 False       False                False        False  ...   
3                 False       False                False        False  ...   
4                 False       False                False        False  ...   

   vomiting  watering_from_eyes  

In [14]:
# Set a minimum support threshold
min_support = 0.05  # Adjust based on your needs

# Generate frequent itemsets
frequent_itemsets = apriori(df_encoded, min_support=min_support, use_colnames=True)

# Display frequent itemsets
frequent_itemsets.head()


Unnamed: 0,support,itemsets
0,0.209837,(abdominal_pain)
1,0.069542,(blurred_and_distorted_vision)
2,0.091514,(breathlessness)
3,0.141504,(chest_pain)
4,0.162266,(chills)


In [26]:
# Set a minimum confidence threshold
min_confidence = 0.6  # Adjust based on your needs
num_itemsets = df_encoded.shape[0]

# Generate association rules
rules = association_rules(
    frequent_itemsets,
    num_itemsets=num_itemsets,
    metric="confidence",
    min_threshold=min_confidence
)

# Sort rules by confidence in descending order
rules_sorted = rules.sort_values(by="confidence", ascending=False)

# Save the sorted association rules to a CSV file
output_file = "association_rules_sorted.csv"
rules_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift']].to_csv(output_file, index=False)

print(f"Association rules sorted by confidence saved to {output_file}")


Association rules sorted by confidence saved to association_rules_sorted.csv


In [27]:
# Filter rules with high lift values
high_lift_rules = rules[rules['lift'] > 1.5]

# Sort high-lift rules by confidence in descending order
high_lift_rules_sorted = high_lift_rules.sort_values(by="confidence", ascending=False)

# Save the high-lift rules to a CSV file
output_file_high_lift = "high_lift_rules_sorted.csv"
high_lift_rules_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift']].to_csv(output_file_high_lift, index=False)

print(f"High-lift rules (lift > 1.5) sorted by confidence saved to {output_file_high_lift}")


High-lift rules (lift > 1.5) sorted by confidence saved to high_lift_rules_sorted.csv
