In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# ===============================
# STEP 2: Load the dataset
# ===============================
# Location of the input file; edit this value to point at your own copy.
DATA_PATH = 'C:/Users/91702/Downloads/archive.zip'

# Read the file into a DataFrame. If the read raises, the confirmation
# message below is never printed.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Confirmation message followed by a preview of the first five rows.
print("Dataset Loaded Successfully ✅")
print(df.head(n=5))

Dataset Loaded Successfully ✅
   Patient_ID  Age  Gender      Condition                 Procedure   Cost  \
0           1   45  Female  Heart Disease               Angioplasty  15000   
1           2   60    Male       Diabetes           Insulin Therapy   2000   
2           3   32  Female  Fractured Arm          X-Ray and Splint    500   
3           4   75    Male         Stroke    CT Scan and Medication  10000   
4           5   50  Female         Cancer  Surgery and Chemotherapy  25000   

   Length_of_Stay Readmission    Outcome  Satisfaction  
0               5          No  Recovered             4  
1               3         Yes     Stable             3  
2               1          No  Recovered             5  
3               7         Yes     Stable             2  
4              10          No  Recovered             4  


In [3]:
# 1️⃣ Combine text data
# Build one free-text document per row from the three descriptive columns.
# Missing values are replaced with empty strings *before* the cast to str:
# a bare `.astype(str)` turns a missing value into the literal text "nan",
# which the vectoriser below would then count as a real vocabulary term.
text_columns = ['Condition', 'Procedure', 'Outcome']
text_parts = df[text_columns].fillna('').astype(str)
df['combined_text'] = (
    text_parts['Condition'] + " " + text_parts['Procedure'] + " " + text_parts['Outcome']
)

# 2️⃣ TF-IDF
# Vectorise the combined text with English stop words removed and the
# vocabulary capped at 500 terms; X_tfidf is the fitted document-term matrix.
tfidf = TfidfVectorizer(stop_words='english', max_features=500)
X_tfidf = tfidf.fit_transform(df['combined_text'])

In [4]:
# 3️⃣ Apply LSA (Dimensionality Reduction)
# Truncated SVD over the TF-IDF matrix, keeping 5 components.
# The random seed is fixed at 42.
lsa_settings = {'n_components': 5, 'random_state': 42}
lsa = TruncatedSVD(**lsa_settings)

# Fit and project in a single call; X_lsa holds the reduced features that the
# clustering step consumes.
X_lsa = lsa.fit_transform(X_tfidf)

In [5]:
# 4️⃣ Clustering using LSA features
# Partition the rows into three groups based on their LSA coordinates and
# store each row's group label in a new 'Cluster' column.
N_CLUSTERS = 3
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
cluster_labels = kmeans.fit_predict(X_lsa)
df['Cluster'] = cluster_labels

In [6]:
# 5️⃣ Apriori per cluster
# For every cluster, each row's (Condition, Procedure, Outcome) triple is one
# transaction. Frequent itemsets are mined with Apriori (min support 0.1) and
# association rules with lift >= 1.0 are derived from them. The rules of all
# clusters are gathered in `final_rules`, tagged with their cluster id.
rule_columns = ['Cluster', 'antecedents', 'consequents', 'support', 'confidence', 'lift']
all_rules = []
for cluster_id in df['Cluster'].unique():
    cluster_data = df[df['Cluster'] == cluster_id]
    basket = cluster_data[['Condition', 'Procedure', 'Outcome']].astype(str)
    transactions = basket.values.tolist()

    # Encode the transactions as a table with one column per item
    # (column names come from the fitted encoder).
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    cluster_df = pd.DataFrame(te_ary, columns=te.columns_)

    frequent_itemsets = apriori(cluster_df, min_support=0.1, use_colnames=True)
    if frequent_itemsets.empty:
        # association_rules raises ValueError on an empty itemset frame, which
        # would abort the loop for every remaining cluster. A cluster in which
        # nothing reaches the support threshold simply contributes no rules.
        continue

    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
    if rules.empty:
        # Nothing to report for this cluster; skipping also keeps empty frames
        # out of the concatenation below.
        continue
    rules['Cluster'] = cluster_id
    all_rules.append(rules)

if all_rules:
    # ignore_index gives every rule a unique row label; without it each
    # cluster's rules restart at 0 and label-based lookups become ambiguous.
    final_rules = pd.concat(all_rules, ignore_index=True)
else:
    # pd.concat raises on an empty list, so fall back to an empty frame with
    # the expected columns when no cluster produced any rule.
    final_rules = pd.DataFrame(columns=rule_columns)
print(final_rules[rule_columns])


    Cluster                                antecedents  \
0         1                        (Allergic Reaction)   
1         1                    (Epinephrine Injection)   
2         1                                (Recovered)   
3         1                        (Allergic Reaction)   
4         1                              (Angioplasty)   
..      ...                                        ...   
43        2        (Stable, Medication and Counseling)   
44        2  (Hypertension, Medication and Counseling)   
45        2                                   (Stable)   
46        2                             (Hypertension)   
47        2                (Medication and Counseling)   

                                  consequents   support  confidence      lift  
0                     (Epinephrine Injection)  0.125954    1.000000  7.939394  
1                         (Allergic Reaction)  0.125954    1.000000  7.939394  
2                         (Allergic Reaction)  0.125954    0.12

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
