# Introduction

Here we use association_rules to conduct sequential pattern mining. This allows us to identify ordered patterns such as A->B->C->D. We will be able to see how patients transition from certain drugs to other drugs.

In [3]:
#pip install mlxtend

In [4]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


In [5]:
df = pd.read_csv('all_rx_drugs.csv')
df.head()

Unnamed: 0,patient.identifier,patient.gender,patient.birthYear,claim.fillDate,claim.medication_code,claim.type,claim.moleculeName,product_name,pharm_classes,claim.diagnosisCode,diagnosis_description
0,1000052730,F,1963,2016-05-20,74433902,RJ,Adalimumab,HUMIRA,other,696.0,PSORIATIC ARTHROPATHY
1,1000052730,F,1963,2016-05-23,74433902,RJ,Adalimumab,HUMIRA,other,696.0,PSORIATIC ARTHROPATHY
2,1000052730,F,1963,2016-06-01,74433902,RJ,Adalimumab,HUMIRA,other,696.0,PSORIATIC ARTHROPATHY
3,1000052730,F,1963,2016-06-01,74433902,RJ,Adalimumab,HUMIRA,other,696.0,PSORIATIC ARTHROPATHY
4,1000052730,F,1963,2016-06-02,74433902,PD,Adalimumab,HUMIRA,other,696.0,PSORIATIC ARTHROPATHY


In [37]:
# a = df[df['patient.identifier']==29353157]

# Data cleaning
The apriori function expects data in a one-hot encoded pandas DataFrame like this:

dataset = [['A', 'B', 'C', 'D'],
           ['B', 'E', 'A'],
           ['C', 'D']]
           
Each list (e.g. ['A', 'B', 'C', 'D']) is the drug journey of a unique patient. ABCD took place in sequential order.

In [6]:
# sort df by date
df['claim.fillDate'] = pd.to_datetime(df['claim.fillDate'])
df.sort_values(by='claim.fillDate')
df.head()

Unnamed: 0,patient.identifier,patient.gender,patient.birthYear,claim.fillDate,claim.medication_code,claim.type,claim.moleculeName,product_name,pharm_classes,claim.diagnosisCode,diagnosis_description
0,1000052730,F,1963,2016-05-20,74433902,RJ,Adalimumab,HUMIRA,other,696.0,PSORIATIC ARTHROPATHY
1,1000052730,F,1963,2016-05-23,74433902,RJ,Adalimumab,HUMIRA,other,696.0,PSORIATIC ARTHROPATHY
2,1000052730,F,1963,2016-06-01,74433902,RJ,Adalimumab,HUMIRA,other,696.0,PSORIATIC ARTHROPATHY
3,1000052730,F,1963,2016-06-01,74433902,RJ,Adalimumab,HUMIRA,other,696.0,PSORIATIC ARTHROPATHY
4,1000052730,F,1963,2016-06-02,74433902,PD,Adalimumab,HUMIRA,other,696.0,PSORIATIC ARTHROPATHY


In [34]:
#2. Groupy patient id, get the list of unique drugs for each patient
pt = pd.pivot_table(df,
                    values=['claim.moleculeName'],
                    index='patient.identifier',
                    aggfunc={'claim.moleculeName': list})
pt.reset_index(inplace=True)

# need to get the unique elements in each list while preserving order
def unique(sequence):
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))]

pt['claim.moleculeName'] = [unique(x) for x in pt['claim.moleculeName']]
pt.head()

Unnamed: 0,patient.identifier,claim.moleculeName
0,29344423,"[Fluticasone, Ciclopirox, Triamcinolone]"
1,29353157,"[Betamethasone, Clobetasol, Triamcinolone, Flu..."
2,29353986,"[Fluticasone, Calcipotriene-Betamethasone, Eta..."
3,29357021,"[Methotrexate, Adalimumab]"
4,29365473,"[Hydrocortisone, Clobetasol, Adalimumab, Betam..."


In [38]:
# keep only the drug (molecules)
df1 = pt['claim.moleculeName']
df1.head()

0             [Fluticasone, Ciclopirox, Triamcinolone]
1    [Betamethasone, Clobetasol, Triamcinolone, Flu...
2    [Fluticasone, Calcipotriene-Betamethasone, Eta...
3                           [Methotrexate, Adalimumab]
4    [Hydrocortisone, Clobetasol, Adalimumab, Betam...
Name: claim.moleculeName, dtype: object

In [39]:
# one hot encoding
te = TransactionEncoder()
oht_ary = te.fit(df1).transform(df1, sparse=True)
sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)
sparse_df

Unnamed: 0,Acitretin,Adalimumab,Alclometasone,Amcinonide,Anthralin,Antiseborrheic,Apremilast,Azathioprine,Benzocaine-Resorcinol,Betamethasone,...,Sertaconazole,Soap,Starch,Sulfacetamide,Tacrolimus,Tazarotene,Tildrakizumab-Asmn,Triamcinolone,Ustekinumab,Witch
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71572,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Association rules

In [47]:
frequent_itemsets = apriori(sparse_df, min_support=0.01, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending =False)
frequent_itemsets.head(20)

Unnamed: 0,support,itemsets,length
24,0.553036,(Methotrexate),1
32,0.417819,(Triamcinolone),1
10,0.331936,(Clobetasol),1
17,0.331559,(Fluticasone),1
0,0.227345,(Adalimumab),1
196,0.224607,"(Triamcinolone, Methotrexate)",2
165,0.181602,"(Methotrexate, Fluticasone)",2
124,0.181518,"(Clobetasol, Methotrexate)",2
23,0.174351,(Ketoconazole),1
129,0.168539,"(Clobetasol, Triamcinolone)",2


In [48]:
rules = association_rules(frequent_itemsets, metric ="lift", min_threshold = 1)
rules = rules.sort_values(['support', 'confidence'], ascending =[False, False])
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Clobetasol),(Triamcinolone),0.331936,0.417819,0.168539,0.507745,1.215226,0.02985,1.182681
1,(Triamcinolone),(Clobetasol),0.417819,0.331936,0.168539,0.403377,1.215226,0.02985,1.119743
3,(Fluticasone),(Triamcinolone),0.331559,0.417819,0.154036,0.464582,1.111921,0.015505,1.087339
2,(Triamcinolone),(Fluticasone),0.417819,0.331559,0.154036,0.368667,1.111921,0.015505,1.058778
5,(Adalimumab),(Methotrexate),0.227345,0.553036,0.136782,0.601647,1.087898,0.011051,1.12203
