In [62]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from apyori import apriori
from collections import defaultdict
import subprocess
import re


In [63]:
#Case Study 1

df = pd.read_csv('D1.csv')
df

Unnamed: 0,patient_id,global_num,date,location,latitude,longitude
0,1000000001,2.0,22/01/2020,Gyeonggi-do_Gimpo-si,37.615246,126.715632
1,1000000001,2.0,24/01/2020,Seoul_Jung-gu,37.567241,127.005659
2,1000000002,5.0,26/01/2020,Seoul_Seongdong-gu,37.563992,127.029534
3,1000000002,5.0,27/01/2020,Seoul_Dongdaemun-gu,37.566262,127.065815
4,1000000002,5.0,28/01/2020,Seoul_Gangnam-gu,37.523674,127.046543
...,...,...,...,...,...,...
1504,6100000083,,6/03/2020,Daegu_Buk-gu,35.891794,128.588890
1505,6100000085,,16/03/2020,Gyeongsangnam-do_Changwon-si,35.227956,128.685595
1506,6100000086,,14/03/2020,Daegu_Dalseong-gun,35.857185,128.466686
1507,6100000090,,24/03/2020,Incheon_Jung-gu,37.460191,126.440696


In [64]:
# info and the first 10 transactions
print(df.info())
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509 entries, 0 to 1508
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   patient_id  1509 non-null   int64  
 1   global_num  959 non-null    float64
 2   date        1509 non-null   object 
 3   location    1509 non-null   object 
 4   latitude    1509 non-null   float64
 5   longitude   1509 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 70.9+ KB
None


Unnamed: 0,patient_id,global_num,date,location,latitude,longitude
0,1000000001,2.0,22/01/2020,Gyeonggi-do_Gimpo-si,37.615246,126.715632
1,1000000001,2.0,24/01/2020,Seoul_Jung-gu,37.567241,127.005659
2,1000000002,5.0,26/01/2020,Seoul_Seongdong-gu,37.563992,127.029534
3,1000000002,5.0,27/01/2020,Seoul_Dongdaemun-gu,37.566262,127.065815
4,1000000002,5.0,28/01/2020,Seoul_Gangnam-gu,37.523674,127.046543
5,1000000004,7.0,30/01/2020,Seoul_Jungnang-gu,37.612772,127.098167
6,1000000005,9.0,31/01/2020,Seoul_Jungnang-gu,37.612772,127.098167
7,1000000006,10.0,30/01/2020,Gyeonggi-do_Goyang-si,37.641141,126.791968
8,1000000007,11.0,30/01/2020,Gyeonggi-do_Goyang-si,37.641141,126.791968
9,1000000008,13.0,31/01/2020,Seoul_Jung-gu,37.567241,127.005659


In [65]:
df['patient_id'].describe()

count    1.509000e+03
mean     2.198445e+09
std      1.945771e+09
min      1.000000e+09
25%      1.000000e+09
50%      1.100000e+09
75%      3.009000e+09
max      6.100000e+09
Name: patient_id, dtype: float64

In [66]:
#151 locations & Top location is Incheon_Jung-gu
df['location'].describe()

count                1509
unique                151
top       Incheon_Jung-gu
freq                  133
Name: location, dtype: object

In [67]:
#1. Pre-processing - datatype change
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509 entries, 0 to 1508
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   patient_id  1509 non-null   int64         
 1   global_num  959 non-null    float64       
 2   date        1509 non-null   datetime64[ns]
 3   location    1509 non-null   object        
 4   latitude    1509 non-null   float64       
 5   longitude   1509 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 70.9+ KB
None


In [68]:
#1. Pre-processing - get sequential rules by group by patient_id
transactions = df.groupby(['patient_id'])['location'].apply(list)
print(transactions.head(5))

patient_id
1000000001                [Gyeonggi-do_Gimpo-si, Seoul_Jung-gu]
1000000002    [Seoul_Seongdong-gu, Seoul_Dongdaemun-gu, Seou...
1000000004                                  [Seoul_Jungnang-gu]
1000000005                                  [Seoul_Jungnang-gu]
1000000006                              [Gyeonggi-do_Goyang-si]
Name: location, dtype: object


In [69]:
#2.a. What ‘min_support’ and `min_confidence’ thresholds were set for this mining exercise? 
#     Rationalize why these values were chosen.

transaction_list = list(transactions)
results = list(apriori(transaction_list, min_support=0.01, min_confidence=0.1))

# print first 5 rules
print(results[:5])

[RelationRecord(items=frozenset({'Incheon_Jung-gu'}), support=0.14927048260381592, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Incheon_Jung-gu'}), confidence=0.14927048260381592, lift=1.0)]), RelationRecord(items=frozenset({'Daegu_Buk-gu', 'Daegu_Jung-gu'}), support=0.01122334455667789, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Daegu_Buk-gu'}), items_add=frozenset({'Daegu_Jung-gu'}), confidence=0.2941176470588235, lift=5.34813925570228), OrderedStatistic(items_base=frozenset({'Daegu_Jung-gu'}), items_add=frozenset({'Daegu_Buk-gu'}), confidence=0.2040816326530612, lift=5.34813925570228)]), RelationRecord(items=frozenset({'Seoul_Dongjak-gu', 'Incheon_Jung-gu'}), support=0.010101010101010102, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Seoul_Dongjak-gu'}), items_add=frozenset({'Incheon_Jung-gu'}), confidence=0.1139240506329114, lift=0.7632054820595795)]), RelationRecord(items=frozenset({'Seoul_Gangnam-gu', 'Incheon_J

In [70]:
def convert_apriori_results_to_pandas_df(results):
    rules = []
    
    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = left side of rules, items_add = right side
            # support, confidence and lift for respective rules
            rules.append([','.join(rule.items_base), ','.join(rule.items_add),
                         rule_set.support, rule.confidence, rule.lift]) 
    
    # typecast it to pandas df
    return pd.DataFrame(rules, columns=['Left_side', 'Right_side', 'Support', 
                                        'Confidence', 'Lift']) 

result_df = convert_apriori_results_to_pandas_df(results)

print(result_df.head(6))

          Left_side        Right_side   Support  Confidence      Lift
0                     Incheon_Jung-gu  0.149270    0.149270  1.000000
1      Daegu_Buk-gu     Daegu_Jung-gu  0.011223    0.294118  5.348139
2     Daegu_Jung-gu      Daegu_Buk-gu  0.011223    0.204082  5.348139
3  Seoul_Dongjak-gu   Incheon_Jung-gu  0.010101    0.113924  0.763205
4   Incheon_Jung-gu  Seoul_Gangnam-gu  0.028058    0.187970  3.349624
5  Seoul_Gangnam-gu   Incheon_Jung-gu  0.028058    0.500000  3.349624


In [71]:
# sort all acquired rules descending by lift
result_df = result_df.sort_values(by='Lift', ascending=False)
print(result_df.head(5))

          Left_side         Right_side   Support  Confidence      Lift
1      Daegu_Buk-gu      Daegu_Jung-gu  0.011223    0.294118  5.348139
2     Daegu_Jung-gu       Daegu_Buk-gu  0.011223    0.204082  5.348139
4   Incheon_Jung-gu   Seoul_Gangnam-gu  0.028058    0.187970  3.349624
5  Seoul_Gangnam-gu    Incheon_Jung-gu  0.028058    0.500000  3.349624
6   Incheon_Jung-gu  Seoul_Jungnang-gu  0.019080    0.127820  1.441610


In [72]:
#Get min_support & min_confidence
min_support = result_df['Support'].min()*100
min_confidence = result_df['Confidence'].mean()*100 

print('The min_support is ' + '{:.4f}%'.format(min_support)) 
print('The min_confidence is ' + '{:.4f}%'.format(min_confidence))    

The min_support is 1.0101%
The min_confidence is 22.1786%


In [73]:
# apply the Apriori algorithm on the dataset
results = list(apriori(transaction_list, min_support=0.001, min_confidence=0.02))

# print first 5 rules
print(results[:5])

[RelationRecord(items=frozenset({'Busan_Yeonje-gu'}), support=0.05723905723905724, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Busan_Yeonje-gu'}), confidence=0.05723905723905724, lift=1.0)]), RelationRecord(items=frozenset({'Chungcheongnam-do_Cheonan-si'}), support=0.030303030303030304, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Chungcheongnam-do_Cheonan-si'}), confidence=0.030303030303030304, lift=1.0)]), RelationRecord(items=frozenset({'Daegu_Buk-gu'}), support=0.038159371492704826, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Daegu_Buk-gu'}), confidence=0.038159371492704826, lift=1.0)]), RelationRecord(items=frozenset({'Daegu_Dong-gu'}), support=0.02132435465768799, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Daegu_Dong-gu'}), confidence=0.02132435465768799, lift=1.0)]), RelationRecord(items=frozenset({'Daegu_Jung-gu'}), support=

In [74]:
# Change output format
def convert_apriori_results_to_pandas_df(results):
    rules = []
    
    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = left side of rules, items_add = right side
            # support, confidence and lift for respective rules
            rules.append([','.join(rule.items_base), ','.join(rule.items_add),
                         rule_set.support, rule.confidence, rule.lift]) 
    
    # typecast it to pandas df
    return pd.DataFrame(rules, columns=['Left_side', 'Right_side', 'Support', 
                                        'Confidence', 'Lift']) 

result_df = convert_apriori_results_to_pandas_df(results)

print("Number of association rules acquired", len(result_df))
print("")
print(result_df.head(5))

Number of association rules acquired 5381

  Left_side                    Right_side   Support  Confidence  Lift
0                         Busan_Yeonje-gu  0.057239    0.057239   1.0
1            Chungcheongnam-do_Cheonan-si  0.030303    0.030303   1.0
2                            Daegu_Buk-gu  0.038159    0.038159   1.0
3                           Daegu_Dong-gu  0.021324    0.021324   1.0
4                           Daegu_Jung-gu  0.054994    0.054994   1.0


In [75]:
# sort all acquired rules descending by lift and output top-5 rules
result_df = result_df.sort_values(by='Lift', ascending=False)
result_df.head(5)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
3910,"Daegu_Dong-gu,Gyeongsangnam-do_Uiryeong-gun","Daegu_Jung-gu,Gyeongsangbuk-do_Goryeong-gun",0.001122,1.0,891.0
3462,"Busan_Nam-gu,Chungcheongbuk-do_Goesan-gun","Busan_Yeongdo-gu,Chungcheongbuk-do_Cheongju-si",0.001122,1.0,891.0
4838,"Busan_Saha-gu,Gyeonggi-do_Hanam-si","Seoul_Gangseo-gu,Busan_Seo-gu,Incheon_Seo-gu",0.001122,1.0,891.0
3718,"Daegu_Jung-gu,Gwangju_Seo-gu","Daegu_Buk-gu,Gwangju_Dong-gu",0.001122,1.0,891.0
3717,"Daegu_Jung-gu,Gwangju_Dong-gu","Daegu_Buk-gu,Gwangju_Seo-gu",0.001122,1.0,891.0


In [78]:
#3.  List four most interesting routes taken by individuals who have tested positive for COVID19 and have travelled from Buk-gu City in Busan Province.
route_from_Busan_Buk_gu = result_df.loc[result_df.Left_side == 'Busan_Buk-gu']
route_from_Busan_Buk_gu[:4]

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
3085,Busan_Buk-gu,"Seoul_Gangseo-gu,Busan_Yeonje-gu,Busan_Gangseo-gu",0.001122,0.2,178.2
3059,Busan_Buk-gu,"Busan_Yeonje-gu,Busan_Gangseo-gu,Gwangju_Buk-gu",0.001122,0.2,178.2
3098,Busan_Buk-gu,"Seoul_Gangseo-gu,Busan_Gangseo-gu,Gwangju_Buk-gu",0.001122,0.2,178.2
3112,Busan_Buk-gu,"Gyeongsangnam-do_Uiryeong-gun,Busan_Gangseo-gu...",0.001122,0.2,178.2


In [79]:
#4. Can you perform sequence analysis on this dataset? If yes, present your results. If not, rationalize why.
# sorting the date of visit
df = df.sort_values(['patient_id','date'])
df['trip'] = df.groupby('patient_id').cumcount()+1
df.head(10)

Unnamed: 0,patient_id,global_num,date,location,latitude,longitude,trip
0,1000000001,2.0,2020-01-22,Gyeonggi-do_Gimpo-si,37.615246,126.715632,1
1,1000000001,2.0,2020-01-24,Seoul_Jung-gu,37.567241,127.005659,2
2,1000000002,5.0,2020-01-26,Seoul_Seongdong-gu,37.563992,127.029534,1
3,1000000002,5.0,2020-01-27,Seoul_Dongdaemun-gu,37.566262,127.065815,2
4,1000000002,5.0,2020-01-28,Seoul_Gangnam-gu,37.523674,127.046543,3
5,1000000004,7.0,2020-01-30,Seoul_Jungnang-gu,37.612772,127.098167,1
6,1000000005,9.0,2020-01-31,Seoul_Jungnang-gu,37.612772,127.098167,1
7,1000000006,10.0,2020-01-30,Gyeonggi-do_Goyang-si,37.641141,126.791968,1
8,1000000007,11.0,2020-01-30,Gyeonggi-do_Goyang-si,37.641141,126.791968,1
9,1000000008,13.0,2020-01-31,Seoul_Jung-gu,37.567241,127.005659,1


In [81]:
trip = df.groupby(['patient_id'])['location'].apply(list)
sequences = trip.values.tolist()
print(sequences[:5])

[['Gyeonggi-do_Gimpo-si', 'Seoul_Jung-gu'], ['Seoul_Seongdong-gu', 'Seoul_Dongdaemun-gu', 'Seoul_Gangnam-gu'], ['Seoul_Jungnang-gu'], ['Seoul_Jungnang-gu'], ['Gyeonggi-do_Goyang-si']]


In [101]:
''' Uses SPMF to find association rules in supplied transactions '''
def get_association_rules(sequences, min_sup, min_conf):
    # step 1: create required input for SPMF
    
    # prepare a dict to uniquely assign each item in the transactions to an int ID
    item_dict = defaultdict(int)
    output_dict = defaultdict(str)
    item_id = 1
    
    # write your sequences in SPMF format
    with open('seq_rule_input.txt', 'w+') as f:
        for sequence in sequences:
            z = []
            for itemset in sequence:
                # if there are multiple items in one itemset
                if isinstance(itemset, list):
                    for item in itemset:
                        if item not in item_dict:
                            item_dict[item] = item_id
                            item_id += 1

                        z.append(item_dict[item])
                else:
                    if itemset not in item_dict:
                        item_dict[itemset] = item_id
                        output_dict[str(item_id)] = itemset
                        item_id += 1
                    z.append(item_dict[itemset])
                    
                # end of itemset
                z.append(-1)
            
            # end of a sequence
            z.append(-2)
            f.write(' '.join([str(x) for x in z]))
            f.write('\n')
    
    # run SPMF with supplied parameters
    supp_param = '{}%'.format(int(min_sup * 100))
    conf_param = '{}%'.format(int(min_conf * 100))
    subprocess.call(['java', '-jar', 'spmf.jar', 'run', 'RuleGrowth', 
                     'seq_rule_input.txt', 'seq_rule_output.txt', 
                     supp_param, conf_param], shell=True)
    
    # read back the output rules
    outputs = open('seq_rule_output.txt', 'r').read().strip().split('\n')
    output_rules = []
    for rule in outputs:
        left, right, sup, conf = re.search(pattern=r'([0-9\,]+) ==> ([0-9\,]+) #SUP: ([0-9]+) #CONF: ([0-9\.]+)', string=rule).groups()
        sup = int(sup) / len(sequences)
        conf = float(conf)
        output_rules.append([[output_dict[x] for x in left.split(',')], [output_dict[x] for x in right.split(',')], sup, conf])
    
    # return pandas DataFrame
    return pd.DataFrame(output_rules, columns = ['Left_rule', 'Right_rule', 'Support', 'Confidence'])

In [102]:
# Using min_supp of 0.1 and min_conf of 0.1.
get_association_rules = get_association_rules(sequences, 0.1, 0.1)
print("Number of sequential rules acquired " + str(len(get_association_rules)))
get_association_rules

Number of sequential rules acquired 17


Unnamed: 0,Left_rule,Right_rule,Support,Confidence
0,[Gyeonggi-do_Gimpo-si],[Seoul_Jung-gu],4.858586,0.63151
1,[Gyeonggi-do_Gimpo-si],"[Seoul_Jung-gu, Seoul_Seongdong-gu]",2.228956,0.289716
2,[Gyeonggi-do_Gimpo-si],"[Seoul_Jung-gu, Seoul_Seongbuk-gu]",1.278339,0.166156
3,[Gyeonggi-do_Gimpo-si],"[Seoul_Jung-gu, Gyeonggi-do_Uijeongbu-si]",1.0,0.129978
4,[Gyeonggi-do_Gimpo-si],[Seoul_Seongdong-gu],3.245791,0.421882
5,"[Gyeonggi-do_Gimpo-si, Seoul_Jung-gu]",[Seoul_Seongdong-gu],2.228956,0.458766
6,[Gyeonggi-do_Gimpo-si],[Seoul_Dongdaemun-gu],1.397306,0.181619
7,[Gyeonggi-do_Gimpo-si],[Gyeonggi-do_Goyang-si],1.013468,0.131729
8,[Gyeonggi-do_Gimpo-si],[Seoul_Seongbuk-gu],1.882155,0.244639
9,"[Gyeonggi-do_Gimpo-si, Seoul_Jung-gu]",[Seoul_Seongbuk-gu],1.278339,0.263109
