### 5.9 Activity - Assignment 2

#### Imports and viewing data

In [1]:
#imports
import pandas as pd
from apyori import apriori
from collections import defaultdict
import subprocess
import re

In [2]:
#load dataset
route_df = pd.read_csv('PatientRoute-1.csv')

#info and the first 10 transactions
print(route_df.info())
print(route_df.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6714 entries, 0 to 6713
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   patient_id  6714 non-null   int64  
 1   global_num  3571 non-null   float64
 2   date        6714 non-null   object 
 3   location    6714 non-null   object 
 4   latitude    6714 non-null   float64
 5   longitude   6714 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 314.8+ KB
None
   patient_id  global_num        date              location   latitude  \
0  1000000001         2.0  22/01/2020  Gyeonggi-do_Gimpo-si  37.615246   
1  1000000001         2.0  24/01/2020         Seoul_Jung-gu  37.567241   
2  1000000002         5.0  25/01/2020     Seoul_Seongbuk-gu  37.592560   
3  1000000002         5.0  26/01/2020     Seoul_Seongbuk-gu  37.591810   
4  1000000002         5.0  26/01/2020    Seoul_Seongdong-gu  37.563992   
5  1000000002         5.0  26/01/2020     Seoul_Seong

#### Justifying variables
Location was chosen over using the latitude and longitude as it is more specific. Due to latitude and longitude being recorded to six significant figures, the locations are too specific to find association rules for the routes. Patient ID was chosen over the global number ID as not all patients had a global_number which increased the number of routes that could be examined.

In [3]:
#group by patient_id, list all locations
transactions = route_df.groupby(['patient_id'])['location'].apply(list)

print(transactions.head(5))

patient_id
1000000001                [Gyeonggi-do_Gimpo-si, Seoul_Jung-gu]
1000000002    [Seoul_Seongbuk-gu, Seoul_Seongbuk-gu, Seoul_S...
1000000003                   [Seoul_Jongno-gu, Seoul_Jongno-gu]
1000000004                                  [Seoul_Jungnang-gu]
1000000005                                  [Seoul_Jungnang-gu]
Name: location, dtype: object


### Association Mining

#### Min_support justification
As discussed by https://www.sciencedirect.com/topics/computer-science/minimum-confidence (chapter 1.1.3) the performance of the Apriori algorithm is improved by reducing the size of candidate sets. A minimum support which was low enough to generate at least 10 common routes from Daegu_Buk-gu was required. Additionally, min_confidence was also set as it prevents too many rules from being generated that would be practically useful. https://data-mining.philippe-fournier-viger.com/how-to-auto-adjust-the-minimum-support-threshold-according-to-the-data-size/ discussed how often the determination of the minimum is completed by trial and error. As such, minimum support was set to 0.002 and the minimum confidence was set to 0.5 to find a reasonable number of rules > 10.

In [4]:
# type cast the tranasctions from pandas into normal list format and run apriori
transaction_list = list(transactions)
results = list(apriori(transaction_list, min_support = 0.002, min_confidence = 0.5))

#print first 5 rules
print(results[:5])

[RelationRecord(items=frozenset({'Busan_Yeonje-gu', 'Busan_Buk-gu'}), support=0.005780346820809248, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Busan_Buk-gu'}), items_add=frozenset({'Busan_Yeonje-gu'}), confidence=0.7, lift=11.612328767123287)]), RelationRecord(items=frozenset({'Busan_Yeonje-gu', 'Busan_Busanjin-gu'}), support=0.010734929810074319, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Busan_Busanjin-gu'}), items_add=frozenset({'Busan_Yeonje-gu'}), confidence=0.6190476190476192, lift=10.269406392694066)]), RelationRecord(items=frozenset({'Busan_Yeonje-gu', 'Busan_Dong-gu'}), support=0.008257638315441783, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Busan_Dong-gu'}), items_add=frozenset({'Busan_Yeonje-gu'}), confidence=0.7142857142857143, lift=11.849315068493151)]), RelationRecord(items=frozenset({'Busan_Geumjeong-gu', 'Busan_Dongnae-gu'}), support=0.005780346820809248, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Busan

In [5]:
# to make the format readable to pandas
def convert_apriori_results_to_pandas_df(results):
    rules = []
    
    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = left side of rules, item_add = right side
            # support, confidence, and lift for respective rules
            rules.append([','.join(rule.items_base), ','.join(rule.items_add), rule_set.support, rule.confidence, rule.lift])
            
    # type cast it to pandas df
    return pd.DataFrame(rules, columns=['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift'])

result_df = convert_apriori_results_to_pandas_df(results)

print(result_df.head(20))

print(len(result_df))

                         Left_side                     Right_side   Support  \
0                     Busan_Buk-gu                Busan_Yeonje-gu  0.005780   
1                Busan_Busanjin-gu                Busan_Yeonje-gu  0.010735   
2                    Busan_Dong-gu                Busan_Yeonje-gu  0.008258   
3               Busan_Geumjeong-gu               Busan_Dongnae-gu  0.005780   
4                 Busan_Dongnae-gu                Busan_Yeonje-gu  0.017341   
5                 Busan_Gangseo-gu                Busan_Yeonje-gu  0.004955   
6               Busan_Geumjeong-gu                Busan_Yeonje-gu  0.004955   
7                Busan_Haeundae-gu                Busan_Yeonje-gu  0.014038   
8                    Busan_Saha-gu                   Busan_Seo-gu  0.004129   
9                    Busan_Saha-gu                Busan_Yeonje-gu  0.004955   
10                  Gwangju_Buk-gu                Busan_Yeonje-gu  0.003303   
11  Chungcheongbuk-do_Jincheon-gun  Chungcheongbuk-d

#### Determining the top 5 frequently occuring rules

In [6]:
#Support is a measure of how often locations appear together
result_df = result_df.sort_values(by = 'Support', ascending = False)
result_df.head(5)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
12,Chungcheongnam-do_Asan-si,Chungcheongnam-do_Cheonan-si,0.017341,0.75,9.560526
4,Busan_Dongnae-gu,Busan_Yeonje-gu,0.017341,0.5,8.294521
7,Busan_Haeundae-gu,Busan_Yeonje-gu,0.014038,0.53125,8.812928
1,Busan_Busanjin-gu,Busan_Yeonje-gu,0.010735,0.619048,10.269406
2,Busan_Dong-gu,Busan_Yeonje-gu,0.008258,0.714286,11.849315


From the top five frequently occuring rules, Busan_Yeonje-gu appears as the destination. This suggests that this location is a frequented location and could be a hotspot for COVID-19. It is also of note that the lift is > 1 and so there is a positive correlation and that the association is not that of random chance.

#### Identify common routes that positive patients from Daegu Buk-gu travelled

In [7]:
#Lift gives the predictive power of the rules , therefore sorted by lift to show it is not random, and is a common route
result_df = result_df.sort_values(by = 'Lift', ascending = False)

result_df = result_df[result_df["Left_side"].str.contains("Daegu_Buk-gu") | result_df["Right_side"].str.contains("Daegu_Buk-gu")]

result_df.head(10)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
167,"Daegu_Buk-gu,Seoul_Jung-gu","Daegu_Jung-gu,Daegu_Nam-gu,Daegu_Seo-gu",0.002477,1.0,403.666667
142,"Daegu_Buk-gu,Daegu_Seo-gu","Daegu_Nam-gu,Seoul_Jung-gu",0.002477,1.0,403.666667
145,"Daegu_Nam-gu,Seoul_Jung-gu","Daegu_Buk-gu,Daegu_Seo-gu",0.002477,1.0,403.666667
173,"Daegu_Jung-gu,Daegu_Buk-gu,Daegu_Seo-gu","Daegu_Nam-gu,Seoul_Jung-gu",0.002477,1.0,403.666667
166,"Daegu_Buk-gu,Daegu_Seo-gu","Daegu_Jung-gu,Daegu_Nam-gu,Seoul_Jung-gu",0.002477,1.0,403.666667
178,"Daegu_Jung-gu,Daegu_Nam-gu,Daegu_Seo-gu","Daegu_Buk-gu,Seoul_Jung-gu",0.002477,1.0,403.666667
170,"Daegu_Nam-gu,Seoul_Jung-gu","Daegu_Jung-gu,Daegu_Buk-gu,Daegu_Seo-gu",0.002477,1.0,403.666667
179,"Daegu_Jung-gu,Daegu_Nam-gu,Seoul_Jung-gu","Daegu_Buk-gu,Daegu_Seo-gu",0.002477,1.0,403.666667
169,"Daegu_Nam-gu,Daegu_Seo-gu","Daegu_Jung-gu,Daegu_Buk-gu,Seoul_Jung-gu",0.002477,0.75,302.75
174,"Daegu_Jung-gu,Daegu_Buk-gu,Seoul_Jung-gu","Daegu_Nam-gu,Daegu_Seo-gu",0.002477,1.0,302.75


The results are sorted by lift to determine the most common routes as lift provides the predicitive power of a rule compared to just random chance. As the lift is very high for these top 10 it shows strong positive correlation therefore likely to be a common route.

In [8]:
#this code was just added to determine the amount of rules generated for min_support question
count_daegu_l = result_df["Left_side"].str.contains("Daegu_Buk-gu").sum()
print(count_daegu_l)

count_daegu_r = result_df["Right_side"].str.contains("Daegu_Buk-gu").sum()
print(count_daegu_r)

44
26


#### Sequence Analysis
Sequence analysis utilises time stamps. As this data has dates it is possible to perform sequence analysis as below

In [9]:
#produce sequences in order
sequences = transactions.values.tolist()
print(sequences)

[['Gyeonggi-do_Gimpo-si', 'Seoul_Jung-gu'], ['Seoul_Seongbuk-gu', 'Seoul_Seongbuk-gu', 'Seoul_Seongdong-gu', 'Seoul_Seongbuk-gu', 'Seoul_Seongbuk-gu', 'Seoul_Seongbuk-gu', 'Seoul_Dongdaemun-gu', 'Seoul_Seongbuk-gu', 'Seoul_Jungnang-gu', 'Seoul_Jungnang-gu', 'Seoul_Gangnam-gu', 'Seoul_Jungnang-gu', 'Seoul_Jungnang-gu'], ['Seoul_Jongno-gu', 'Seoul_Jongno-gu'], ['Seoul_Jungnang-gu'], ['Seoul_Jungnang-gu'], ['Gyeonggi-do_Goyang-si'], ['Gyeonggi-do_Goyang-si'], ['Seoul_Jung-gu'], ['Gyeonggi-do_Seongnam-si', 'Gyeonggi-do_Seongnam-si', 'Seoul_Songpa-gu', 'Seoul_Songpa-gu', 'Seoul_Songpa-gu', 'Incheon_Yeonsu-gu', 'Incheon_Yeonsu-gu', 'Gyeonggi-do_Seongnam-si', 'Gyeonggi-do_Seongnam-si', 'Seoul_Jungnang-gu'], ['Seoul_Jongno-gu', 'Seoul_Jongno-gu', 'Seoul_Seongbuk-gu', 'Seoul_Jongno-gu'], ['Seoul_Seodaemun-gu', 'Seoul_Jung-gu', 'Seoul_Seodaemun-gu', 'Seoul_Mapo-gu', 'Seoul_Seodaemun-gu'], ['Seoul_Jongno-gu', 'Seoul_Seongbuk-gu', 'Seoul_Jongno-gu', 'Gyeonggi-do_Uijeongbu-si', 'Seoul_Jongno-gu', '

In [10]:
''' Uses SPMF to find association rules in supplied transactions '''
def get_association_rules(sequences, min_sup, min_conf):
    # step 1: create required input for SPMF

    # prepare a dict to uniquely assign each item in the transactions to an int ID
    item_dict = defaultdict(int)
    output_dict = defaultdict(str)
    item_id = 1

    # write your sequences in SPMF format
    with open('seq_rule_input.txt', 'w+') as f:
        for sequence in sequences:
            z = []
            for itemset in sequence:
                # if there are multiple items in one itemset
                if isinstance(itemset, list):
                    for item in itemset:
                        if item not in item_dict:
                            item_dict[item] = item_id
                            item_id += 1

                        z.append(item_dict[item])
                else:
                    if itemset not in item_dict:
                        item_dict[itemset] = item_id
                        output_dict[str(item_id)] = itemset
                        item_id += 1
                    z.append(item_dict[itemset])

                # end of itemset
                z.append(-1)

            # end of a sequence
            z.append(-2)
            f.write(' '.join([str(x) for x in z]))
            f.write('\n')

    # run SPMF with supplied parameters
    supp_param = '{}%'.format(int(min_sup * 100))
    conf_param = '{}%'.format(int(min_conf * 100))
    subprocess.call(['java', '-jar', 'spmf.jar', 'run', 'RuleGrowth', 'seq_rule_input.txt', 'seq_rule_output.txt', supp_param, conf_param], shell=True)

    # read back the output rules
    outputs = open('seq_rule_output.txt', 'r').read().strip().split('\n')
    output_rules = []
    for rule in outputs:
        left, right, sup, conf = re.search(pattern=r'([0-9\,]+) ==> ([0-9\,]+) #SUP: ([0-9]+) #CONF: ([0-9\.]+)', string=rule).groups()
        sup = int(sup) / len(sequences)
        conf = float(conf)
        output_rules.append([[output_dict[x] for x in left.split(',')], [output_dict[x] for x in right.split(',')], sup, conf])

    # return pandas DataFrame
    return pd.DataFrame(output_rules, columns = ['Left_rule', 'Right_rule', 'Support', 'Confidence'])

In [11]:
#sort by confidence as confidence determines how dependent each item is on another.
get_association_rules(sequences, 0.01, 0.01).sort_values(by='Confidence', ascending=False)

Unnamed: 0,Left_rule,Right_rule,Support,Confidence
11,[Chungcheongnam-do_Asan-si],[Chungcheongnam-do_Cheonan-si],0.016515,0.714286
10,[Busan_Busanjin-gu],[Busan_Yeonje-gu],0.010735,0.619048
9,[Busan_Haeundae-gu],[Busan_Yeonje-gu],0.012386,0.46875
8,[Busan_Dongnae-gu],[Busan_Yeonje-gu],0.014864,0.428571
0,[Seoul_Songpa-gu],[Seoul_Jungnang-gu],0.01569,0.345455
2,[Incheon_Jung-gu],[Seoul_Gangnam-gu],0.029727,0.24
7,[Daegu_Jung-gu],[Daegu_Buk-gu],0.014038,0.226667
6,[Seoul_Guro-gu],[Seoul_Dongjak-gu],0.013212,0.197531
3,[Incheon_Jung-gu],[Seoul_Songpa-gu],0.020644,0.166667
1,[Incheon_Jung-gu],[Seoul_Jungnang-gu],0.017341,0.14


#### Study outcomes
The study outcome of this study can be used to determine locations which are hotspots, and determine locations which are linked which would further spread COVID-19. These outcomes could help determine mitigation strategies. 