In [1]:
## Install external libraries here


In [1]:
## Import packages here

from IPython.display import display
import pandas as pd 
import numpy as np 
from apyori import apriori

# Project (a): Association Mining 

### Task 1: Which variables did I use in my analysis? Justify the choice

<font color=red> To identify the common routes travelled by COVID-19 positive patients over the given dates, it is essential to include **Location** and **Patient_id** as the variables for association rule mining. </font>


The reason is simple: location is the only column that records where a patient went, so it supplies the components of each route, while patient_id is unique to each individual and therefore groups the visited locations into one transaction per patient.

<p style="text-align: center;">-----------------------------------------------</p>

### Task 2: What pre-processing steps were required on the dataset before building the association rules?

In [2]:
# Load the patient-route records; only "" and "nan" are treated as missing values.
df = pd.read_csv("D1.csv", keep_default_na=False, na_values=["", "nan"])

# NOTE(review): the tail of this frame shows 2020-06-03 in between mid-March rows,
# which may indicate a day/month swap while parsing -- confirm the CSV's date format.
df = (
    df.assign(date=pd.to_datetime(df['date']))   # parse the date column
      .set_index('date')                         # use it as the index
      .loc[:, ['patient_id', 'location']]        # keep only the mining variables
)

In [3]:
# Keep only the two columns used for mining. The previous cell already applies the
# same selection, so this cell is a repeat of it and leaves df unchanged.
df=df[['patient_id','location']]

In [4]:
# Report the index range, column dtypes and non-null counts of the trimmed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1509 entries, 2020-01-22 to 2020-03-24
Data columns (total 2 columns):
patient_id    1509 non-null int64
location      1509 non-null object
dtypes: int64(1), object(1)
memory usage: 35.4+ KB


In [5]:
# Preview the last five rows as a rich table.
display(df.tail(5))

Unnamed: 0_level_0,patient_id,location
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-03,6100000083,Daegu_Buk-gu
2020-03-16,6100000085,Gyeongsangnam-do_Changwon-si
2020-03-14,6100000086,Daegu_Dalseong-gun
2020-03-24,6100000090,Incheon_Jung-gu
2020-03-24,6100000090,Busan_Gangseo-gu


**It can be seen that <font color=red>'Date'</font> needs to be converted to a datetime type so that it can be used as the index. Missing values occur only in the column <font color=red>'Global num'</font>; depending on its importance, the appropriate pre-processing method will be decided later.**

<p style="text-align: center;">-----------------------------------------------</p>

### Task 3: Association Mining Conduct:

#### a. What is 'min_support' threshold set and why you choose it?

**Frequent itemsets are the ones which occur at least a minimum number of times in the transactions. Technically, these are the itemsets for which support value (fraction of transactions containing the itemset) is above a minimum threshold — minsup.**

In [6]:
def convert_apriori_results_to_pandas_df(results):
    '''Flatten apriori relation records into a DataFrame with one row per rule.

    Parameters
    ----------
    results : iterable
        Records that each expose ``support`` and ``ordered_statistics``; every
        entry of ``ordered_statistics`` must expose ``items_base``,
        ``items_add``, ``confidence`` and ``lift``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Left_side``, ``Right_side``, ``Support``, ``Confidence`` and
        ``Lift``. Both sides are the rule's items joined with ``,`` in the
        iteration order of the underlying collection. ``Support`` is the
        support of the enclosing record, repeated for each of its rules.
    '''
    column_names = ['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift']

    rows = [
        [
            ','.join(stat.items_base),
            ','.join(stat.items_add),
            record.support,
            stat.confidence,
            stat.lift,
        ]
        for record in results
        for stat in record.ordered_statistics
    ]

    return pd.DataFrame(rows, columns=column_names)

In [7]:
# One transaction per patient: the list of locations recorded for that patient_id.
group = df.groupby(['patient_id'])['location'].apply(list)
groupList = group.tolist()

# NOTE(review): every rule displayed below has Support 0.001122, i.e. exactly this
# threshold -- consider a named constant plus a written justification for the value.
result = list(apriori(groupList, min_support=0.001122))

# Flatten the records into a table and rank the rules by descending lift.
resultDf = convert_apriori_results_to_pandas_df(result).sort_values(by="Lift", ascending=False)

resultDf.head(5)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
6153,"Gyeonggi-do_Icheon-si,Seoul_Yongsan-gu","Seoul_Yeongdeungpo-gu,Seoul_Mapo-gu",0.001122,1.0,891.0
6355,"Busan_Dong-gu,Busan_Nam-gu,Chungcheongbuk-do_C...",Busan_Yeongdo-gu,0.001122,1.0,891.0
6352,"Chungcheongbuk-do_Cheongju-si,Busan_Yeongdo-gu...","Busan_Dong-gu,Busan_Nam-gu",0.001122,1.0,891.0
6351,"Busan_Nam-gu,Chungcheongbuk-do_Cheongju-si,Chu...","Busan_Dong-gu,Busan_Yeongdo-gu",0.001122,1.0,891.0
6350,"Busan_Nam-gu,Busan_Yeongdo-gu,Chungcheongbuk-d...","Busan_Dong-gu,Chungcheongbuk-do_Cheongju-si",0.001122,1.0,891.0


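**It implies that 2.8% of people visited both Seoul_Gangnam-gu and Incheon_Jung-gu, and that people who went to Seoul_Gangnam-gu have a 5% probability of also having visited Incheon_Jung-gu.** Note that these figures do not correspond to the rows displayed above, which all have a support of 0.001122 and a confidence of 1.0. Apriori rules also describe unordered co-occurrence within a patient's set of locations, so on their own they do not establish which location was visited first.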

#### b. Top 10 common routes that COVID-19 patients from Seoul_Dongjak-gu have travelled

In [15]:
# Keep only the rules whose left-hand side is exactly the single location Seoul_Dongjak-gu.
result_seoul = resultDf.loc[resultDf['Left_side'].eq('Seoul_Dongjak-gu')]

In [16]:
# Show the first ten matching rules, in the row order inherited from resultDf.
result_seoul.head(10)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
6671,Seoul_Dongjak-gu,"Daegu_Dalseo-gu,Daegu_Dong-gu,Seoul_Yangcheon-...",0.001122,0.012658,11.278481
4290,Seoul_Dongjak-gu,"Seoul_Jung-gu,Seoul_Yongsan-gu",0.001122,0.012658,11.278481
6641,Seoul_Dongjak-gu,"Daegu_Buk-gu,Gyeongsangbuk-do_Chilgok-gun,Seou...",0.001122,0.012658,11.278481
4283,Seoul_Dongjak-gu,"Seoul_Guro-gu,Seoul_Geumcheon-gu",0.001122,0.012658,11.278481
5789,Seoul_Dongjak-gu,"Gyeongsangbuk-do_Chilgok-gun,Seoul_Jung-gu,Dae...",0.001122,0.012658,11.278481
1290,Seoul_Dongjak-gu,Gyeongsangbuk-do_Chilgok-gun,0.001122,0.012658,11.278481
5564,Seoul_Dongjak-gu,"Daegu_Jung-gu,Seoul_Jung-gu,Daegu_Nam-gu",0.001122,0.012658,11.278481
6487,Seoul_Dongjak-gu,"Daegu_Jung-gu,Daegu_Buk-gu,Daegu_Nam-gu,Daegu_...",0.001122,0.012658,11.278481
5189,Seoul_Dongjak-gu,"Daegu_Buk-gu,Seoul_Jung-gu,Daegu_Jung-gu",0.001122,0.012658,11.278481
6983,Seoul_Dongjak-gu,"Daegu_Buk-gu,Daegu_Seo-gu,Daegu_Nam-gu,Seoul_J...",0.001122,0.012658,11.278481


In [43]:
# Collect each patient's locations, in row order, as one sequence per patient_id.
transactions = df.groupby(['patient_id'])['location'].apply(list)
sequences = transactions.tolist()

# Print the first five sequences as a quick sanity check.
print(sequences[:5])

[['Gyeonggi-do_Gimpo-si', 'Seoul_Jung-gu'], ['Seoul_Seongdong-gu', 'Seoul_Dongdaemun-gu', 'Seoul_Gangnam-gu'], ['Seoul_Jungnang-gu'], ['Seoul_Jungnang-gu'], ['Gyeonggi-do_Goyang-si']]


In [45]:
# NOTE(review): get_association_rules is defined in a cell further down the notebook
# (this cell ran as In [45], the definition as In [42]), so on a fresh top-to-bottom
# run this call fails unless the definition cell is executed first.
# Arguments: the patient sequences, minimum support 0.001122, minimum confidence 0.001.
z=get_association_rules(sequences, 0.001122, 0.001)

In [42]:
from collections import defaultdict
import subprocess
import re

def get_association_rules(sequences, min_sup, min_conf):
    '''Use SPMF's RuleGrowth to find sequential rules in the supplied sequences.

    The sequences are written to ``seq_rule_input.txt`` in SPMF's integer
    format, ``spmf.jar`` is run through ``java`` in the current working
    directory, and the rules are read back from ``seq_rule_output.txt``.

    Parameters
    ----------
    sequences : list
        One entry per sequence. Each element of a sequence is either a single
        item or a list of items forming one itemset.
    min_sup : float
        Minimum support as a fraction (e.g. ``0.001122``).
    min_conf : float
        Minimum confidence as a fraction (e.g. ``0.001``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Left_rule`` and ``Right_rule`` (lists of the original items),
        ``Support`` (rule count divided by ``len(sequences)``) and
        ``Confidence``. Empty when SPMF reports no rules.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``java`` process exits with a non-zero status.
    ValueError
        If a non-blank output line does not look like a rule.
    KeyError
        If the output refers to an item ID that was never written to the input.
    '''
    # Step 1: map every distinct item to a positive integer ID (SPMF only accepts
    # integers) and remember the reverse mapping for decoding the output.
    item_to_id = {}
    id_to_item = {}

    def encode(item):
        # Register unseen items in BOTH directions; previously items that arrived
        # inside a nested itemset were missing from the reverse mapping.
        if item not in item_to_id:
            new_id = len(item_to_id) + 1
            item_to_id[item] = new_id
            id_to_item[str(new_id)] = item
        return item_to_id[item]

    with open('seq_rule_input.txt', 'w') as f:
        for sequence in sequences:
            tokens = []
            for itemset in sequence:
                # A bare item is treated as an itemset of size one.
                items = itemset if isinstance(itemset, list) else [itemset]
                tokens.extend(encode(item) for item in items)
                tokens.append(-1)   # end of itemset
            tokens.append(-2)       # end of sequence
            f.write(' '.join(str(token) for token in tokens))
            f.write('\n')

    # Step 2: run SPMF. Thresholds are passed as decimal percentages; truncating
    # them with int() turned every threshold below 1% into "0%".
    supp_param = '{:.10g}%'.format(min_sup * 100)
    conf_param = '{:.10g}%'.format(min_conf * 100)
    # No shell=True: combined with an argument list it makes POSIX shells run a
    # bare "java" and drop the remaining arguments. check=True surfaces failures
    # instead of silently reading a stale or missing output file.
    subprocess.run(['java', '-jar', 'spmf.jar', 'run', 'RuleGrowth',
                    'seq_rule_input.txt', 'seq_rule_output.txt',
                    supp_param, conf_param], check=True)

    # Step 3: decode the rules written by SPMF.
    rule_pattern = re.compile(r'([0-9\,]+) ==> ([0-9\,]+) #SUP: ([0-9]+) #CONF: ([0-9\.]+)')
    output_rules = []
    with open('seq_rule_output.txt', 'r') as f:
        for line in f:
            if not line.strip():
                # An empty file (no rules found) or trailing blank lines.
                continue
            match = rule_pattern.search(line)
            if match is None:
                raise ValueError('Unrecognised SPMF output line: {!r}'.format(line))
            left, right, sup, conf = match.groups()
            output_rules.append([
                [id_to_item[x] for x in left.split(',')],
                [id_to_item[x] for x in right.split(',')],
                int(sup) / len(sequences),   # absolute count -> relative support
                float(conf),
            ])

    # return pandas DataFrame
    return pd.DataFrame(output_rules, columns=['Left_rule', 'Right_rule', 'Support', 'Confidence'])

In [46]:
# Inspect the first ten sequential rules returned by get_association_rules.
z.head(10)

Unnamed: 0,Left_rule,Right_rule,Support,Confidence
0,[Gyeonggi-do_Gimpo-si],[Seoul_Jung-gu],0.002245,0.2
1,[Gyeonggi-do_Gimpo-si],"[Seoul_Jung-gu, Daejeon_Dong-gu]",0.001122,0.1
2,[Gyeonggi-do_Gimpo-si],[Gyeonggi-do_Goyang-si],0.001122,0.1
3,"[Gyeonggi-do_Gimpo-si, Daegu_Jung-gu]",[Gyeonggi-do_Goyang-si],0.001122,0.5
4,[Gyeonggi-do_Gimpo-si],[Seoul_Mapo-gu],0.001122,0.1
5,"[Gyeonggi-do_Gimpo-si, Seoul_Yongsan-gu]",[Seoul_Mapo-gu],0.001122,1.0
6,"[Gyeonggi-do_Gimpo-si, Seoul_Yongsan-gu, Seoul...",[Seoul_Mapo-gu],0.001122,1.0
7,"[Gyeonggi-do_Gimpo-si, Seoul_Yeongdeungpo-gu]",[Seoul_Mapo-gu],0.001122,1.0
8,[Gyeonggi-do_Gimpo-si],"[Seoul_Mapo-gu, Gyeonggi-do_Icheon-si]",0.001122,0.1
9,"[Gyeonggi-do_Gimpo-si, Seoul_Yongsan-gu]","[Seoul_Mapo-gu, Gyeonggi-do_Icheon-si]",0.001122,1.0


The first rule is Gyeonggi-do_Gimpo-si => Seoul_Jung-gu, with 0.002245 support and 0.2 confidence. This is a low-rate rule. The
support value implies that about 0.22% of patients go to Seoul_Jung-gu after visiting Gyeonggi-do_Gimpo-si. The confidence value
implies that if a patient has been to Gyeonggi-do_Gimpo-si, the probability of them subsequently going to Seoul_Jung-gu is
20%.