# ▒ 연관규칙분석(Association Rule) ▒


## 0. 환경설정

* association_rules : 빈발품목집합에서 연관규칙을 찾아 주는 함수
* TransactionEncoder : 거래(transaction) 목록을 장바구니 분석용 True/False 열로 인코딩

In [2]:
import pandas as pd
import os
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [3]:
# Mount Google Drive into the Colab runtime at the path given below.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


## 1. 데이터 준비

### 데이터 설명
#### 미국 Census Bureau의 Census Income 데이터베이스에서 추출한 설문조사 자료
- 관측치의 개수 : 48842개
- 나이, 직업군, 교육정도 등의 주로 범주형인 15개의 변수 포함

* age와 hours-per-week는 연속형 변수를 범주형으로 변환한 값을 사용함
* 결측값(NA)도 별도의 범주(예: capital-gain=None)로 취급함

In [5]:
# Path of the Adult CSV on the mounted Google Drive.
Adult_file = "/content/gdrive/MyDrive/datamining/data/adult.csv"
# Read the whole file, then keep only its first 1000 rows.
Adult_df = pd.read_csv(Adult_file).head(1000)

In [6]:
# Display the first five rows as a quick look at the loaded columns.
Adult_df.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,Middle-aged,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,Low,,Full-time,United-States,small
1,Senior,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,,,Part-time,United-States,small
2,Middle-aged,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,,,Full-time,United-States,small
3,Senior,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,,,Full-time,United-States,small
4,Middle-aged,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,,,Full-time,Cuba,small


### 데이터 변환

In [7]:
# Rewrite every cell as "<column>=<value>" so that identical values coming
# from different columns stay distinct items in the transaction list.
#
# Values are converted to str BEFORE the concatenation. Concatenating first
# and stringifying afterwards turns every missing cell into the bare string
# 'nan' (no column prefix, so missing values of all columns collapse into a
# single item) and raises TypeError for non-string columns.
# Missing cells are labelled "<column>=None", i.e. kept as their own category.
# NOTE(review): whether read_csv parses the literal text "None" as a missing
# value depends on the installed pandas version - confirm for this runtime.
#
# A new frame is built (instead of writing into the sliced frame column by
# column), so no in-place assignment on a slice is involved.
Adult_df = Adult_df.apply(
    lambda col: f"{col.name}=" + col.astype(str).where(col.notna(), "None")
)

Adult_df.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,age=Middle-aged,workclass=State-gov,education=Bachelors,marital-status=Never-married,occupation=Adm-clerical,relationship=Not-in-family,race=White,sex=Male,capital-gain=Low,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small
1,age=Senior,workclass=Self-emp-not-inc,education=Bachelors,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Husband,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Part-time,native-country=United-States,income=small
2,age=Middle-aged,workclass=Private,education=HS-grad,marital-status=Divorced,occupation=Handlers-cleaners,relationship=Not-in-family,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small
3,age=Senior,workclass=Private,education=11th,marital-status=Married-civ-spouse,occupation=Handlers-cleaners,relationship=Husband,race=Black,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small
4,age=Middle-aged,workclass=Private,education=Bachelors,marital-status=Married-civ-spouse,occupation=Prof-specialty,relationship=Wife,race=Black,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=Cuba,income=small


In [8]:
# TransactionEncoder() only accepts its input in list form.

# Take just the cell values (no index / column labels) as a list of rows.
Adult_list = Adult_df.to_numpy().tolist()
Adult_list[:2]

[['age=Middle-aged',
  'workclass=State-gov',
  'education=Bachelors',
  'marital-status=Never-married',
  'occupation=Adm-clerical',
  'relationship=Not-in-family',
  'race=White',
  'sex=Male',
  'capital-gain=Low',
  'capital-loss=None',
  'hours-per-week=Full-time',
  'native-country=United-States',
  'income=small'],
 ['age=Senior',
  'workclass=Self-emp-not-inc',
  'education=Bachelors',
  'marital-status=Married-civ-spouse',
  'occupation=Exec-managerial',
  'relationship=Husband',
  'race=White',
  'sex=Male',
  'capital-gain=None',
  'capital-loss=None',
  'hours-per-week=Part-time',
  'native-country=United-States',
  'income=small']]

In [12]:
# Encode the transactions as a table with one column per distinct item.
oht = TransactionEncoder()
oht.fit(Adult_list)                    # collect the distinct item labels
oht_ary = oht.transform(Adult_list)    # one row per transaction
df = pd.DataFrame(data=oht_ary, columns=oht.columns_)
df.head()

Unnamed: 0,age=Middle-aged,age=Old,age=Senior,age=Young,capital-gain=High,capital-gain=Low,capital-gain=None,capital-loss=High,capital-loss=Low,capital-loss=None,...,relationship=Unmarried,relationship=Wife,sex=Female,sex=Male,workclass=Federal-gov,workclass=Local-gov,workclass=Private,workclass=Self-emp-inc,workclass=Self-emp-not-inc,workclass=State-gov
0,True,False,False,False,False,True,False,False,False,True,...,False,False,False,True,False,False,False,False,False,True
1,False,False,True,False,False,False,True,False,False,True,...,False,False,False,True,False,False,False,False,True,False
2,True,False,False,False,False,False,True,False,False,True,...,False,False,False,True,False,False,True,False,False,False
3,False,False,True,False,False,False,True,False,False,True,...,False,False,False,True,False,False,True,False,False,False
4,True,False,False,False,False,False,True,False,False,True,...,False,True,True,False,False,False,True,False,False,False


## 2. 연관규칙분석

### 지지도가 0.5 이상인 빈발품목집합

* 원래 가능한 품목집합 후보는 2의 101승 개 => 최소 지지도 0.5를 적용하면 69개로 줄어듦 (출력 인덱스 0~68)

In [15]:
# Mine the itemsets whose support reaches the minimum below; the itemsets
# are reported with the column names of df.
MIN_SUPPORT = 0.5
frequent_itemsets = apriori(df, use_colnames=True, min_support=MIN_SUPPORT)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.530,(age=Middle-aged)
1,0.919,(capital-gain=None)
2,0.950,(capital-loss=None)
3,0.580,(hours-per-week=Full-time)
4,0.768,(income=small)
...,...,...
65,0.554,"(capital-gain=None, native-country=United-Stat..."
66,0.564,"(capital-loss=None, native-country=United-Stat..."
67,0.514,"(capital-loss=None, sex=Male, native-country=U..."
68,0.510,"(capital-loss=None, native-country=United-Stat..."


# 연관규칙을 찾기! (신뢰도를 가지고)

### 지지도가 0.5 이상이면서 신뢰도(confidence)가 0.7(min_threshold) 이상인 규칙
* 262개의 연관규칙 (출력 인덱스 0~261)
* ex) (age=Middle-aged)이면 (capital-loss=None)이다.
* support : A와 B가 함께 나타나는 거래의 비율, 즉 (A와 B 교집합)의 지지도

In [16]:
# Derive association rules from the frequent itemsets, keeping the rules
# whose confidence reaches the threshold below.
MIN_CONFIDENCE = 0.7
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=MIN_CONFIDENCE,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(age=Middle-aged),(capital-loss=None),0.530,0.950,0.501,0.945283,0.995035,-0.002500,0.913793
1,(capital-loss=None),(capital-gain=None),0.950,0.919,0.869,0.914737,0.995361,-0.004050,0.950000
2,(capital-gain=None),(capital-loss=None),0.919,0.950,0.869,0.945593,0.995361,-0.004050,0.919000
3,(hours-per-week=Full-time),(capital-gain=None),0.580,0.919,0.536,0.924138,1.005591,0.002980,1.067727
4,(capital-gain=None),(income=small),0.919,0.768,0.732,0.796518,1.037133,0.026208,1.140150
...,...,...,...,...,...,...,...,...,...
258,"(capital-loss=None, native-country=United-Stat...","(capital-gain=None, race=White)",0.670,0.774,0.534,0.797015,1.029735,0.015420,1.113382
259,"(race=White, income=small)","(capital-loss=None, capital-gain=None, native-...",0.640,0.785,0.534,0.834375,1.062898,0.031600,1.298113
260,"(capital-gain=None, income=small)","(capital-loss=None, native-country=United-Stat...",0.732,0.740,0.534,0.729508,0.985822,-0.007680,0.961212
261,"(native-country=United-States, income=small)","(capital-loss=None, capital-gain=None, race=Wh...",0.693,0.732,0.534,0.770563,1.052681,0.026724,1.168075


### 지지도가 0.5 이상이면서 신뢰도가 0.8 이상, 향상도(lift)가 0.9 이상인 규칙

In [18]:
# Select the rules with confidence of at least 0.8 and lift of at least 0.9.
is_confident = rules['confidence'] >= 0.8
has_min_lift = rules['lift'] >= 0.9
rules[is_confident & has_min_lift]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(age=Middle-aged),(capital-loss=None),0.530,0.950,0.501,0.945283,0.995035,-0.002500,0.913793
1,(capital-loss=None),(capital-gain=None),0.950,0.919,0.869,0.914737,0.995361,-0.004050,0.950000
2,(capital-gain=None),(capital-loss=None),0.919,0.950,0.869,0.945593,0.995361,-0.004050,0.919000
3,(hours-per-week=Full-time),(capital-gain=None),0.580,0.919,0.536,0.924138,1.005591,0.002980,1.067727
5,(income=small),(capital-gain=None),0.768,0.919,0.732,0.953125,1.037133,0.026208,1.728000
...,...,...,...,...,...,...,...,...,...
250,"(capital-gain=None, race=White, income=small)","(capital-loss=None, native-country=United-States)",0.610,0.861,0.534,0.875410,1.016736,0.008790,1.115658
253,"(native-country=United-States, race=White, inc...","(capital-loss=None, capital-gain=None)",0.584,0.869,0.534,0.914384,1.052225,0.026504,1.530080
254,"(capital-loss=None, race=White, income=small)","(capital-gain=None, native-country=United-States)",0.619,0.826,0.534,0.862682,1.044409,0.022706,1.267129
256,"(capital-gain=None, native-country=United-Stat...","(capital-loss=None, race=White)",0.658,0.805,0.534,0.811550,1.008137,0.004310,1.034758
