# 연관분석

### 대용량의 트랜잭션 데이터로부터 “X이면 Y이다” 형식의 연관 관계를 발견하는 기법

### 대형마트에 방문하여 물건을 구입하는 손님들 중 손님 한 명이 산 물품을 트랜잭션(transaction)이라고 한다. 대형마트는 데이터베이스를 구축하고 있으며, 수많은 트랜잭션들을 분석하여 의미 있는 패턴을 찾아낼 수 있다.

In [None]:
import csv

# Load the basket data: every CSV row becomes one transaction (a list of the
# string fields on that row).
# newline='' follows the csv module's guidance, so line endings inside quoted
# fields are interpreted by the csv reader instead of the text layer.
with open('myenv/basket.csv', 'r', encoding='utf-8', newline='') as f:
    transactions = list(csv.reader(f))

In [None]:
# Display the list of transactions read from the CSV file.
transactions

In [None]:
pip install apyori

### apriori(transactions, **kwargs)
* transactions : 연관규칙을 생성할 transaction data. list형식
* min_support : 최소 지지도 설정. 기본값은 0.1
* min_confidence : 최소 신뢰도 설정. 기본값은 0.0
* min_lift : 최소 향상도 설정. 기본값은 0.0
* max_length : 관계의 최대 길이를 정수형으로 설정. 기본값은 None

### 연관규칙(Association rule)의 대표적인 형태로서, 데이터들에 대한 발생빈도(빈발, frequent)를 기반으로 각 데이터 간의 연관관계를 밝히기 위한 방법을 말하며, 장바구니 분석을 예로 들 수 있다.

In [None]:
from apyori import apriori

# Mine the basket transactions with a 0.1 floor on both support and
# confidence, then materialise the returned rules into a list so they can be
# indexed and iterated more than once.
rules = apriori(
    transactions,
    min_support=0.1,
    min_confidence=0.1,
)
results = [*rules]

In [None]:
# Check the type of results (it was built with list(rules)).
type(results)

In [None]:
# Number of records produced by apriori.
len(results)

In [None]:
# Inspect the first record to see its structure.
results[0]

In [None]:
# Inspect the record at index 10 (requires at least 11 records in results).
results[10]

In [None]:
# Print a header, then one line per rule in the form
# "lhs => rhs  support  confidence  lift".
print("lhs\trhs\tsupport\t\tconfidence\tlift")
print("-" * 40)
for _, support, ordered_stat in results:
    # Each record carries its support plus a collection of ordered statistics;
    # every ordered statistic is one (lhs, rhs, confidence, lift) rule.
    for base_items, added_items, confidence, lift in ordered_stat:
        lhs = list(base_items)
        rhs = list(added_items)
        stats = f"\t{support:>5.4f}\t{confidence:>5.4f}\t{lift:>5.4f}"
        print(lhs, " => ", rhs, stats)

In [None]:
import requests

In [None]:
# Address of the economy RSS feed (XML) to download.
url = 'http://fs.jtbc.joins.com/RSS/economy.xml'
# The timeout stops the cell from hanging forever on an unresponsive server.
jtbc_economy = requests.get(url, timeout=10)
# Fail here on an HTTP error instead of parsing an error page later on.
jtbc_economy.raise_for_status()

In [None]:
from bs4 import BeautifulSoup

# Parse the downloaded feed as XML, then collect every <link> element that is
# a direct child of an <item> element.
economy_news_list = BeautifulSoup(jtbc_economy.content, features='xml')
link_list = economy_news_list.select('item > link')

In [None]:
# Number of <link> elements selected from the feed.
len(link_list)

In [None]:
# Text of the first <link> element; the scraping loop uses this text as the article URL.
link_list[0].text

In [None]:
# Create the Mecab tagger from KoNLPy; its nouns() method is used below to
# extract nouns from each article body.
from konlpy.tag import Mecab
mecab = Mecab()

In [None]:
# Download every linked article and store its nouns as one "transaction"
# (a list of words) in news.
news = []
for link in link_list:
    news_url = link.text
    # The timeout keeps one unresponsive article from stalling the whole loop.
    news_response = requests.get(news_url, timeout=10)
    news_soup = BeautifulSoup(news_response.content, 'html.parser')
    news_content = news_soup.select_one('#articlebody > .article_content')
    if news_content is None:
        # select_one returns None when the page has no matching element;
        # skip such pages instead of failing on `.text`.
        continue
    nouns_list = mecab.nouns(news_content.text)
    # Keep only nouns longer than one character.
    news.append([word for word in nouns_list if len(word) > 1])

In [None]:
# Display the per-article noun lists collected above.
news

In [None]:
# Check the type of news (it was created as a list).
type(news)

In [None]:
from apyori import apriori

# Mine the per-article noun lists with a 0.2 floor on both support and
# confidence, materialise the rules into a list, and show how many came back.
thresholds = {"min_support": 0.2, "min_confidence": 0.2}
rules = apriori(news, **thresholds)
results = [*rules]
len(results)

In [None]:
import pandas as pd

# Flatten the apriori output into one row per rule with the columns
# lhs, rhs, support, confidence and lift.
# Rows are collected in a plain list and the DataFrame is built once at the
# end, which avoids enlarging the frame one `.loc` assignment at a time.
rule_rows = []
for row in results:
    support = row[1]
    for ordered_item in row[2]:
        # Sort the stripped item names before joining so that multi-item
        # sides always produce the same string; the item collections have no
        # guaranteed iteration order.
        lhs = " ".join(sorted(x.strip() for x in ordered_item[0]))
        rhs = " ".join(sorted(x.strip() for x in ordered_item[1]))
        confidence = ordered_item[2]
        lift = ordered_item[3]
        rule_rows.append([lhs, rhs, support, confidence, lift])

df = pd.DataFrame(rule_rows, columns=["lhs", "rhs", "support", "confidence", "lift"])

In [None]:
# Rows whose lhs is empty, ordered from highest to lowest support.
has_empty_lhs = df["lhs"] == ""
df[has_empty_lhs].sort_values(by="support", ascending=False)

In [None]:
# Rules whose lhs contains "일본" and whose rhs is exactly "한국",
# ordered from highest to lowest lift.
lhs_matches = df["lhs"].str.contains("일본")
rhs_matches = df["rhs"] == "한국"
df[lhs_matches & rhs_matches].sort_values(by="lift", ascending=False)

## 연관분석 연습하기

In [None]:
import csv

# Load the practice basket data: every CSV row becomes one transaction
# (a list of the string fields on that row).
# newline='' follows the csv module's guidance, so line endings inside quoted
# fields are interpreted by the csv reader instead of the text layer.
with open('myenv/mybasket.csv', 'r', encoding='utf8', newline='') as f:
    transactions = list(csv.reader(f))

In [None]:
# Number of transactions (CSV rows) that were loaded.
len(transactions)

In [None]:
from apyori import apriori

# Mine the practice transactions with a 0.1 floor on both support and
# confidence and keep the rules as a list for repeated inspection.
rules = apriori(transactions, min_confidence=0.1, min_support=0.1)
results = [*rules]

In [None]:
# Number of records produced by apriori.
len(results)

In [None]:
# Inspect the first record.
results[0]

In [None]:
# Inspect the record at index 10 (requires at least 11 records in results).
results[10]

In [None]:
# Print one line per rule: "lhs => rhs  support  confidence  lift".
for _, support, ordered_stat in results:
    # Every ordered statistic of a record is one (lhs, rhs, confidence, lift) rule.
    for base_items, added_items, confidence, lift in ordered_stat:
        lhs = list(base_items)
        rhs = list(added_items)
        stats = f"\t{support:>5.4f}\t{confidence:>5.4f}\t{lift:>5.4f}"
        print(lhs, " => ", rhs, stats)

In [None]:
import pandas as pd

# Flatten the apriori output into one row per rule with the columns
# lhs, rhs, support, confidence and lift.
# Rows are collected in a plain list and the DataFrame is built once at the
# end, which avoids enlarging the frame one `.loc` assignment at a time.
rule_rows = []
for row in results:
    support = row[1]
    for ordered_item in row[2]:
        # Sort the stripped item names before joining so that multi-item
        # sides always produce the same string; the item collections have no
        # guaranteed iteration order.
        lhs = " ".join(sorted(x.strip() for x in ordered_item[0]))
        rhs = " ".join(sorted(x.strip() for x in ordered_item[1]))
        confidence = ordered_item[2]
        lift = ordered_item[3]
        rule_rows.append([lhs, rhs, support, confidence, lift])

df = pd.DataFrame(rule_rows, columns=["lhs", "rhs", "support", "confidence", "lift"])

In [None]:
# Sort the association rules by lift in descending order and show the first rows.
rules_by_lift = df.sort_values(by="lift", ascending=False)
rules_by_lift.head()

In [None]:
# Rows with an empty lhs in descending order of support,
# i.e. the most frequently purchased items first.
empty_lhs_rows = df[df["lhs"] == ""]
empty_lhs_rows.sort_values(by="support", ascending=False)

In [None]:
# Which two products sold the most? Read them off the top of this listing of
# empty-lhs rows, ordered from highest to lowest support.
lhs_is_empty = df["lhs"].eq("")
df[lhs_is_empty].sort_values("support", ascending=False)

In [None]:
# What are the confidence and lift between the two best-selling products?
# Rule direction here: clothes => snack.
clothes_then_snack = (df["lhs"] == "clothes") & (df["rhs"] == "snack")
df[clothes_then_snack].sort_values(by="support", ascending=False)

In [None]:
# The same pair in the opposite direction: snack => clothes.
snack_then_clothes = (df["lhs"] == "snack") & (df["rhs"] == "clothes")
df[snack_then_clothes].sort_values(by="support", ascending=False)

In [None]:
# What are the confidence and lift of rules involving the two products
# (clothes and snack)? Here: any lhs containing "snack" with rhs exactly "clothes".
lhs_has_snack = df["lhs"].str.contains("snack")
rhs_is_clothes = df["rhs"] == "clothes"
df[lhs_has_snack & rhs_is_clothes].sort_values(by="support", ascending=False)

In [None]:
# Any lhs containing "clothes" with rhs exactly "snack", highest support first.
lhs_has_clothes = df["lhs"].str.contains("clothes")
rhs_is_snack = df["rhs"] == "snack"
df[lhs_has_clothes & rhs_is_snack].sort_values(by="support", ascending=False)

# Exercise
#### https://github.com/swacademy/NLP-Lab/tree/master/Chapter6/basket1.csv
#### 연관분석을 수행하기