# 数据挖掘作业二：关联规则挖掘

## 数据集：Chicago Building Violations

### (1)先对数据集经行预处理，转换成适合关联规则挖掘的形式，保存在pre_dm.csv

In [1]:
import pandas as pd
import numpy as np
from itertools import chain, combinations
from collections import defaultdict

In [2]:
csv_file = pd.read_csv(r"building-violations.csv", low_memory=False)

In [3]:
csv_file

Unnamed: 0,ID,VIOLATION LAST MODIFIED DATE,VIOLATION DATE,VIOLATION CODE,VIOLATION STATUS,VIOLATION STATUS DATE,VIOLATION DESCRIPTION,VIOLATION LOCATION,VIOLATION INSPECTOR COMMENTS,VIOLATION ORDINANCE,...,ADDRESS,STREET NUMBER,STREET DIRECTION,STREET NAME,STREET TYPE,PROPERTY GROUP,SSA,LATITUDE,LONGITUDE,LOCATION
0,1001846,2008-05-20T11:55:49,2007-03-15T00:00:00,EV0065,COMPLIED,2008-04-28T00:00:00,TEST G & S PASS ELE,BOTH CARS,FULL LOAD,Test governor and car safety and submit copy o...,...,330 N JEFFERSON ST,330,N,JEFFERSON,ST,1901,,41.887764,-87.642977,"{'needs_recoding': False, 'longitude': '-87.64..."
1,1001847,2008-05-20T11:55:49,2007-03-15T00:00:00,EV0117,COMPLIED,2008-04-28T00:00:00,REP/REPL DEF ALARM BELL PASS,BOTH CARS,PROPERLY PROGRAM EMERGENCY PHONES IN BOTH CARS,Repair or replace defective emergency signal f...,...,330 N JEFFERSON ST,330,N,JEFFERSON,ST,1901,,41.887764,-87.642977,"{'needs_recoding': False, 'longitude': '-87.64..."
2,1002009,2011-12-07T11:58:27,2009-04-27T00:00:00,EV0252,COMPLIED,2011-10-25T00:00:00,REMOVE DEBRIS FROM PIT PASS,,,Remove accumulated debris from pit for passeng...,...,5530 N WINTHROP AVE,5530,N,WINTHROP,AVE,12102,26.0,41.982772,-87.658138,"{'needs_recoding': False, 'longitude': '-87.65..."
3,1002038,2006-11-16T11:14:46,2006-10-02T00:00:00,199029,OPEN,,OPEN TYPE VIOLATION,,RE: BOTH CARS-1. REPAIR DOOR RESTRICTORS TO OP...,,...,1055 W CATALPA AVE,1055,W,CATALPA,AVE,12037,,41.981733,-87.657299,"{'needs_recoding': False, 'longitude': '-87.65..."
4,1002039,2006-11-16T11:37:58,2006-10-02T00:00:00,199029,OPEN,,OPEN TYPE VIOLATION,,2. INSTALL MISSING SIGHT GUARDS ON 1ST FLOOR L...,,...,1055 W CATALPA AVE,1055,W,CATALPA,AVE,12037,,41.981733,-87.657299,"{'needs_recoding': False, 'longitude': '-87.65..."
5,1002040,2006-11-16T11:37:58,2006-10-02T00:00:00,199029,OPEN,,OPEN TYPE VIOLATION,,3. REPAIR TELEPHONES TO OPERATE PROPERLY. BOT...,,...,1055 W CATALPA AVE,1055,W,CATALPA,AVE,12037,,41.981733,-87.657299,"{'needs_recoding': False, 'longitude': '-87.65..."
6,1002042,2006-11-16T11:16:37,2006-10-02T00:00:00,199029,OPEN,,OPEN TYPE VIOLATION,,5. CLEAN ELEVATOR PITS OF DEBRIS. (13-20-120 &...,,...,1055 W CATALPA AVE,1055,W,CATALPA,AVE,12037,,41.981733,-87.657299,"{'needs_recoding': False, 'longitude': '-87.65..."
7,1002044,2006-11-16T11:37:58,2006-10-02T00:00:00,199029,OPEN,,OPEN TYPE VIOLATION,,"PROPERLY REPAIR SAFETY DEVICE ON CAR DOORS, CA...",,...,1055 W CATALPA AVE,1055,W,CATALPA,AVE,12037,,41.981733,-87.657299,"{'needs_recoding': False, 'longitude': '-87.65..."
8,1002047,2008-04-08T12:19:51,2008-03-06T00:00:00,199029,OPEN,,OPEN TYPE VIOLATION,ELEVATOR CAB,INSTALL LIGHT PROTECTION IN ELEVATOR CAB.,,...,5534 N KENMORE AVE,5534,N,KENMORE,AVE,12066,26.0,41.982909,-87.656743,"{'needs_recoding': False, 'longitude': '-87.65..."
9,1002048,2008-04-08T12:19:59,2008-03-06T00:00:00,199029,OPEN,,OPEN TYPE VIOLATION,,OBTAIN PERMIT AND INSTALL CAR DOOR RESTRICTOR ...,,...,5534 N KENMORE AVE,5534,N,KENMORE,AVE,12066,26.0,41.982909,-87.656743,"{'needs_recoding': False, 'longitude': '-87.65..."


In [4]:
# 显示各属性的属性值的数量
for column in csv_file.columns:
    print(column,csv_file[column].value_counts().__len__())

ID 1611061
VIOLATION LAST MODIFIED DATE 916005
VIOLATION DATE 4244
VIOLATION CODE 1456
VIOLATION STATUS 3
VIOLATION STATUS DATE 3971
VIOLATION DESCRIPTION 1302
VIOLATION LOCATION 51895
VIOLATION INSPECTOR COMMENTS 1020229
VIOLATION ORDINANCE 1293
INSPECTOR ID 355
INSPECTION NUMBER 360621
INSPECTION STATUS 4
INSPECTION WAIVED 1
INSPECTION CATEGORY 4
DEPARTMENT BUREAU 15
ADDRESS 149142
STREET NUMBER 9866
STREET DIRECTION 4
STREET NAME 1187
STREET TYPE 14
PROPERTY GROUP 134072
SSA 53
LATITUDE 148875
LONGITUDE 148860
LOCATION 148909


### 选取属性值种类（1,100）的属性作为关联规则挖掘的属性

In [5]:
column_usable=[]
for column in csv_file.columns:
    if csv_file[column].value_counts().__len__() < 100 and csv_file[column].value_counts().__len__() >1:
        column_usable.append(column)
print(column_usable)

['VIOLATION STATUS', 'INSPECTION STATUS', 'INSPECTION CATEGORY', 'DEPARTMENT BUREAU', 'STREET DIRECTION', 'STREET TYPE', 'SSA']


### 映射方式为：将原数据集中的各个元素(对应到一个对象的一个属性)取值拼接其属性名称作为该元素新的取值
### 对数据进行预处理，截取只包含合适属性的数据集，保存在pre_dm.csv

In [6]:
change_column=column_usable
data = csv_file[column_usable]
# 存储最终目标
trans_dict = {}
record_num = data.index.__len__()
for column in column_usable:
    new_line = [""]*record_num
    for index in data.index:
        item = data[column][index]
        try:
            if np.isnan(item):
                new_line[index] = ""
            else:
                # 拼接属性和属性值作为新属性值
                new_line[index] = change_column[column_usable.index(column)] + "_"+ str(item)
        except BaseException as e:
            new_line[index] = change_column[column_usable.index(column)] + "_" + str(item)
    trans_dict[column] = new_line

csv_write = pd.DataFrame(trans_dict)
# print(csv_write)
csv_write.to_csv('pre_dm.csv', index=False, header=False)

### （2）对预处理文件进行操作
### 找出频繁项集，导出关联规则，计算其支持度和置信度
### 使用Lift、Leverage 2种方式对规则进行评价
### Leverage(X->Y)=support(X->Y)-support(X)*support（Y）;Lift(X->Y)=support(X∩Y)/(support(X)*support（Y）)

In [7]:
# 读取记录文件
def dataFromFile(fname):
    file_iter = open(fname, 'rU')
    for line in file_iter:
            line = line.strip().rstrip(',')
            shopping_basket = line.split(',')
            while '' in shopping_basket:
                shopping_basket.remove('')
            record = frozenset(shopping_basket)
            yield record

In [8]:
# 返回数组arr的非空子集
def subsets(arr):
    return chain(*[combinations(arr, i + 1) for i, a in enumerate(arr)])

# 返回满足最小支持度的子集
def returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet):
        _itemSet = set()
        localSet = defaultdict(int)

        for item in itemSet:
                for transaction in transactionList:
                        if item.issubset(transaction):
                                freqSet[item] += 1
                                localSet[item] += 1

        for item, count in localSet.items():
                support = float(count)/len(transactionList)

                if support >= minSupport:
                        _itemSet.add(item)

        return _itemSet

# 返回指定长度的自连接集合
def joinSet(itemSet, length):
    return set([i.union(j) for i in itemSet for j in itemSet if len(i.union(j)) == length])

# 获取总集合及总记录
def getItemSetTransactionList(data_iterator):
    transactionList = list()
    itemSet = set()
    for record in data_iterator:
        transaction = frozenset(record)
        transactionList.append(transaction)
        for item in transaction:
            itemSet.add(frozenset([item]))              # Generate 1-itemSets
    return itemSet, transactionList

### 返回频繁项集:((频繁项集), 支持度)，关联规则:(((源频繁项集),(目标频繁项集)),支持度,置信度,lift,leverage)

In [16]:
def runApriori(data_iter, minSupport, minConfidence):
    itemSet, transactionList = getItemSetTransactionList(data_iter)
    # 所有项的频数 频繁项和非频繁项，1项和K项
    freqSet = defaultdict(int)
    # 存储各元频繁项集合 key=K，value=K项频繁项集合
    largeSet = dict()
    # 一元频繁项集合
    oneCSet = returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet)

    currentLSet = oneCSet
    k = 2
    # 递归逐层求解
    while(currentLSet != set([])):
        # print(k)
        largeSet[k-1] = currentLSet
        currentLSet = joinSet(currentLSet, k)
        currentCSet = returnItemsWithMinSupport(currentLSet, transactionList, minSupport, freqSet)
        currentLSet = currentCSet
        k = k + 1

    def getSupport(item):
            return float(freqSet[item])/len(transactionList)

    toRetItems = []
    for key, value in list(largeSet.items()):
        toRetItems.extend([(tuple(item), getSupport(item))
                           for item in value])

    toRetRules = []
    for key, value in list(largeSet.items())[1:]:
        for item in value:
            _subsets = map(frozenset, [x for x in subsets(item)])
            for element in _subsets:
                remain = item.difference(element)
                if len(remain) > 0:
                    confidence = getSupport(item)/getSupport(element)
                    if confidence >= minConfidence:
                        toRetRules.append(((tuple(element), tuple(remain)),getSupport(item),
                                           confidence, confidence/getSupport(remain), confidence/getSupport(remain)-getSupport(remain)))
    return toRetItems, toRetRules

In [17]:
inFile = dataFromFile('pre_dm.csv')
minSupport = 0.2
minConfidence = 0.6
items, rules = runApriori(inFile, minSupport, minConfidence)

  This is separate from the ipykernel package so we can avoid doing imports until


### 频繁项集 、 支持度

In [18]:
items

[(('STREET DIRECTION_W',), 0.29859452869878916),
 (('VIOLATION STATUS_OPEN',), 0.6221856279805669),
 (('VIOLATION STATUS_COMPLIED',), 0.3744774406431538),
 (('STREET TYPE_ST',), 0.31167597005948255),
 (('DEPARTMENT BUREAU_CONSERVATION',), 0.6657873289714046),
 (('STREET DIRECTION_N',), 0.23668874114636254),
 (('STREET TYPE_AVE',), 0.5610631751373784),
 (('INSPECTION STATUS_FAILED',), 0.6950053412006125),
 (('INSPECTION CATEGORY_COMPLAINT',), 0.7070390258345277),
 (('STREET DIRECTION_S',), 0.4065091265942134),
 (('INSPECTION CATEGORY_PERIODIC',), 0.249014779701079),
 (('VIOLATION STATUS_COMPLIED', 'INSPECTION CATEGORY_COMPLAINT'),
  0.22759473415345538),
 (('INSPECTION STATUS_FAILED', 'STREET TYPE_AVE'), 0.39299877534121924),
 (('STREET DIRECTION_S', 'VIOLATION STATUS_OPEN'), 0.2681444091812787),
 (('STREET TYPE_AVE', 'VIOLATION STATUS_OPEN'), 0.35037034600179634),
 (('STREET DIRECTION_S', 'STREET TYPE_AVE'), 0.28754156422382515),
 (('DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_O

### 关联项集 、 支持度 、 置信度 、 lift 、leverage

In [19]:
rules

[((('VIOLATION STATUS_COMPLIED',), ('INSPECTION CATEGORY_COMPLAINT',)),
  0.22759473415345538,
  0.6077662081928573,
  0.8595935810975959,
  0.15255455526306816),
 ((('STREET TYPE_AVE',), ('INSPECTION STATUS_FAILED',)),
  0.39299877534121924,
  0.7004536971170707,
  1.0078393007844317,
  0.3128339595838192),
 ((('STREET DIRECTION_S',), ('VIOLATION STATUS_OPEN',)),
  0.2681444091812787,
  0.6596270332915465,
  1.0601772262604385,
  0.43799159827987166),
 ((('STREET TYPE_AVE',), ('VIOLATION STATUS_OPEN',)),
  0.35037034600179634,
  0.6244757480581521,
  1.0036807665985765,
  0.38149513861800965),
 ((('STREET DIRECTION_S',), ('STREET TYPE_AVE',)),
  0.28754156422382515,
  0.7073434405590988,
  1.2607197761391187,
  0.6996566010017403),
 ((('DEPARTMENT BUREAU_CONSERVATION',), ('VIOLATION STATUS_OPEN',)),
  0.4162126697871775,
  0.6251435731439908,
  1.0047541200413526,
  0.3825684920607857),
 ((('VIOLATION STATUS_OPEN',), ('DEPARTMENT BUREAU_CONSERVATION',)),
  0.4162126697871775,
  0.6689

### 对频繁项集按support降序排列——频繁项集	支持度

In [20]:
# 以support降序排列
for item, support in sorted(items, key=lambda x:x[1], reverse=True):

    print("%s %.3f\n" % (str(item), support))
    

('INSPECTION CATEGORY_COMPLAINT',) 0.707

('INSPECTION STATUS_FAILED',) 0.695

('DEPARTMENT BUREAU_CONSERVATION',) 0.666

('VIOLATION STATUS_OPEN',) 0.622

('STREET TYPE_AVE',) 0.561

('INSPECTION STATUS_FAILED', 'DEPARTMENT BUREAU_CONSERVATION') 0.542

('INSPECTION STATUS_FAILED', 'VIOLATION STATUS_OPEN') 0.523

('DEPARTMENT BUREAU_CONSERVATION', 'INSPECTION CATEGORY_COMPLAINT') 0.490

('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT') 0.481

('INSPECTION CATEGORY_COMPLAINT', 'VIOLATION STATUS_OPEN') 0.479

('DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_OPEN') 0.416

('STREET DIRECTION_S',) 0.407

('STREET TYPE_AVE', 'INSPECTION CATEGORY_COMPLAINT') 0.405

('INSPECTION STATUS_FAILED', 'DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_OPEN') 0.397

('INSPECTION STATUS_FAILED', 'STREET TYPE_AVE') 0.393

('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT', 'VIOLATION STATUS_OPEN') 0.392

('STREET TYPE_AVE', 'DEPARTMENT BUREAU_CONSERVATION') 0.384

('INSPECTION

### 导出关联规则，计算其支持度和置信度，按置信度降序排列——关联规则，支持度，置信度，lift，leverage

In [22]:
 #以confidence降序排列
for rule, support, confidence, lift,leverage in sorted(rules, key=lambda x:x[2], reverse=True):
    pre, post = rule
    print("%s ==>%s\n%.3f %.3f %.3f %.3f\n" % (str(pre), str(post), support, confidence, lift, leverage))

('STREET TYPE_AVE', 'DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_OPEN') ==>('INSPECTION STATUS_FAILED',)
0.229 0.955 1.374 0.679

('DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_OPEN') ==>('INSPECTION STATUS_FAILED',)
0.397 0.955 1.373 0.678

('DEPARTMENT BUREAU_CONSERVATION', 'INSPECTION CATEGORY_COMPLAINT', 'VIOLATION STATUS_OPEN') ==>('INSPECTION STATUS_FAILED',)
0.301 0.943 1.357 0.662

('STREET TYPE_AVE', 'VIOLATION STATUS_OPEN') ==>('INSPECTION STATUS_FAILED',)
0.296 0.845 1.215 0.520

('VIOLATION STATUS_OPEN',) ==>('INSPECTION STATUS_FAILED',)
0.523 0.841 1.210 0.515

('STREET DIRECTION_S', 'VIOLATION STATUS_OPEN') ==>('INSPECTION CATEGORY_COMPLAINT',)
0.225 0.840 1.188 0.481

('STREET TYPE_AVE', 'INSPECTION CATEGORY_COMPLAINT', 'VIOLATION STATUS_OPEN') ==>('INSPECTION STATUS_FAILED',)
0.225 0.823 1.184 0.489

('INSPECTION CATEGORY_COMPLAINT', 'VIOLATION STATUS_OPEN') ==>('INSPECTION STATUS_FAILED',)
0.392 0.818 1.177 0.482

('INSPECTION STATUS_FAILED', 'INSPECTION C

### 使用Lift提升度公式定义对每条规则打分
### 将结果按照Lift值降序排列——关联规则，支持度，置信度，lift，leverage

In [23]:
#以lift降序排列
for rule, support, confidence, lift,leverage in sorted(rules, key=lambda x:x[3], reverse=True):
    pre, post = rule
    print("%s ==> %s\n%.3f %.3f %.3f %.3f\n" % (str(pre), str(post), support, confidence, lift, leverage))

('DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_OPEN') ==> ('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT')
0.301 0.724 1.505 1.024

('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT') ==> ('DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_OPEN')
0.301 0.626 1.505 1.089

('STREET TYPE_AVE', 'DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_OPEN') ==> ('INSPECTION STATUS_FAILED',)
0.229 0.955 1.374 0.679

('DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_OPEN') ==> ('INSPECTION STATUS_FAILED',)
0.397 0.955 1.373 0.678

('DEPARTMENT BUREAU_CONSERVATION', 'INSPECTION CATEGORY_COMPLAINT', 'VIOLATION STATUS_OPEN') ==> ('INSPECTION STATUS_FAILED',)
0.301 0.943 1.357 0.662

('STREET TYPE_AVE', 'VIOLATION STATUS_OPEN') ==> ('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT')
0.225 0.641 1.332 0.851

('VIOLATION STATUS_OPEN',) ==> ('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT')
0.392 0.630 1.310 0.829

('INSPECTION STATUS_FAILED', 'INSPE

### 使用Leverage杠杆率进行评价对每条规则打分
### 将结果按照Leverage值降序排列——关联规则，支持度，置信度，lift，leverage

In [24]:
#以leverage降序排列
for rule, support, confidence, lift,leverage in sorted(rules, key=lambda x:x[4], reverse=True):
    pre, post = rule
    print("%s ==> %s\n%.3f %.3f %.3f %.3f\n" % (str(pre), str(post), support, confidence, lift, leverage))

('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT') ==> ('DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_OPEN')
0.301 0.626 1.505 1.089

('DEPARTMENT BUREAU_CONSERVATION', 'VIOLATION STATUS_OPEN') ==> ('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT')
0.301 0.724 1.505 1.024

('STREET TYPE_AVE', 'VIOLATION STATUS_OPEN') ==> ('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT')
0.225 0.641 1.332 0.851

('VIOLATION STATUS_OPEN',) ==> ('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT')
0.392 0.630 1.310 0.829

('STREET DIRECTION_S',) ==> ('STREET TYPE_AVE',)
0.288 0.707 1.261 0.700

('STREET DIRECTION_S', 'INSPECTION CATEGORY_COMPLAINT') ==> ('STREET TYPE_AVE',)
0.227 0.707 1.260 0.699

('INSPECTION STATUS_FAILED', 'INSPECTION CATEGORY_COMPLAINT') ==> ('VIOLATION STATUS_OPEN',)
0.392 0.815 1.310 0.688

('INSPECTION STATUS_FAILED', 'STREET TYPE_AVE', 'INSPECTION CATEGORY_COMPLAINT') ==> ('VIOLATION STATUS_OPEN',)
0.225 0.812 1.305 0.683

('STREET 