In [1]:
import pandas as pd
from sklearn.cluster import KMeans
# Paths of the raw spreadsheet and of the clustering summary written later.
datafile = 'data/data.xls'
processedfile = 'result/data_processed.xls'

# Number of clusters fitted for every coefficient column.
k = 4

# Spreadsheet column name -> one-letter code used to label that column's
# rows in the output table.
typelabel = {
    u'肝气郁结证型系数': 'A',
    u'热毒蕴结证型系数': 'B',
    u'冲任失调证型系数': 'C',
    u'气血两虚证型系数': 'D',
    u'脾胃虚弱证型系数': 'E',
    u'肝肾阴虚证型系数': 'F',
}
keys = list(typelabel)

data = pd.read_excel(datafile)

# Accumulator for the per-column cluster summaries.
result = pd.DataFrame()

In [12]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including the deprecation notices that would flag outdated API usage.
# Kept so later cells behave as before; consider narrowing it.
warnings.filterwarnings("ignore")

# Cluster each coefficient column on its own (1-D k-means) and derive, per
# column, the cut points between neighbouring cluster centres together with
# the number of samples in each cluster.
frames = []
for key in keys:
    label = typelabel[key]
    print(u'正在进行“%s”的聚类...' % key)
    # `n_jobs` was removed from KMeans in scikit-learn 1.0. `n_init=10` pins
    # the historical default, and a fixed seed makes re-runs reproducible.
    kmodel = KMeans(n_clusters=k, n_init=10, random_state=0)
    # `.values` replaces DataFrame.as_matrix(), removed in pandas 1.0.
    kmodel.fit(data[[key]].values)
    # Cluster centres (column `label`) and cluster sizes (column `label + 'n'`),
    # both indexed by cluster id so that concat aligns them.
    r1 = pd.DataFrame(kmodel.cluster_centers_, columns=[label])
    # Build the frame from a dict so the column gets the intended name even
    # when value_counts() returns a Series that already carries a name.
    r2 = pd.DataFrame({label + 'n': pd.Series(kmodel.labels_).value_counts()})
    r = pd.concat([r1, r2], axis=1).sort_values(label)
    # Number the clusters 1..k in ascending order of their centre.
    r.index = list(range(1, k + 1))
    # Midpoint between adjacent centres = boundary between two clusters.
    r[label] = r[label].rolling(2).mean()
    # The first row has no left neighbour (rolling mean is NaN); start at 0.
    # `.loc` avoids chained assignment, which may silently write to a copy.
    r.loc[1, label] = 0.0
    frames.append(r.T)

# DataFrame.append was removed in pandas 2.0; concatenate once instead. This
# also makes the cell idempotent, since it no longer extends a previous `result`.
result = pd.concat(frames).sort_index()
# NOTE(review): `processedfile` ends in .xls; pandas >= 2.0 no longer ships an
# .xls writer (xlwt), so this call may need an .xlsx path there - confirm.
result.to_excel(processedfile)

正在进行“肝气郁结证型系数”的聚类...
正在进行“热毒蕴结证型系数”的聚类...
正在进行“冲任失调证型系数”的聚类...
正在进行“气血两虚证型系数”的聚类...
正在进行“脾胃虚弱证型系数”的聚类...
正在进行“肝肾阴虚证型系数”的聚类...


In [14]:
import time
# Transaction file for the association-rule step. It has no header row, and
# every cell is kept as a Python object rather than being type-inferred.
inputfile = 'data/apriori.txt'
data = pd.read_csv(
    inputfile,
    dtype=object,
    header=None,
)

In [16]:
# Preview the first five rows of the loaded transactions.
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,A2,B1,C3,D3,E1,F1,H1
1,A2,B1,C3,D3,E1,F1,H1
2,A2,B1,C3,D3,E1,F1,H1
3,A2,B1,C3,D3,E1,F1,H1
4,A2,B2,C3,D3,E1,F1,H1


In [17]:
def connect_string(x, ms):
    """Join step of Apriori: build (n+1)-item candidates from n-item sets.

    Parameters
    ----------
    x : list of str
        Itemsets of equal size, each encoded as its items joined by ``ms``.
    ms : str
        Separator used between items inside an encoded itemset.

    Returns
    -------
    list of list of str
        One sorted item list per pair of input itemsets that share all items
        except their (sorted) last one. Empty when ``x`` is empty.
    """
    itemsets = [sorted(item.split(ms)) for item in x]
    if not itemsets:
        # Nothing to join; the original indexed x[0] here and raised IndexError.
        return []

    # Every itemset is assumed to be as long as the first one.
    prefix_len = len(itemsets[0]) - 1
    candidates = []
    for pos, first in enumerate(itemsets):
        # Start after `pos`: an itemset can never be joined with itself
        # because its own last item is not different from itself.
        for second in itemsets[pos + 1:]:
            same_prefix = first[:prefix_len] == second[:prefix_len]
            if same_prefix and first[prefix_len] != second[prefix_len]:
                tail = sorted([second[prefix_len], first[prefix_len]])
                candidates.append(first[:prefix_len] + tail)
    return candidates
def find_rule(d, support, confidence, ms=u'--'):
    """Mine association rules from a 0/1 transaction matrix (Apriori).

    Parameters
    ----------
    d : pandas.DataFrame
        One row per transaction and one column per item; the product of a
        set of columns along a row is used as "all items present".
    support : float
        Itemsets are kept only if their support is strictly greater.
    confidence : float
        Rules are kept only if their confidence is strictly greater.
    ms : str, optional
        Separator used to encode an itemset or rule as a single string.
        Item names must not contain it.

    Returns
    -------
    pandas.DataFrame
        Indexed by rule string (antecedent items followed by the consequent
        as the last item, joined by ``ms``) with columns ``support`` and
        ``confidence``, sorted by confidence, then support, descending.
        The same table is also printed.
    """
    # rule string -> [support, confidence]; turned into a frame once at the
    # end instead of growing a DataFrame column by column.
    rules = {}

    support_series = 1.0 * d.sum() / len(d)
    column = list(support_series[support_series > support].index)
    k = 0
    while len(column) > 1:
        k = k + 1
        print(u'\n正在进行第%s次搜索...' % k)
        column = connect_string(column, ms)
        print(u'数目：%s...' % len(column))
        if not column:
            # No candidate could be formed, so there is nothing to evaluate.
            break

        # Support of every candidate: fraction of rows where all items are 1.
        names = [ms.join(items) for items in column]
        d_2 = pd.DataFrame({
            name: d[items].prod(axis=1, numeric_only=True)
            for name, items in zip(names, column)
        })
        support_series_2 = 1.0 * d_2.sum() / len(d)
        column = list(support_series_2[support_series_2 > support].index)
        # Series.append was removed in pandas 2.0; pd.concat is the replacement.
        support_series = pd.concat([support_series, support_series_2])

        # For every frequent itemset, try each item in turn as the consequent
        # (moved to the last position); the remaining items keep their order.
        column2 = []
        for name in column:
            items = name.split(ms)
            for j in range(len(items)):
                column2.append(items[:j] + items[j + 1:] + items[j:j + 1])

        for rule_items in column2:
            itemset_support = support_series[ms.join(sorted(rule_items))]
            antecedent_support = support_series[ms.join(rule_items[:-1])]
            rule_confidence = itemset_support / antecedent_support
            if rule_confidence > confidence:
                rules[ms.join(rule_items)] = [itemset_support, rule_confidence]

    result = pd.DataFrame(rules, index=['support', 'confidence'])
    result = result.T.sort_values(['confidence', 'support'], ascending=False)
    print(u'\n结果为：')
    print(result)
    return result

In [21]:
# --- Step 1: turn the transaction table into a 0/1 item matrix -------------
# time.clock() was removed in Python 3.8; perf_counter() is the replacement
# for measuring elapsed time.
start = time.perf_counter()
print(u'\n转换原始数据至0-1矩阵...')


def ct(x):
    """Return a Series of ones indexed by the non-null items of one row."""
    return pd.Series(1, index=x[pd.notnull(x)])


# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
c = [ct(row) for row in data.values]
# NOTE(review): this overwrites `data` with the 0/1 matrix, so the cell cannot
# be re-run without reloading the transactions first.
data = pd.DataFrame(c).fillna(0)
end = time.perf_counter()
print(u'\n转换完毕，用时：%0.2f秒' % (end-start))

# --- Step 2: mine the association rules -------------------------------------
support = 0.06      # minimum support threshold
confidence = 0.75   # minimum confidence threshold
ms = '---'          # separator used inside rule strings

start = time.perf_counter()
print(u'\n开始搜索关联规则...')
find_rule(data, support, confidence, ms)
end = time.perf_counter()
print(u'\n搜索完成，用时：%0.2f秒' % (end-start))


转换原始数据至0-1矩阵...

转换完毕，用时：0.47秒

开始搜索关联规则...

正在进行第1次搜索...
数目：276...

正在进行第2次搜索...
数目：947...

正在进行第3次搜索...
数目：41...

结果为：
                    support  confidence
A3---F4---H4       0.078495    0.879518
C3---F4---H4       0.075269    0.875000
B2---F4---H4       0.062366    0.794521
C2---E3---D2       0.092473    0.754386
D2---F3---H4---A2  0.062366    0.753247

搜索完成，用时：2.25秒
