In [1]:
#分析movieLens 电影分类中的频繁项集和关联规则
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the MovieLens movie catalogue from the local CSV and preview the
# first five rows.
movies = pd.read_csv(filepath_or_buffer='./movies.csv')
print(movies.head(5))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [3]:
# One-hot encode `genres`: the pipe-separated genre string is expanded into one
# 0/1 indicator column per distinct genre (a discrete feature with k possible
# values is represented by k columns).
# `Series.str.get_dummies` is the pandas way to one-hot encode such strings.
# The column to remove is named through `columns=` because passing the axis
# positionally (`drop('genres', 1)`) was deprecated in pandas 1.3 and is
# rejected with a TypeError from pandas 2.0 on.
movies_hot_encoded = (
    movies
    .drop(columns='genres')
    .join(movies.genres.str.get_dummies(sep='|'))
)

In [5]:
# Let pandas print up to 100 columns before it starts eliding them.
pd.set_option('display.max_columns', 100)

In [6]:
# Move movieId and title into the row index so that only the 0/1 genre
# indicator columns remain as data columns.
# The frame is re-assigned instead of mutated with `inplace=True`, and the
# guard keeps this cell re-runnable: once the key columns live in the index,
# a second `set_index` on them would raise a KeyError.
index_keys = ['movieId', 'title']
if list(movies_hot_encoded.index.names) != index_keys:
    movies_hot_encoded = movies_hot_encoded.set_index(index_keys)
print(movies_hot_encoded.head())

                                            (no genres listed)  Action  \
movieId title                                                            
1       Toy Story (1995)                                     0       0   
2       Jumanji (1995)                                       0       0   
3       Grumpier Old Men (1995)                              0       0   
4       Waiting to Exhale (1995)                             0       0   
5       Father of the Bride Part II (1995)                   0       0   

                                            Adventure  Animation  Children  \
movieId title                                                                
1       Toy Story (1995)                            1          1         1   
2       Jumanji (1995)                              1          0         1   
3       Grumpier Old Men (1995)                     0          0         0   
4       Waiting to Exhale (1995)                    0          0         0   
5       Fathe

In [16]:
# Mine frequent itemsets with a minimum support of 0.002, i.e. a genre
# combination must appear in at least 0.2% of all movies to be kept.
# The 0/1 indicators are cast to bool first: mlxtend expects a boolean
# one-hot frame and emits a deprecation warning for other dtypes; the mined
# itemsets are the same either way.
itemsets = apriori(movies_hot_encoded.astype(bool),use_colnames=True,min_support=0.002)

In [17]:
# Order the frequent itemsets from highest to lowest support, then print
# them under a dashed banner.
itemsets = itemsets.sort_values(by=['support'], ascending=[False])
print('-' * 20, '频繁项集', '-' * 20, sep=' ')
print(itemsets)

-------------------- 频繁项集 --------------------
      support                                itemsets
8    0.489185                                 (Drama)
5    0.306987                                (Comedy)
17   0.153164                              (Thriller)
15   0.151294                               (Romance)
1    0.129042                                (Action)
6    0.107743                                 (Crime)
11   0.095718                                (Horror)
89   0.094325                        (Drama, Romance)
65   0.093335                         (Drama, Comedy)
7    0.090586                           (Documentary)
2    0.085380                             (Adventure)
70   0.069470                       (Comedy, Romance)
91   0.068480                       (Drama, Thriller)
16   0.063898                                (Sci-Fi)
75   0.062761                          (Crime, Drama)
14   0.055503                               (Mystery)
9    0.051763                      

In [19]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose lift is at least 2.
rules = association_rules(
    itemsets,
    min_threshold=2,
    metric='lift',
)

In [21]:
# Order the rules from highest to lowest lift, then print them under an
# underscore banner.
rules = rules.sort_values(by=['lift'], ascending=[False])
print('_' * 20, '关联规则', '_' * 20, sep=' ')
print(rules)

____________________ 关联规则 ____________________
                         antecedents                      consequents  \
273              (Children, Fantasy)           (Adventure, Animation)   
268           (Adventure, Animation)              (Children, Fantasy)   
272            (Adventure, Children)             (Fantasy, Animation)   
269             (Fantasy, Animation)            (Adventure, Children)   
210   (Comedy, Adventure, Animation)                       (Children)   
221                       (Children)   (Comedy, Adventure, Animation)   
281                       (Children)             (Animation, Musical)   
280             (Animation, Musical)                       (Children)   
214              (Comedy, Animation)            (Adventure, Children)   
217            (Adventure, Children)              (Comedy, Animation)   
279              (Children, Musical)                      (Animation)   
282                      (Animation)              (Children, Musical)   
220 