In [1]:
#分析movieLens 电影分类中的频繁项集和关联规则
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the MovieLens movie catalogue from the current working directory.
movies = pd.read_csv(filepath_or_buffer='./movies.csv')
# Preview the first five rows to confirm the columns were parsed as expected.
print(movies.head(n=5))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [3]:
# One-hot encode `genres`: every distinct genre becomes its own 0/1 indicator
# column (one dimension per value the discrete feature can take).
# `Series.str.get_dummies` splits each pipe-separated string and builds the
# indicator columns in a single step.
# `columns=` is used instead of a positional axis argument, which was
# deprecated in pandas 1.x and is rejected by pandas 2.0+.
movies_hot_encoded = (
    movies
    .drop(columns='genres')
    .join(movies['genres'].str.get_dummies(sep='|'))
)

In [5]:
# Let pandas print up to 100 columns before it starts eliding them.
pd.set_option('display.max_columns', 100)

In [6]:
# Move the identifying columns (movieId, title) into the index so that they
# are no longer part of the frame's data columns.
movies_hot_encoded = movies_hot_encoded.set_index(keys=['movieId', 'title'])
print(movies_hot_encoded.head(n=5))

                                            (no genres listed)  Action  \
movieId title                                                            
1       Toy Story (1995)                                     0       0   
2       Jumanji (1995)                                       0       0   
3       Grumpier Old Men (1995)                              0       0   
4       Waiting to Exhale (1995)                             0       0   
5       Father of the Bride Part II (1995)                   0       0   

                                            Adventure  Animation  Children  \
movieId title                                                                
1       Toy Story (1995)                            1          1         1   
2       Jumanji (1995)                              1          0         1   
3       Grumpier Old Men (1995)                     0          0         0   
4       Waiting to Exhale (1995)                    0          0         0   
5       Fathe

In [11]:
# Mine frequent itemsets with a minimum support of 0.002 (an itemset must
# appear in at least 0.2% of all movies to be kept).
MIN_SUPPORT = 0.002
# The indicator columns hold 0/1 integers; cast them to bool, which is the
# input type mlxtend recommends for apriori. The resulting itemsets and
# supports are the same as for the 0/1 frame.
itemsets = apriori(
    movies_hot_encoded.astype(bool),
    use_colnames=True,  # report genre names instead of column positions
    min_support=MIN_SUPPORT,
)
print(itemsets)

      support                                   itemsets
0    0.009018                       ((no genres listed))
1    0.129042                                   (Action)
2    0.085380                                (Adventure)
3    0.037649                                (Animation)
4    0.041755                                 (Children)
5    0.306987                                   (Comedy)
6    0.107743                                    (Crime)
7    0.090586                              (Documentary)
8    0.489185                                    (Drama)
9    0.051763                                  (Fantasy)
10   0.012098                                (Film-Noir)
11   0.095718                                   (Horror)
12   0.007185                                     (IMAX)
13   0.037979                                  (Musical)
14   0.055503                                  (Mystery)
15   0.151294                                  (Romance)
16   0.063898                  

In [12]:
# Order the frequent itemsets by support from largest to smallest, so the
# most common genre combinations are listed first.
itemsets = itemsets.sort_values(by='support', ascending=False)
print('-'*20,'频繁项集','-'*20)
print(itemsets)

-------------------- 频繁项集 --------------------
     support itemsets
0        0.0       ()
1        0.0       ()
2        0.0       ()
3        0.0       ()
4        0.0       ()
5        0.0       ()
6        0.0       ()
7        0.0       ()
8        0.0       ()
9        0.0       ()
10       0.0       ()
11       0.0       ()
12       0.0       ()
13       0.0       ()
14       0.0       ()
15       0.0       ()
16       0.0       ()
17       0.0       ()
18       0.0       ()
19       0.0       ()
20       0.0       ()
21       0.0       ()
22       0.0       ()
23       0.0       ()
24       0.0       ()
25       0.0       ()
26       0.0       ()
27       0.0       ()
28       0.0       ()
29       0.0       ()
..       ...      ...
190      0.0       ()
191      0.0       ()
192      0.0       ()
193      0.0       ()
194      0.0       ()
195      0.0       ()
196      0.0       ()
197      0.0       ()
198      0.0       ()
199      0.0       ()
200      0.0       ()
201    