In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [10]:
pd.options.display.max_columns=100

In [2]:
#数据加载
movies = pd.read_csv('../../datasets/MovieLens/movies.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
movieId    27278 non-null int64
title      27278 non-null object
genres     27278 non-null object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [24]:
# 将genres进行one-hot编码（离散特征有多少取值，就用多少维来表示这个特征）
movies_hot_encoded = movies.drop('genres',1).join(movies.genres.str.get_dummies(sep = '|'))

In [25]:
movies_hot_encoded.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
#将movie数据转换成transformation
#重新设置索引
movies_hot_encoded.set_index(['movieId', 'title'], inplace= True)

In [27]:
movies_hot_encoded

Unnamed: 0_level_0,Unnamed: 1_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131254,Kein Bund für's Leben (2007),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
131256,"Feuer, Eis & Dosenbier (2002)",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
131258,The Pirates (2014),0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
131260,Rentun Ruusu (2001),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
itemsets = apriori(movies_hot_encoded, use_colnames=True, min_support=0.02)

In [31]:
#降序排列
itemsets.sort_values(by='support', ascending=False, inplace= True)

In [34]:
itemsets.head()

Unnamed: 0,support,itemsets
7,0.489185,(Drama)
4,0.306987,(Comedy)
14,0.153164,(Thriller)
12,0.151294,(Romance)
0,0.129042,(Action)


In [36]:
#设置提升度为2
rules = association_rules(itemsets, metric='lift', min_threshold=2)
rules.sort_values(by = 'lift', ascending = False)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Thriller),(Crime),0.153164,0.107743,0.045165,0.294878,2.736877,0.028662,1.265394
1,(Crime),(Thriller),0.107743,0.153164,0.045165,0.41919,2.736877,0.028662,1.458027
2,(Action),(Thriller),0.129042,0.153164,0.040655,0.315057,2.056994,0.020891,1.23636
3,(Thriller),(Action),0.153164,0.129042,0.040655,0.265438,2.056994,0.020891,1.185684
4,(Thriller),(Horror),0.153164,0.095718,0.039336,0.256821,2.6831,0.024675,1.216776
5,(Horror),(Thriller),0.095718,0.153164,0.039336,0.410954,2.6831,0.024675,1.437639
6,(Action),(Adventure),0.129042,0.08538,0.035633,0.276136,3.234198,0.024616,1.263525
7,(Adventure),(Action),0.08538,0.129042,0.035633,0.417347,3.234198,0.024616,1.494813
8,(Thriller),(Mystery),0.153164,0.055503,0.029144,0.190282,3.428352,0.020643,1.166453
9,(Mystery),(Thriller),0.055503,0.153164,0.029144,0.525099,3.428352,0.020643,1.783185


In [37]:
association_rules?