# Frequent Patterns Analysis

In [35]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

# lift pandas' display limits so frames are shown with all columns and all rows
for display_option in ('display.max_columns', 'display.max_rows'):
	pd.set_option(display_option, None)

In [36]:
# read the review-score table from the CSV file and preview its first rows
data_path = 'data/tripadvisor_review.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,User ID,Category 1,Category 2,Category 3,Category 4,Category 5,Category 6,Category 7,Category 8,Category 9,Category 10
0,User 1,0.93,1.8,2.29,0.62,0.8,2.42,3.19,2.79,1.82,2.42
1,User 2,1.02,2.2,2.66,0.64,1.42,3.18,3.21,2.63,1.86,2.32
2,User 3,1.22,0.8,0.54,0.53,0.24,1.54,3.18,2.8,1.31,2.5
3,User 4,0.45,1.8,0.29,0.57,0.46,1.52,3.18,2.96,1.57,2.86
4,User 5,0.51,1.2,1.18,0.57,1.54,2.02,3.18,2.78,1.18,2.54


## Preprocessing

In [26]:
# map the generic "Category N" headers to two-letter category codes
cols_dict = {
	"Category 1": "AG",   # art galleries
	"Category 2": "DC",   # dance clubs
	"Category 3": "JB",   # juice bars
	"Category 4": "RS",   # restaurants
	"Category 5": "MU",   # museums
	"Category 6": "RE",   # resorts
	"Category 7": "PP",   # parks/picnic spots
	"Category 8": "BE",   # beaches
	"Category 9": "TH",   # theaters
	"Category 10": "RI",  # religious institutions
}

# rebind df to the renamed frame; columns not listed in cols_dict keep their names
df = df.rename(columns=cols_dict)

In [27]:
# encode decimal review scores to binary values:
# 1 if the score is strictly greater than `threshold`, otherwise 0
# NOTE: this overwrites the score columns of df, so re-running this cell on the
# already-encoded frame turns every value into 0 (neither 0 nor 1 exceeds 2);
# re-run the notebook from the data-loading cell instead
threshold = 2
score_cols = df.columns[1:]  # every column after the first one (the first is skipped)
# one vectorized comparison replaces the per-column loop with a per-element lambda
df[score_cols] = (df[score_cols] > threshold).astype(int)
df.head()

Unnamed: 0,User ID,AG,DC,JB,RS,MU,RE,PP,BE,TH,RI
0,User 1,0,0,1,0,0,1,1,1,0,1
1,User 2,0,1,1,0,0,1,1,1,0,1
2,User 3,0,0,0,0,0,0,1,1,0,1
3,User 4,0,0,0,0,0,0,1,1,0,1
4,User 5,0,0,0,0,0,1,1,1,0,1


In [28]:
# build one list per row holding the names of the columns whose value is 1
item_cols = df.columns[1:]  # the first column is skipped
reviews = [
	[name for name in item_cols if row[name] == 1]
	for _, row in df.iterrows()
]
reviews[:5]

[['JB', 'RE', 'PP', 'BE', 'RI'],
 ['DC', 'JB', 'RE', 'PP', 'BE', 'RI'],
 ['PP', 'BE', 'RI'],
 ['PP', 'BE', 'RI'],
 ['RE', 'PP', 'BE', 'RI']]

In [29]:
# transform reviews into a transaction matrix
# fit the encoder on the item lists first, then encode those same lists
te = TransactionEncoder().fit(reviews)
te_ary = te.transform(reviews)
# label the encoded columns with the item names exposed by the encoder
review_df = pd.DataFrame(data=te_ary, columns=te.columns_)
review_df.head()

Unnamed: 0,AG,BE,DC,JB,MU,PP,RE,RI,RS,TH
0,False,True,False,True,False,True,True,True,False,False
1,False,True,True,True,False,True,True,True,False,False
2,False,True,False,False,False,True,False,True,False,False
3,False,True,False,False,False,True,False,True,False,False
4,False,True,False,False,False,True,True,True,False,False


## Apriori Algorithm

In [30]:
# mine the frequent itemsets (and their support) with the Apriori algorithm,
# using min_sup as the minimum support; use_colnames=True reports items by column name
min_sup = 0.1
frequent_itemsets_ap = apriori(
	review_df,
	min_support=min_sup,
	use_colnames=True,
)
frequent_itemsets_ap

Unnamed: 0,support,itemsets
0,1.0,(BE)
1,0.146,(JB)
2,1.0,(PP)
3,0.365,(RE)
4,1.0,(RI)
5,0.115,(TH)
6,0.146,"(BE, JB)"
7,1.0,"(BE, PP)"
8,0.365,"(BE, RE)"
9,1.0,"(BE, RI)"


In [31]:
# derive association rules from the Apriori itemsets,
# filtering on the confidence metric with min_threshold as the cut-off
min_threshold = 0.5
rules = association_rules(
	frequent_itemsets_ap,
	metric="confidence",
	min_threshold=min_threshold,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(JB),(BE),0.146,1.0,0.146,1.0,1.0,0.0,inf,0.0
1,(BE),(PP),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
2,(PP),(BE),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
3,(RE),(BE),0.365,1.0,0.365,1.0,1.0,0.0,inf,0.0
4,(BE),(RI),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
5,(RI),(BE),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
6,(TH),(BE),0.115,1.0,0.115,1.0,1.0,0.0,inf,0.0
7,(JB),(PP),0.146,1.0,0.146,1.0,1.0,0.0,inf,0.0
8,(JB),(RI),0.146,1.0,0.146,1.0,1.0,0.0,inf,0.0
9,(RE),(PP),0.365,1.0,0.365,1.0,1.0,0.0,inf,0.0


In [32]:
# evaluate initial results: report how many rows (rules) the `rules` frame holds
print(f"Number of rules: {len(rules)}")

Number of rules: 69


## FP-Growth Algorithm

In [33]:
# mine the frequent itemsets (and their support) with the FP-Growth algorithm,
# using min_sup as the minimum support; use_colnames=True reports items by column name
min_sup = 0.1
frequent_itemsets_fp = fpgrowth(
	review_df,
	min_support=min_sup,
	use_colnames=True,
)
frequent_itemsets_fp

Unnamed: 0,support,itemsets
0,1.0,(RI)
1,1.0,(PP)
2,1.0,(BE)
3,0.365,(RE)
4,0.146,(JB)
5,0.115,(TH)
6,1.0,"(RI, PP)"
7,1.0,"(BE, PP)"
8,1.0,"(BE, RI)"
9,1.0,"(BE, RI, PP)"


In [34]:
# derive association rules from the FP-Growth itemsets,
# filtering on the confidence metric with min_threshold as the cut-off
# NOTE: this rebinds `rules`, replacing the rules generated from the Apriori itemsets
min_threshold = 0.5
rules = association_rules(
	frequent_itemsets_fp,
	metric="confidence",
	min_threshold=min_threshold,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(RI),(PP),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
1,(PP),(RI),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
2,(BE),(PP),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
3,(PP),(BE),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
4,(BE),(RI),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
5,(RI),(BE),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
6,"(BE, RI)",(PP),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
7,"(BE, PP)",(RI),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
8,"(RI, PP)",(BE),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
9,(BE),"(RI, PP)",1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
