# Objective:
### To mine association rules between Genres of all movie users rated


In [1]:
import numpy as np
import pandas as pd
import pyfpgrowth
from apyori import apriori
from ast import literal_eval

# Loading Data (merged from reviews and movies datasets)

In [2]:
# load reviews
reviews = pd.read_csv('data/reviews.csv', index_col = 0)

# only 2 columns are needed from reviews, dataset too large, take samples of 10000 rows only
reviews = reviews.sample(10000)[['user_id', 'movie_id']]

reviews.sample(5)

Unnamed: 0,user_id,movie_id
70982,ur111457052,tt1502397
470160,ur67362690,tt9243946
403828,ur13052125,tt8106534
453637,ur0345596,tt8936646
178377,ur88745586,tt3778644


In [3]:
# load movies
movies = pd.read_csv('data/movies.csv', index_col = 0)

# only 2 columns are needed from movies
movies = movies[['movie_id', 'genre']]

movies.sample(5)

Unnamed: 0,movie_id,genre
250,tt10419266,"['Animation', 'Action', 'Adventure']"
1381,tt8503618,"['Biography', 'Drama', 'History']"
3442,tt7857374,"['Horror', 'Thriller']"
1603,tt8647310,"['Comedy', 'Drama']"
162,tt7832848,"['Action', 'Crime', 'Thriller']"


In [4]:
# link the user_id to the genres of the movies they reviewed by merging both dataset
user_review_genres = pd.merge(reviews, movies, on = 'movie_id', how = 'inner')

# convert the genre from string to list
user_review_genres['genre'] = user_review_genres['genre'].astype(str).apply(literal_eval)

# groupby user_id and merge all genre into single list
user_review_genres = user_review_genres.groupby('user_id')['genre'].agg('sum').reset_index()

# remove repeated genre in the list
user_review_genres['genre'] = user_review_genres['genre'].map(lambda x: list(set(x)))

user_review_genres.sample(5)

Unnamed: 0,user_id,genre
6269,ur95193034,"[Comedy, Action, Adventure]"
4538,ur63262063,"[Drama, Mystery, Adventure]"
4031,ur54448273,[Comedy]
4237,ur57998263,"[Thriller, Horror, Mystery]"
5014,ur7033332,"[Comedy, Crime]"


# Apriori Algorithm

In [5]:
# Apply apriori algorithm to the genre column
apriori_association_rules = apriori(user_review_genres['genre'], min_support = 0.05)
apriori_association_results = list(apriori_association_rules)


In [6]:
# view results in dataframe order by support desc
pd.DataFrame(apriori_association_results).sort_values(by ='support', ascending = False).head(10)

Unnamed: 0,items,support,ordered_statistics
5,(Drama),0.513946,"[((), (Drama), 0.5139460448102423, 1.0)]"
0,(Action),0.386831,"[((), (Action), 0.3868312757201646, 1.0)]"
1,(Adventure),0.341259,"[((), (Adventure), 0.34125895442767873, 1.0)]"
3,(Comedy),0.276025,"[((), (Comedy), 0.27602499618960524, 1.0)]"
12,(Thriller),0.242951,"[((), (Thriller), 0.2429507696997409, 1.0)]"
13,"(Action, Adventure)",0.241427,"[((), (Action, Adventure), 0.24142661179698216..."
4,(Crime),0.223442,"[((), (Crime), 0.2234415485444292, 1.0)]"
8,(Horror),0.180613,"[((), (Horror), 0.18061271147690902, 1.0)]"
26,"(Drama, Crime)",0.154245,"[((), (Drama, Crime), 0.15424477975918305, 1.0..."
11,(Sci-Fi),0.139918,"[((), (Sci-Fi), 0.13991769547325103, 1.0)]"


In [7]:
# For better view, unpack ordered_statistics
# codes below is taken from https://stackoverflow.com/questions/52688220/python-apyori-sorting-by-lift

apriori_rules = pd.DataFrame(columns=('items','antecedent','consequent','support','confidence','lift'))

Support =[]
Confidence = []
Lift = []
Items = []
Antecedent = []
Consequent=[]

for RelationRecord in apriori_association_results:
    for ordered_stat in RelationRecord.ordered_statistics:
        Support.append(RelationRecord.support)
        Items.append(RelationRecord.items)
        Antecedent.append(ordered_stat.items_base)
        Consequent.append(ordered_stat.items_add)
        Confidence.append(ordered_stat.confidence)
        Lift.append(ordered_stat.lift)

apriori_rules['items'] = list(map(set, Items))                                   
apriori_rules['antecedent'] = list(map(set, Antecedent))
apriori_rules['consequent'] = list(map(set, Consequent))
apriori_rules['support'] = Support
apriori_rules['confidence'] = Confidence
apriori_rules['lift']= Lift


## Frequently occurring itemsets

In [8]:
# Find frequent occurring items
apriori_itemsets = pd.DataFrame(
    apriori_rules['items'].apply(
        lambda x: sorted(set(x))
    )
)

# create a temp string to drop duplicates
apriori_itemsets['temp'] = apriori_itemsets['items'].apply("_".join)
apriori_itemsets = apriori_itemsets.drop_duplicates(['temp'])[['items']]

# first 20 results, not in any particular order
apriori_itemsets.head(20)

Unnamed: 0,items
0,[Action]
1,[Adventure]
2,[Biography]
3,[Comedy]
4,[Crime]
5,[Drama]
6,[Family]
7,[Fantasy]
8,[Horror]
9,[Mystery]


## Association Rules

In [9]:

apriori_rules.sort_values(by ='lift', ascending = False, inplace = True)

# Top 20 results sorted by Lift descending
apriori_rules.head(20)


Unnamed: 0,items,antecedent,consequent,support,confidence,lift
68,"{Horror, Mystery}",{Horror},{Mystery},0.089011,0.492827,3.712328
69,"{Horror, Mystery}",{Mystery},{Horror},0.089011,0.670494,3.712328
82,"{Fantasy, Action, Adventure}",{Adventure},"{Fantasy, Action}",0.054108,0.158553,2.850043
85,"{Fantasy, Action, Adventure}","{Fantasy, Action}",{Adventure},0.054108,0.972603,2.850043
100,"{Thriller, Drama, Crime}","{Thriller, Drama}",{Crime},0.067673,0.559899,2.505797
95,"{Thriller, Drama, Crime}",{Crime},"{Thriller, Drama}",0.067673,0.302865,2.505797
89,"{Action, Sci-Fi, Adventure}",{Adventure},"{Action, Sci-Fi}",0.079866,0.234033,2.468635
92,"{Action, Sci-Fi, Adventure}","{Action, Sci-Fi}",{Adventure},0.079866,0.842444,2.468635
93,"{Action, Sci-Fi, Adventure}","{Sci-Fi, Adventure}",{Action},0.079866,0.927434,2.397515
88,"{Action, Sci-Fi, Adventure}",{Action},"{Adventure, Sci-Fi}",0.079866,0.206462,2.397515


# FP Growth Algorithm

## Frequent Pattern

In [10]:
fpgrowth_patterns = pyfpgrowth.find_frequent_patterns(user_review_genres['genre'], 50)

In [11]:
fpgrowth_itemsets = pd.DataFrame(fpgrowth_patterns.items())
fpgrowth_itemsets.columns = ['items', 'support']

# first 20 results, not in any particular order
fpgrowth_itemsets.head(20)

Unnamed: 0,items,support
0,"(History,)",60
1,"(Music,)",97
2,"(Drama, Music)",64
3,"(War,)",139
4,"(Drama, War)",133
5,"(Action, Animation)",85
6,"(Action, Adventure, Animation)",78
7,"(Animation, Drama)",94
8,"(Adventure, Animation, Drama)",93
9,"(Animation, Comedy)",186


## Association Rules

In [12]:
fpgrowth_rules = pd.DataFrame(pyfpgrowth.generate_association_rules(fpgrowth_patterns, 0.5).items())

In [13]:
fpgrowth_rules.columns = ['antecedent', 'consequent_confidence']

In [14]:
fpgrowth_rules[['consequent','confidence']] = pd.DataFrame(
    fpgrowth_rules['consequent_confidence'].tolist(), index=fpgrowth_rules.index
)
fpgrowth_rules.drop(['consequent_confidence'], axis = 1, inplace = True)
fpgrowth_rules.sort_values(by ='confidence', ascending = False, inplace = True)

# Top 20 results sorted by Confidence descending
fpgrowth_rules.head(20)

Unnamed: 0,antecedent,consequent,confidence
3,"(Animation, Drama)","(Adventure,)",0.989362
11,"(Action, Biography)","(Drama,)",0.988095
12,"(Biography, Crime)","(Drama,)",0.983957
15,"(Action, Fantasy)","(Adventure,)",0.972603
9,"(Family, Fantasy)","(Adventure,)",0.96732
1,"(War,)","(Drama,)",0.956835
10,"(Biography, Comedy)","(Drama,)",0.936508
5,"(Animation, Comedy)","(Adventure,)",0.930108
22,"(Adventure, Sci-Fi)","(Action,)",0.927434
2,"(Action, Animation)","(Adventure,)",0.917647


In [15]:
user_review_genres.to_csv('data/user_review_genres.csv')
apriori_itemsets.to_csv('data/apriori_itemsets.csv')
apriori_rules.to_csv('data/apriori_rules.csv')
fpgrowth_itemsets.to_csv('data/fpgrowth_itemsets.csv')
fpgrowth_rules.to_csv('data/fpgrowth_rules.csv')