In [1]:
# import pandas
import pandas as pd  

# from apyori import apriori
from apyori import apriori

In [4]:
# load data 
df = pd.read_csv('https://raw.githubusercontent.com/pirandello/apriori/master/movie_dataset.csv', 
                         header = None)

* explore the dataset

In [5]:
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,The Revenant,13 Hours,Allied,Zootopia,Jigsaw,Achorman,Grinch,Fast and Furious,Ghostbusters,Wolverine,Mad Max,John Wick,La La Land,The Good Dunosaur,Ninja Turtles,The Good Dunosaur Bad Moms,2 Guns,Inside Out,Valerian,Spiderman 3
1,Beirut,Martian,Get Out,,,,,,,,,,,,,,,,,
2,Deadpool,,,,,,,,,,,,,,,,,,,
3,X-Men,Allied,,,,,,,,,,,,,,,,,,
4,Ninja Turtles,Moana,Ghost in the Shell,Ralph Breaks the Internet,John Wick,,,,,,,,,,,,,,,


In [6]:
df.shape

(7501, 20)

7500 movies in records of 20 - looking to see which movies were watched together in significant frequency

* transform dataframe to list of lists (suitable format for apyori)

In [7]:
# make an empty list
records = []
for i in range(0, 7501):
    records.append([str(df.values[i,j]) for j in range(0, 20)])

* instantiate apriori and set the input params based on following constraints:
    * we want only movies that are purchased at least 40 times
    * the minimum confidence for the rules is 20%
    * the minumum lift is 3

In [8]:
min_support = 40 / 7501
min_confidence = 0.2
min_lift = 3
min_length = 2 # my parameter, to get more than 2 items in the rule

association_rules = apriori(records, min_support=min_support, min_confidence=min_confidence, min_lift=min_lift, min_length=min_length)
association_results = list(association_rules)

* how many association rules did we obtained ?

In [9]:
print(len(association_results))

32


* print the first association rule

In [10]:
print(association_results[0])

RelationRecord(items=frozenset({'Red Sparrow', 'Green Lantern'}), support=0.005732568990801226, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Red Sparrow'}), items_add=frozenset({'Green Lantern'}), confidence=0.3006993006993007, lift=3.790832696715049)])


Above: Relation records show Red Sparrow and Green Lantern watched together

* convert association rules to DataFrame 
    * use these columns: title_1, title_2, support, confidence, lift

In [36]:
results = []

# lets play around with these association results to extract what we need
print(association_results[0]) # the whole rule

for item in association_results:
    pair = item[0]
    items = [x for x in pair] # this gets them into a list format which can be indexed for the string values

    value0 = str(items[0])
    value1 = str(items[1])

    support = str(item[1])[:8] # this is the support of the rule, we will add this to 6 decimal places, with the 0.

    confidence = str(item[2][0][2])[:8] # this is the confidence of the rule
    lift = str(item[2][0][3])[:8] # this is the lift of the rule

    rows = (value0, value1, support, confidence, lift)
    results.append(rows)

col_labels = ['Title 1', 'Title 2', 'Support', 'Confidence', 'Lift']
results_df = pd.DataFrame(results, columns=col_labels)

results_df.head(10)


RelationRecord(items=frozenset({'Red Sparrow', 'Green Lantern'}), support=0.005732568990801226, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Red Sparrow'}), items_add=frozenset({'Green Lantern'}), confidence=0.3006993006993007, lift=3.790832696715049)])


Unnamed: 0,Title 1,Title 2,Support,Confidence,Lift
0,Red Sparrow,Green Lantern,0.005732,0.300699,3.790832
1,Star Wars,Green Lantern,0.005865,0.372881,4.700811
2,Jumanji,Kung Fu Panda,0.015997,0.32345,3.291993
3,Jumanji,Wonder Woman,0.005332,0.377358,3.840659
4,The Spy Who Dumped Me,Spiderman 3,0.007998,0.271493,4.12241
5,Intern,The Revenant,0.005332,0.232558,3.254512
6,,Red Sparrow,0.005732,0.300699,3.790832
7,Star Wars,,0.005865,0.372881,4.700811
8,Jumanji,Intern,0.008665,0.311004,3.165328
9,The Revenant,Ninja Turtles,0.007199,0.305084,3.200616


I used the string formatting once we extracted the numerical values to clean up the large trailing decimal places