In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from apyori import apriori

# reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols,encoding='latin-1')

# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols,encoding='latin-1')

# reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('u.item', sep='|', names=i_cols,
encoding='latin-1')

# After loading the dataset, we should look at the content of each file (users, ratings, items).

# Looking at the user file
print("\nUser Data :")
print("shape : ", users.shape)
#print(users.head())

# We have 943 users in the dataset and each user has 5 features, i.e. user_ID, age, sex, occupation and zip_code. Now let’s look at the ratings file.

# Ratings Data
print("\nRatings Data :")
print("shape : ", ratings.shape)
#print(ratings.head())

# We have 100k ratings for different user and movie combinations. Now finally examine the items file.

# Item Data
print("\nItem Data :")
print("shape : ", items.shape)
#print(items.head())


User Data :
shape :  (943, 5)

Ratings Data :
shape :  (100000, 4)

Item Data :
shape :  (1682, 24)


In [9]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [10]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [None]:
items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


# **Apriori Algorithm**

In [11]:
# I just want to deal with favorable reviews in my dataset
ratings["favorable_rev"] =ratings["rating"] > 3
ratings[10:15]

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,favorable_rev
10,62,257,2,879372434,False
11,286,1014,5,879781125,True
12,200,222,5,876042340,True
13,210,40,3,891035994,False
14,224,29,3,888104457,False


In [None]:
#Total number of unique users
len(ratings['user_id'].unique())

943

In [None]:
# The dataset has been sampled because of the longer running time hence we would deal with 220 users with preferrable ratings
ratings = ratings[ratings['user_id'].isin(range(220))] 
# We start by creating a dataset of each user's favourable reviews
favorable_ratings = ratings[ratings["favorable_rev"]]
favorable_ratings[:5]

In [13]:
# Extracting the users having more than one reviews
favorable_reviews_by_users = dict((k, v.values) for k, v in favorable_ratings.groupby("user_id")["movie_id"])
len(favorable_reviews_by_users)

219

In [None]:
#favorable_reviews_by_users depicts the dictionary having key as user_id and values as the movie_ids they have highly rated
favorable_reviews_by_users[1]

In [15]:
# Find out how many movies have favourable ratingsthe descending order by which movies have got the number of favorable ratings
favorable_ratings_by_movie = ratings[["movie_id", "favorable_rev"]].groupby("movie_id").sum()
favorable_ratings_by_movie.sort_values(by=["favorable_rev"], ascending=False)[:5]

Unnamed: 0_level_0,favorable_rev
movie_id,Unnamed: 1_level_1
50,108
100,94
258,89
181,84
174,81


In [None]:
#Here's a look at starting movieids and the number of received favorable ratings
favorable_ratings_by_movie.head(8)

In [16]:
frequent_itemsets = {}
min_support = 40
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["favorable_rev"]) for movie_id, row in favorable_ratings_by_movie.iterrows() if row["favorable_rev"] > min_support)

In [None]:
#Now frequent_itemsets[1] depcits a dictionary having movieids as a key and number of favorable reviews as values
frequent_itemsets[1]

In [18]:
from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support): #k_1 implies k-1
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
      #itemset contains only those movies which have been given favorable ratings above a threshold min_support value
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in set(reviews) - itemset:
                    #Now the other reviewed movie and the itemset movie has been watched by user, so increase its count by 1
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
                    #print('counts=',counts)
                    #print('current_superset=',current_superset)
    #Now frequency greater than min_support is taken as a condition because obviously we desire that the confidence of watching b given 
    # a should be atleast greater than a threshold   
    #print(counts)             
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [21]:
import sys
frequent_itemsets = {}  # itemsets are sorted by length
min_support = 60

# k=1 candidates are the movies with more than min_support favourable reviews
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["favorable_rev"])
                                for movie_id, row in favorable_ratings_by_movie.iterrows()
                                if row["favorable_rev"] > min_support)

print("There are {} movies with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
sys.stdout.flush()
for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets having a frequency greater than min_support
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("{} frequent itemsets of length {} have been found".format(len(cur_frequent_itemsets), k))
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets        
# Obviously in market basket analysis, it doesn't make sense to take consideration of unique movies as we desire a set of atleast two watched movies.
del frequent_itemsets[1]

There are 15 movies with more than 60 favorable reviews
82 frequent itemsets of length 2 have been found
246 frequent itemsets of length 3 have been found
450 frequent itemsets of length 4 have been found
517 frequent itemsets of length 5 have been found
374 frequent itemsets of length 6 have been found
165 frequent itemsets of length 7 have been found
40 frequent itemsets of length 8 have been found
4 frequent itemsets of length 9 have been found
Did not find any frequent itemsets of length 10


In [None]:
#The below line will display the movie ids in keys along with their frequencies as values and its gonna increase from 1 movie to 2 movies which are 
#highly watched/rated together 
print(frequent_itemsets) 

In [24]:
frequent_itemsets[9]

{frozenset({7, 50, 56, 64, 79, 98, 100, 172, 174}): 117,
 frozenset({7, 50, 56, 64, 79, 98, 172, 174, 181}): 135,
 frozenset({7, 50, 56, 64, 98, 100, 172, 174, 181}): 108,
 frozenset({7, 50, 56, 79, 98, 172, 174, 181, 258}): 126}

In [25]:
# for lets look at the movies in frequent_itemsets[9]. These have the hisghest chances of occuring together 
u=[]
for itemset in frequent_itemsets[9]:
  v=[]
  for k in set(itemset):
    v.append(list(items[items['movie id']==k]['movie title']))
  u.append(v) 
u=np.array(u)  
u  
    


array([[['Shawshank Redemption, The (1994)'],
        ['Silence of the Lambs, The (1991)'],
        ['Fargo (1996)'],
        ['Twelve Monkeys (1995)'],
        ['Empire Strikes Back, The (1980)'],
        ['Raiders of the Lost Ark (1981)'],
        ['Fugitive, The (1993)'],
        ['Star Wars (1977)'],
        ['Pulp Fiction (1994)']],

       [['Shawshank Redemption, The (1994)'],
        ['Silence of the Lambs, The (1991)'],
        ['Twelve Monkeys (1995)'],
        ['Empire Strikes Back, The (1980)'],
        ['Raiders of the Lost Ark (1981)'],
        ['Fugitive, The (1993)'],
        ['Star Wars (1977)'],
        ['Return of the Jedi (1983)'],
        ['Pulp Fiction (1994)']],

       [['Shawshank Redemption, The (1994)'],
        ['Silence of the Lambs, The (1991)'],
        ['Fargo (1996)'],
        ['Twelve Monkeys (1995)'],
        ['Empire Strikes Back, The (1980)'],
        ['Raiders of the Lost Ark (1981)'],
        ['Star Wars (1977)'],
        ['Return of the Jedi (198