In [5]:
import pandas as pd
import os
data_folder=os.path.join(os.path.expanduser("~"),'Data','ml-100k')
ratings_filename=os.path.join(data_folder,'u.data')

In [7]:
all_ratings=pd.read_csv(ratings_filename,delimiter='\t',header=None,names=['userID','MovieID','Rating','Datetime'])
all_ratings['Datetime']=pd.to_datetime(all_ratings['Datetime'],unit='s')
all_ratings[:5]

Unnamed: 0,userID,MovieID,Rating,Datetime
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [8]:
all_ratings['Favorable']=all_ratings['Rating']>3
all_ratings[10:15]

Unnamed: 0,userID,MovieID,Rating,Datetime,Favorable
10,62,257,2,1997-11-12 22:07:14,False
11,286,1014,5,1997-11-17 15:38:45,True
12,200,222,5,1997-10-05 09:05:40,True
13,210,40,3,1998-03-27 21:59:54,False
14,224,29,3,1998-02-21 23:40:57,False


In [9]:
ratings=all_ratings[all_ratings['userID'].isin(range(200))]

In [10]:
ratings[:5]

Unnamed: 0,userID,MovieID,Rating,Datetime,Favorable
0,196,242,3,1997-12-04 15:55:49,False
1,186,302,3,1998-04-04 19:22:22,False
2,22,377,1,1997-11-07 07:18:36,False
4,166,346,1,1998-02-02 05:33:16,False
6,115,265,2,1997-12-03 17:51:28,False


In [12]:
favorable_ratings=ratings[ratings['Favorable']]
ratings[ratings['Favorable']][0:5]

Unnamed: 0,userID,MovieID,Rating,Datetime,Favorable
16,122,387,5,1997-11-11 17:47:39,True
20,119,392,4,1998-01-30 16:13:34,True
21,167,486,4,1998-04-16 14:54:12,True
26,38,95,5,1998-04-13 01:14:54,True
28,63,277,4,1997-10-01 23:10:01,True


In [18]:
favorable_reviews_by_users=dict((k,frozenset(v.values)) 
                                for k,v in favorable_ratings.groupby('userID')['MovieID'])
                                


In [85]:
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
num_favorable_by_movie.sort_values(by="Favorable", ascending=False)[:5]

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
50,100.0
100,89.0
258,83.0
181,79.0
174,74.0


In [41]:
frequent_itemsets={}
min_support=50


In [57]:
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                                for movie_id, row in num_favorable_by_movie.iterrows()
                                if row["Favorable"] > min_support)
    

In [60]:
from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users,k_1_itemsets,min_support):
    counts=defaultdict(int)
    for user,reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews-itemset:
                    current_superset=itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] +=1
    return dict([(itemset,frequency) for itemset,frequency in counts.items() if frequency>=min_support])
    
                

In [64]:
import sys
for k in range(2,20):
    cur_frequent_itemsets= \
    find_frequent_itemsets(favorable_reviews_by_users,frequent_itemsets[k-1],min_support)
    frequent_itemsets[k]=cur_frequent_itemsets
    
    
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets
del frequent_itemsets[1]

I found 93 frequent itemsets of length 2
I found 295 frequent itemsets of length 3
I found 593 frequent itemsets of length 4
I found 785 frequent itemsets of length 5
I found 677 frequent itemsets of length 6
I found 373 frequent itemsets of length 7
I found 126 frequent itemsets of length 8
I found 24 frequent itemsets of length 9
I found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11


In [66]:
candidate_rules=[]
for itemset_length,itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            
            premise=itemset-set((conclusion,))
            candidate_rules.append((premise,conclusion))
print candidate_rules[:5]

    

[(frozenset([50]), 64), (frozenset([64]), 50), (frozenset([127]), 181), (frozenset([181]), 127), (frozenset([127]), 1)]


In [71]:
correct_counts=defaultdict(int)
incorrect_counts=defaultdict(int)
for user,reviews in favorable_reviews_by_users.items():
    
    
    for candidate_rule in candidate_rules:
        premise,conclusion=candidate_rule

        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule]+=1
            else:
               incorrect_counts[candidate_rule]+=1 
                
rule_confidence={candidate_rule:correct_counts[candidate_rule] / float(correct_counts[candidate_rule]+ \
                                                                      incorrect_counts[candidate_rule]) for candidate_rule in candidate_rules}
            

    

In [73]:
from operator import itemgetter
sorted_confidence=sorted(rule_confidence.items(),key=itemgetter(1),reverse=True)
for index in range(5):
    print ('Rule #{0}'.format(index+1))
    (premise,conclusion)=sorted_confidence[index][0]
    print ('Rule: if a person recommends {0} they will also recommend {1}'.format(premise,conclusion))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("") 


Rule #1
Rule: if a person recommends frozenset([56, 258, 172, 181, 7]) they will also recommend 50
 - Confidence: 1.000

Rule #2
Rule: if a person recommends frozenset([98, 127, 172, 174, 7]) they will also recommend 64
 - Confidence: 1.000

Rule #3
Rule: if a person recommends frozenset([56, 1, 64, 127]) they will also recommend 98
 - Confidence: 1.000

Rule #4
Rule: if a person recommends frozenset([64, 100, 181, 174, 79]) they will also recommend 56
 - Confidence: 1.000

Rule #5
Rule: if a person recommends frozenset([56, 100, 181, 174, 127]) they will also recommend 50
 - Confidence: 1.000



In [74]:
movie_name_filename=os.path.join(data_folder,'u.item')
movie_name_data=pd.read_csv(movie_name_filename,delimiter='|',header=None,encoding='mac-roman')

In [75]:
movie_name_data.columns = ["MovieID", "Title", "Release Date", "Video Release", "IMDB", "<UNK>", "Action", "Adventure",
                           "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
                           "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

In [76]:
def get_movie_name(movie_id):
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title

In [77]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: If a person recommends Pulp Fiction (1994), Contact (1997), Empire Strikes Back, The (1980), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
 - Confidence: 1.000

Rule #2
Rule: If a person recommends Silence of the Lambs, The (1991), Godfather, The (1972), Empire Strikes Back, The (1980), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
 - Confidence: 1.000

Rule #3
Rule: If a person recommends Pulp Fiction (1994), Toy Story (1995), Shawshank Redemption, The (1994), Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
 - Confidence: 1.000

Rule #4
Rule: If a person recommends Shawshank Redemption, The (1994), Fargo (1996), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
 - Confidence: 1.000

Rule #5
Rule: If a person recommends Pulp Fiction (1994), Fargo (1996), Ret

In [80]:
# Evaluation using test data
test_dataset = all_ratings[~all_ratings['userID'].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
#test_not_favourable = test_dataset[~test_dataset["Favourable"]]
test_favorable_by_users = dict((k, frozenset(v.values)) for k, v in test_favorable.groupby("userID")["MovieID"])
#test_not_favourable_by_users = dict((k, frozenset(v.values)) for k, v in test_not_favourable.groupby("UserID")["MovieID"])
#test_users = test_dataset["UserID"].unique()

In [81]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [82]:
test_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in rule_confidence}
print(len(test_confidence))

15285


In [83]:
sorted_test_confidence = sorted(test_confidence.items(), key=itemgetter(1), reverse=True)
print(sorted_test_confidence[:5])

[((frozenset([64, 98, 7, 258, 174, 181]), 172), 1.0), ((frozenset([64, 1, 98, 7, 172, 79, 181, 56]), 174), 1.0), ((frozenset([64, 1, 98, 7, 172, 79, 50]), 174), 1.0), ((frozenset([64, 1, 98, 7, 79, 50]), 174), 1.0), ((frozenset([64, 1, 79, 181, 7]), 174), 1.0)]


In [84]:
for index in range(10):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Train Confidence: {0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    print(" - Test Confidence: {0:.3f}".format(test_confidence.get((premise, conclusion), -1)))
    print("")

Rule #1
Rule: If a person recommends Pulp Fiction (1994), Contact (1997), Empire Strikes Back, The (1980), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
 - Train Confidence: 1.000
 - Test Confidence: 0.966

Rule #2
Rule: If a person recommends Silence of the Lambs, The (1991), Godfather, The (1972), Empire Strikes Back, The (1980), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
 - Train Confidence: 1.000
 - Test Confidence: 0.854

Rule #3
Rule: If a person recommends Pulp Fiction (1994), Toy Story (1995), Shawshank Redemption, The (1994), Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
 - Train Confidence: 1.000
 - Test Confidence: 0.870

Rule #4
Rule: If a person recommends Shawshank Redemption, The (1994), Fargo (1996), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
 