# Most-Popular Recommender

**Idea: Recommend the most popular interests that are not already among the user's own interests.**

**Caveat: "most-popular" does not take the user's personal interests into account.**

In [2]:
from collections import Counter
from operator import itemgetter

In [10]:
# Interests of every user: row i lists the interests of user i, so a user's ID
# is the index of its row in this list.
usersInterests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [29]:
# MOST-POPULAR RANKING: SIMPLE COUNTER
# Tally how many users list each interest. The generator expression walks
# every user's list in turn, i.e. it flattens the list of lists.
popularInterests = Counter()
popularInterests.update(interest
                        for interests in usersInterests
                        for interest in interests)
# A Counter is a dict subclass and is not itself sorted; only its printed
# form lists the entries from most to least common.
print(popularInterests)

Counter({'Python': 4, 'R': 4, 'Java': 3, 'regression': 3, 'statistics': 3, 'probability': 3, 'HBase': 3, 'Big Data': 3, 'neural networks': 2, 'Hadoop': 2, 'deep learning': 2, 'pandas': 2, 'artificial intelligence': 2, 'libsvm': 2, 'C++': 2, 'Postgres': 2, 'MongoDB': 2, 'scikit-learn': 2, 'machine learning': 2, 'statsmodels': 2, 'Cassandra': 2, 'NoSQL': 1, 'Mahout': 1, 'Storm': 1, 'MySQL': 1, 'programming languages': 1, 'Haskell': 1, 'mathematics': 1, 'Spark': 1, 'numpy': 1, 'theory': 1, 'decision trees': 1, 'MapReduce': 1, 'scipy': 1, 'databases': 1, 'support vector machines': 1})


In [30]:
# Replace the Counter with a plain list of (interest, count) pairs ordered
# from the highest count to the lowest, which is convenient to scan in order.
popularInterests = popularInterests.most_common()
print(popularInterests)

[('Python', 4), ('R', 4), ('Java', 3), ('regression', 3), ('statistics', 3), ('probability', 3), ('HBase', 3), ('Big Data', 3), ('neural networks', 2), ('Hadoop', 2), ('deep learning', 2), ('pandas', 2), ('artificial intelligence', 2), ('libsvm', 2), ('C++', 2), ('Postgres', 2), ('MongoDB', 2), ('scikit-learn', 2), ('machine learning', 2), ('statsmodels', 2), ('Cassandra', 2), ('NoSQL', 1), ('Mahout', 1), ('Storm', 1), ('MySQL', 1), ('programming languages', 1), ('Haskell', 1), ('mathematics', 1), ('Spark', 1), ('numpy', 1), ('theory', 1), ('decision trees', 1), ('MapReduce', 1), ('scipy', 1), ('databases', 1), ('support vector machines', 1)]


In [31]:
def most_popular_new_interests(userInterests, maxResults=5):
    """Return the most popular interests the user does not already have.

    Walks the global popularInterests list of (interest, count) pairs in its
    existing order, drops every interest found in userInterests, and returns
    the first maxResults of the remaining pairs.
    """
    candidates = []
    for name, count in popularInterests:
        if name in userInterests:
            continue  # the user already has this one
        candidates.append((name, count))
    return candidates[:maxResults]
# Example: popularity-based suggestions for the first user (index 0).
user0Interests = usersInterests[0]
most_popular_new_interests(user0Interests)

[('Python', 4),
 ('R', 4),
 ('regression', 3),
 ('statistics', 3),
 ('probability', 3)]

# User-Based Collaborative Filtering

**Idea: Find the users most similar to a given user and recommend what those users are interested in.**

**Caveat: Doesn't work well when the number of items gets large.**

In [99]:
import numpy as np
import theano.tensor as T
from theano import function
from collections import defaultdict

In [59]:
# SIMILARITY MEASURE: COSINE
# Symbolic expression for cos(v, w) = (v . w) / sqrt((v . v) * (w . w)),
# turned by theano's function() into a callable that takes two vectors.
# NOTE(review): nothing here guards against an all-zero vector, which makes
# the denominator 0 -- confirm callers never pass one.
v = T.vector('v')
w = T.vector('w')
vwCosine = T.dot(v,w) / T.sqrt(T.dot(v,v)*T.dot(w,w))
cosine_similarity = function([v,w], vwCosine)

In [44]:
# Sorted list of every distinct interest across all users. The set
# comprehension {...} removes duplicates; sorting uses plain string order,
# so capitalized names come before lowercase ones.
uniqueInterests = sorted({interest
                          for interests in usersInterests
                          for interest in interests})
print(uniqueInterests)

['Big Data', 'C++', 'Cassandra', 'HBase', 'Hadoop', 'Haskell', 'Java', 'Mahout', 'MapReduce', 'MongoDB', 'MySQL', 'NoSQL', 'Postgres', 'Python', 'R', 'Spark', 'Storm', 'artificial intelligence', 'databases', 'decision trees', 'deep learning', 'libsvm', 'machine learning', 'mathematics', 'neural networks', 'numpy', 'pandas', 'probability', 'programming languages', 'regression', 'scikit-learn', 'scipy', 'statistics', 'statsmodels', 'support vector machines', 'theory']


In [47]:
def make_user_interest_vector(userInterests):
    """Encode a user's interests as a 0/1 vector aligned with uniqueInterests.

    Position k of the result is 1 when uniqueInterests[k] appears in
    userInterests and 0 otherwise, so the result has len(uniqueInterests)
    cells.
    """
    vector = []
    for candidate in uniqueInterests:
        vector.append(int(candidate in userInterests))
    return vector

In [51]:
# One 0/1 interest vector per user, in the same order as usersInterests.
# Built as a real list (not a bare map(), which is a one-shot iterator on
# Python 3) because the matrix is iterated more than once further down.
userInterestMatrix = [make_user_interest_vector(interests)
                      for interests in usersInterests]
for interestVector in userInterestMatrix:
    print(interestVector)

[1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
[0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 1, 0, 

================================================================================================

### A. Naive Pairwise Cosine (for small data)

In [91]:
%%time
# Pairwise cosine similarity between every two users, one cosine_similarity
# call per pair; row i holds user i's similarity to each user (itself included).
userSimilarities = []
for interestVector_i in userInterestMatrix:
    similarityRow = []
    for interestVector_j in userInterestMatrix:
        # cosine_similarity produces array(val); .item() unwraps the value.
        similarityRow.append(
            cosine_similarity(interestVector_i, interestVector_j).item())
    userSimilarities.append(similarityRow)
for userSimilarityVector in userSimilarities:
    print(userSimilarityVector)

[1.0, 0.3380617018914066, 0.0, 0.0, 0.0, 0.1543033499620919, 0.0, 0.0, 0.1889822365046136, 0.5669467095138409, 0.0, 0.0, 0.0, 0.1690308509457033, 0.0]
[0.3380617018914066, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6, 0.0]
[0.0, 0.0, 1.0, 0.18257418583505536, 0.0, 0.16666666666666666, 0.0, 0.20412414523193154, 0.0, 0.0, 0.23570226039551587, 0.0, 0.47140452079103173, 0.0, 0.0]
[0.0, 0.0, 0.18257418583505536, 1.0, 0.22360679774997896, 0.3651483716701107, 0.4472135954999579, 0.0, 0.0, 0.0, 0.5163977794943222, 0.22360679774997896, 0.5163977794943222, 0.0, 0.2581988897471611]
[0.0, 0.0, 0.0, 0.22360679774997896, 1.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5773502691896258]
[0.1543033499620919, 0.0, 0.16666666666666666, 0.3651483716701107, 0.0, 1.0, 0.0, 0.0, 0.0, 0.20412414523193154, 0.23570226039551587, 0.20412414523193154, 0.47140452079103173, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.4472135954999579, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.2886751345948129, 0.25, 0.0, 0.0, 0.0]
[0

### B. Efficient Cosine Similarity Matrix Computation

In [92]:
# Turn the per-user interest vectors into a NumPy array so that array
# arithmetic (row-wise division, np.dot) can be applied to the whole matrix.
userInterestMatrix = np.asarray(userInterestMatrix)
def cosine(interestMatrix):
    """Return the matrix of pairwise cosine similarities between rows.

    Args:
        interestMatrix: 2-D array-like (NumPy array or list of equal-length
            rows) holding one vector per row.

    Returns:
        A square float array whose [i, j] entry is the cosine similarity of
        rows i and j. A row of all zeros has no direction, so every
        similarity involving it (including with itself) is reported as 0
        instead of NaN.
    """
    matrix = np.asarray(interestMatrix, dtype=float)
    # Euclidean norm of every row at once, kept as a column for broadcasting.
    rowNorms = np.sqrt((matrix * matrix).sum(axis=1))[:, np.newaxis]
    # Dividing an all-zero row by 1 leaves it zero and avoids a 0/0 NaN.
    rowNorms[rowNorms == 0] = 1.0
    normalized = matrix / rowNorms
    # Dot products between unit-length rows are exactly their cosines.
    return np.dot(normalized, normalized.T)

In [94]:
%%time
# All pairwise user similarities in one call to the NumPy-based cosine().
userSimilarities = cosine(userInterestMatrix)
print userSimilarities

[[ 1.          0.3380617   0.          0.          0.          0.15430335
   0.          0.          0.18898224  0.56694671  0.          0.          0.
   0.16903085  0.        ]
 [ 0.3380617   1.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.6
   0.        ]
 [ 0.          0.          1.          0.18257419  0.          0.16666667
   0.          0.20412415  0.          0.          0.23570226  0.
   0.47140452  0.          0.        ]
 [ 0.          0.          0.18257419  1.          0.2236068   0.36514837
   0.4472136   0.          0.          0.          0.51639778  0.2236068
   0.51639778  0.          0.25819889]
 [ 0.          0.          0.          0.2236068   1.          0.          0.
   0.25        0.          0.          0.          0.          0.          0.
   0.57735027]
 [ 0.15430335  0.          0.16666667  0.36514837  0.          1.          0.
   0.          0.          0.204124

================================================================================================

In [97]:
# FIND THE MOST SIMILAR OTHER USERS
def most_similar_users_to(userID):
    """Rank the other users by their similarity to userID.

    Reads row userID of the global userSimilarities matrix.

    Returns:
        A list of (otherUserID, similarity) pairs sorted from most to least
        similar. The user itself and every user whose similarity is not
        positive are left out; there is no cap on the number of results.
    """
    pairs = [(otherUserID, similarity)
             for otherUserID, similarity in enumerate(userSimilarities[userID])
             if otherUserID != userID and similarity > 0]
    # itemgetter(1) selects the similarity; unlike a tuple-unpacking lambda it
    # is valid syntax on both Python 2 and Python 3.
    return sorted(pairs, key=itemgetter(1), reverse=True)

In [98]:
# Example: the other users ranked by similarity to user 0.
most_similar_users_to(0)

[(9, 0.56694670951384074),
 (1, 0.33806170189140661),
 (8, 0.1889822365046136),
 (13, 0.1690308509457033),
 (5, 0.15430334996209191)]

In [100]:
# MAKING RECOMMENDATION
#  for each interest, add up the similarities of the similar users who have it.
#  NB: the interest pool is all the interests of the users similar to the
#  current user.
def user_based_suggestions(userID, includeCurrentInterests=False):
    """Suggest interests to userID based on what similar users like.

    Every interest of every user returned by most_similar_users_to(userID)
    is weighted by the sum of those users' similarities to userID.

    Args:
        userID: index of the user in usersInterests.
        includeCurrentInterests: when True, interests the user already has
            are kept in the result; by default they are filtered out.

    Returns:
        A list of (interest, weight) pairs sorted by descending weight.
    """
    suggestions = defaultdict(float)
    for otherUserID, similarity in most_similar_users_to(userID):
        for interest in usersInterests[otherUserID]:
            suggestions[interest] += similarity
    # itemgetter(1) sorts on the weight and, unlike a tuple-unpacking lambda,
    # is valid on both Python 2 and Python 3.
    ranked = sorted(suggestions.items(), key=itemgetter(1), reverse=True)
    if includeCurrentInterests:
        return ranked
    ownInterests = usersInterests[userID]
    return [(interest, weight) for interest, weight in ranked
            if interest not in ownInterests]

In [101]:
# Example: user-based suggestions for user 0, current interests excluded.
user_based_suggestions(0)

[('MapReduce', 0.56694670951384074),
 ('MongoDB', 0.50709255283710997),
 ('Postgres', 0.50709255283710997),
 ('NoSQL', 0.33806170189140661),
 ('neural networks', 0.1889822365046136),
 ('deep learning', 0.1889822365046136),
 ('artificial intelligence', 0.1889822365046136),
 ('databases', 0.1690308509457033),
 ('MySQL', 0.1690308509457033),
 ('programming languages', 0.15430334996209191),
 ('Python', 0.15430334996209191),
 ('Haskell', 0.15430334996209191),
 ('C++', 0.15430334996209191),
 ('R', 0.15430334996209191)]

# Item-Based Collaborative Filtering

In [119]:
# INTEREST-USER MATRIX
# Transpose of userInterestMatrix: row j holds, for interest j, the 0/1 flag
# of every user in user order.
interestUserMatrix = []
for j in range(len(uniqueInterests)):
    interestColumn = []
    for userInterestVector in userInterestMatrix:
        interestColumn.append(userInterestVector[j])
    interestUserMatrix.append(interestColumn)

In [125]:
# Pairwise cosine similarity between interests (the rows of interestUserMatrix).
interestSimilarities = cosine(interestUserMatrix)
interestSimilarities

array([[ 1.        ,  0.        ,  0.40824829, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.40824829,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

In [126]:
# FIND THE MOST SIMILAR OTHER INTERESTS
def most_similar_interests_to(interestID):
    """Rank the other interests by their similarity to interestID.

    Reads row interestID of the global interestSimilarities matrix and maps
    column indices back to names through uniqueInterests.

    Returns:
        A list of (interestName, similarity) pairs sorted from most to least
        similar. The interest itself and every interest whose similarity is
        not positive are left out; there is no cap on the number of results.
    """
    pairs = [(uniqueInterests[otherInterestID], similarity)
             for otherInterestID, similarity
             in enumerate(interestSimilarities[interestID])
             if otherInterestID != interestID and similarity > 0]
    # itemgetter(1) selects the similarity; unlike a tuple-unpacking lambda it
    # is valid syntax on both Python 2 and Python 3.
    return sorted(pairs, key=itemgetter(1), reverse=True)

In [127]:
# Example: interests ranked by similarity to interest 2 (uniqueInterests[2]).
most_similar_interests_to(2)

[('HBase', 0.81649658092772603),
 ('NoSQL', 0.70710678118654746),
 ('Spark', 0.70710678118654746),
 ('Storm', 0.70710678118654746),
 ('Hadoop', 0.49999999999999989),
 ('MongoDB', 0.49999999999999989),
 ('Postgres', 0.49999999999999989),
 ('Big Data', 0.40824829046386302),
 ('Java', 0.40824829046386302)]

In [128]:
# MAKING RECOMMENDATION
# Scan through all interests of a user; for each interest i, find the
#  interests similar to it, then add each of those neighbors and its weight
#  (its similarity to i) to the suggestion dictionary.
def item_based_suggestions(userID, includeCurrentInterests=False):
    """Suggest interests to userID based on interests similar to their own.

    For every interest flagged in the user's row of userInterestMatrix, the
    similarities returned by most_similar_interests_to are accumulated per
    neighboring interest.

    Args:
        userID: index of the user in userInterestMatrix / usersInterests.
        includeCurrentInterests: when True, interests the user already has
            are kept in the result; by default they are filtered out.

    Returns:
        A list of (interest, weight) pairs sorted by descending weight.
    """
    suggestions = defaultdict(float)
    userInterestVector = userInterestMatrix[userID]
    for interestID, isInterested in enumerate(userInterestVector):
        if not isInterested:  # cell is 0: the user does not have this interest
            continue
        for interest, similarity in most_similar_interests_to(interestID):
            suggestions[interest] += similarity
    # itemgetter(1) sorts on the weight and, unlike a tuple-unpacking lambda,
    # is valid on both Python 2 and Python 3.
    ranked = sorted(suggestions.items(), key=itemgetter(1), reverse=True)
    if includeCurrentInterests:
        return ranked
    ownInterests = usersInterests[userID]
    return [(interest, weight) for interest, weight in ranked
            if interest not in ownInterests]

In [129]:
# Example: item-based suggestions for user 0, current interests excluded.
item_based_suggestions(0)

[('MapReduce', 1.8618073195657989),
 ('Postgres', 1.3164965809277258),
 ('MongoDB', 1.3164965809277258),
 ('NoSQL', 1.2844570503761732),
 ('programming languages', 0.57735026918962584),
 ('MySQL', 0.57735026918962584),
 ('Haskell', 0.57735026918962584),
 ('databases', 0.57735026918962584),
 ('neural networks', 0.40824829046386302),
 ('deep learning', 0.40824829046386302),
 ('C++', 0.40824829046386302),
 ('artificial intelligence', 0.40824829046386302),
 ('Python', 0.28867513459481292),
 ('R', 0.28867513459481292)]