<h1>CH22 推薦系統介紹</h1>

In [1]:
import math, random
from collections import defaultdict, Counter
from linear_algebra import dot

# Sample data set: one inner list of interest names per user.
# A user's position in this outer list is what the rest of the notebook uses
# as that user's id (e.g. users_interests[0] belongs to user 0).
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

#共 15 位使用者
#共 36 種不同的興趣

<h1>Recommending What’s Popular</h1>

In [2]:
# Count how many users list each interest, then rank the (interest, count)
# pairs from the most common to the least common.
interest_counts = Counter(
    interest
    for interests_of_user in users_interests
    for interest in interests_of_user
)
popular_interests = interest_counts.most_common()
popular_interests

#此步驟為計算所有興趣的出現次數，並做排序

[('Python', 4),
 ('R', 4),
 ('Java', 3),
 ('Big Data', 3),
 ('HBase', 3),
 ('regression', 3),
 ('statistics', 3),
 ('probability', 3),
 ('MongoDB', 2),
 ('artificial intelligence', 2),
 ('Hadoop', 2),
 ('statsmodels', 2),
 ('libsvm', 2),
 ('machine learning', 2),
 ('C++', 2),
 ('deep learning', 2),
 ('Cassandra', 2),
 ('Postgres', 2),
 ('scikit-learn', 2),
 ('neural networks', 2),
 ('pandas', 2),
 ('programming languages', 1),
 ('MySQL', 1),
 ('decision trees', 1),
 ('MapReduce', 1),
 ('Spark', 1),
 ('databases', 1),
 ('numpy', 1),
 ('mathematics', 1),
 ('Mahout', 1),
 ('Haskell', 1),
 ('theory', 1),
 ('support vector machines', 1),
 ('NoSQL', 1),
 ('scipy', 1),
 ('Storm', 1)]

In [4]:
def most_popular_new_interests(user_interests, max_results=5):
    """Suggest the most popular interests that the user does not have yet.

    user_interests -- collection of interest names the user already has
    max_results    -- upper bound on how many suggestions are returned

    Returns a list of (interest, frequency) tuples taken from
    popular_interests, in that list's order.
    """
    unseen = []
    for interest, frequency in popular_interests:
        if interest in user_interests:
            continue  # the user already has this one
        unseen.append((interest, frequency))
    # keep only the first max_results candidates
    return unseen[:max_results]




#此函數的輸入為  1.使用者的興趣清單   2.最多回傳的興趣個數
#回傳值為 (興趣, 該興趣出現的總次數) 的清單，只包含使用者尚未擁有的興趣

In [5]:
most_popular_new_interests(users_interests[1], 5)

[('Python', 4), ('R', 4), ('Big Data', 3), ('Java', 3), ('probability', 3)]

In [5]:
most_popular_new_interests(users_interests[1], 4)

[('Python', 4), ('R', 4), ('Java', 3), ('Big Data', 3)]

<h1>User-Based Collaborative Filtering (依使用者推薦)</h1>

In [6]:
#
# user-based filtering
#

def cosine_similarity(v, w):
    """Return the cosine of the angle between vectors v and w.

    Computed as dot(v, w) / sqrt(dot(v, v) * dot(w, w)).

    If either vector has zero length the angle is undefined; 0.0 is
    returned in that case (no similarity) instead of raising
    ZeroDivisionError, so an all-zero vector does not break the
    similarity-matrix cells.
    """
    norm_product = dot(v, v) * dot(w, w)
    if norm_product == 0:
        # at least one of the vectors is all zeros
        return 0.0
    return dot(v, w) / math.sqrt(norm_product)

#定義餘弦相似度 (cosine similarity) 函數：回傳兩向量夾角的 cos 值

In [9]:
# Every distinct interest across all users, in sorted order.
# An interest's position in this list serves as its interest id below.
unique_interests = sorted(set().union(*users_interests))

unique_interests


                           
#此步驟將所有興趣取出後 ，依照字母排列

['Big Data',
 'C++',
 'Cassandra',
 'HBase',
 'Hadoop',
 'Haskell',
 'Java',
 'Mahout',
 'MapReduce',
 'MongoDB',
 'MySQL',
 'NoSQL',
 'Postgres',
 'Python',
 'R',
 'Spark',
 'Storm',
 'artificial intelligence',
 'databases',
 'decision trees',
 'deep learning',
 'libsvm',
 'machine learning',
 'mathematics',
 'neural networks',
 'numpy',
 'pandas',
 'probability',
 'programming languages',
 'regression',
 'scikit-learn',
 'scipy',
 'statistics',
 'statsmodels',
 'support vector machines',
 'theory']

In [10]:
def make_user_interest_vector(user_interests):
    """Encode a list of interests as a 0/1 vector over unique_interests.

    Element i of the result is 1 when unique_interests[i] appears in
    user_interests and 0 otherwise.
    """
    vector = []
    for candidate in unique_interests:
        vector.append(1 if candidate in user_interests else 0)
    return vector

#此函數將使用者的興趣，依照上一步驟排序後的次序填入 1 和 0，以此作為使用者之興趣向量

In [11]:
# One 0/1 interest vector per user: row = user id, column = interest id.
user_interest_matrix = [make_user_interest_vector(interests_of_user)
                        for interests_of_user in users_interests]

user_interest_matrix

#將使用者用map函數依次帶入 ，求出使用者興趣向量，再用list組成 矩陣

[[1,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,


In [12]:
# Pairwise user similarity: user_similarities[i][j] is the cosine similarity
# between the interest vectors of user i and user j.
user_similarities = [
    [cosine_similarity(row_vector, column_vector)
     for column_vector in user_interest_matrix]
    for row_vector in user_interest_matrix
]

user_similarities

#將每個使用者分別對所有使用者取cos值

[[1.0,
  0.3380617018914066,
  0.0,
  0.0,
  0.0,
  0.1543033499620919,
  0.0,
  0.0,
  0.1889822365046136,
  0.5669467095138409,
  0.0,
  0.0,
  0.0,
  0.1690308509457033,
  0.0],
 [0.3380617018914066,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.6,
  0.0],
 [0.0,
  0.0,
  1.0,
  0.18257418583505536,
  0.0,
  0.16666666666666666,
  0.0,
  0.20412414523193154,
  0.0,
  0.0,
  0.23570226039551587,
  0.0,
  0.47140452079103173,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.18257418583505536,
  1.0,
  0.22360679774997896,
  0.3651483716701107,
  0.4472135954999579,
  0.0,
  0.0,
  0.0,
  0.5163977794943222,
  0.22360679774997896,
  0.5163977794943222,
  0.0,
  0.2581988897471611],
 [0.0,
  0.0,
  0.0,
  0.22360679774997896,
  1.0,
  0.0,
  0.0,
  0.25,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5773502691896258],
 [0.1543033499620919,
  0.0,
  0.16666666666666666,
  0.3651483716701107,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.20412414523193154,
  0.235702260

In [15]:
def most_similar_users_to(user_id):
    """Rank the other users by their similarity to user_id.

    user_id is a row index into user_similarities. Returns a list of
    (other_user_id, similarity) tuples covering every other user whose
    similarity is greater than zero, most similar first.
    """
    neighbours = []
    for other_user_id, similarity in enumerate(user_similarities[user_id]):
        # skip the user themself and anyone with no overlap at all
        if other_user_id != user_id and similarity > 0:
            neighbours.append((other_user_id, similarity))

    neighbours.sort(key=lambda pair: pair[1], reverse=True)
    return neighbours

#此函數之輸入值為 非負整數(此數字作為使用者位置代碼)
#回傳值為cos值大於0之其他使用者代號 和 權重(cos值) ，並依照權重由大到小排列


In [22]:
most_similar_users_to(0)

[(9, 0.5669467095138409),
 (1, 0.3380617018914066),
 (8, 0.1889822365046136),
 (13, 0.1690308509457033),
 (5, 0.1543033499620919)]

In [14]:
most_similar_users_to(5)

[(12, 0.47140452079103173),
 (3, 0.3651483716701107),
 (10, 0.23570226039551587),
 (9, 0.20412414523193154),
 (11, 0.20412414523193154),
 (2, 0.16666666666666666),
 (0, 0.1543033499620919)]

In [22]:
def user_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests to a user based on what similar users like.

    Each interest of every user returned by most_similar_users_to(user_id)
    is credited with that user's similarity; credits for the same interest
    add up.

    user_id                   -- index of the user in users_interests
    include_current_interests -- when truthy, interests the user already has
                                 are kept in the result

    Returns a list of (interest, weight) tuples, heaviest weight first.
    """
    weights = defaultdict(float)  # an unseen interest starts at 0.0
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            weights[interest] += similarity

    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)

    if not include_current_interests:
        # drop everything the user already lists
        ranked = [(interest, weight)
                  for interest, weight in ranked
                  if interest not in users_interests[user_id]]
    return ranked
    
#依照上述函數所列出的相似使用者及其權重，將權重累加到這些使用者的每個興趣上(若已有值則往上累加)，並回傳依權重排序後的興趣作為推薦興趣



In [19]:
user_based_suggestions(0)

[('MapReduce', 0.5669467095138409),
 ('Postgres', 0.50709255283711),
 ('MongoDB', 0.50709255283711),
 ('NoSQL', 0.3380617018914066),
 ('neural networks', 0.1889822365046136),
 ('deep learning', 0.1889822365046136),
 ('artificial intelligence', 0.1889822365046136),
 ('MySQL', 0.1690308509457033),
 ('databases', 0.1690308509457033),
 ('programming languages', 0.1543033499620919),
 ('Python', 0.1543033499620919),
 ('Haskell', 0.1543033499620919),
 ('C++', 0.1543033499620919),
 ('R', 0.1543033499620919)]

In [20]:
# Pass a real boolean: any non-empty string (even 'False') is truthy,
# so the string 'True' only worked by accident.
user_based_suggestions(0, include_current_interests=True)

[('Big Data', 0.7559289460184544),
 ('Java', 0.7212500594759328),
 ('Hadoop', 0.5669467095138409),
 ('MapReduce', 0.5669467095138409),
 ('Postgres', 0.50709255283711),
 ('HBase', 0.50709255283711),
 ('MongoDB', 0.50709255283711),
 ('Cassandra', 0.3380617018914066),
 ('NoSQL', 0.3380617018914066),
 ('neural networks', 0.1889822365046136),
 ('deep learning', 0.1889822365046136),
 ('artificial intelligence', 0.1889822365046136),
 ('MySQL', 0.1690308509457033),
 ('databases', 0.1690308509457033),
 ('programming languages', 0.1543033499620919),
 ('Python', 0.1543033499620919),
 ('Haskell', 0.1543033499620919),
 ('C++', 0.1543033499620919),
 ('R', 0.1543033499620919)]

<h1>Item-Based Collaborative Filtering (依興趣推薦)</h1>

In [23]:
# Transposed view of user_interest_matrix: row = interest id, column = user id.
# interest_user_matrix[j][i] is 1 when user i has interest j, else 0.
interest_user_matrix = [
    [user_row[interest_id] for user_row in user_interest_matrix]
    for interest_id in range(len(unique_interests))
]

interest_user_matrix


#和前面使用者矩陣手法相同，只是此矩陣改以興趣為列(每一列代表一種興趣，每一欄代表一位使用者)
#興趣和興趣間的關聯，取決於這兩種興趣是否常被同一批使用者選到

[[1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,

In [14]:
# Pairwise interest similarity: interest_similarities[a][b] is the cosine
# similarity between the user vectors of interest a and interest b.
interest_similarities = [
    [cosine_similarity(row_vector, column_vector)
     for column_vector in interest_user_matrix]
    for row_vector in interest_user_matrix
]

interest_similarities

[[1.0,
  0.0,
  0.4082482904638631,
  0.3333333333333333,
  0.8164965809277261,
  0.0,
  0.6666666666666666,
  0.0,
  0.5773502691896258,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5773502691896258,
  0.5773502691896258,
  0.4082482904638631,
  0.0,
  0.0,
  0.4082482904638631,
  0.0,
  0.0,
  0.0,
  0.4082482904638631,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.7071067811865475,
  0.4082482904638631,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.35355339059327373,
  0.35355339059327373,
  0.0,
  0.0,
  0.5,
  0.0,
  0.0,
  0.5,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.4082482904638631,
  0.7071067811865475,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.4082482904638631,
  0.0,
  1.0,
  0.8164965809277261,
  0.5,
  0.0,
  0.4082482904638631,
  0.0,
  0.0,
  0.5,
  0.0,
  0.7071067811865475,
  0.5,
  0.0,
  0.0,
  0.7071067811865475,
  0.7071067811865475,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
 

In [26]:
def most_similar_interests_to(interest_id):
    """Rank the other interests by their similarity to one interest.

    interest_id is a position in unique_interests (and a row index into
    interest_similarities). Returns a list of (interest_name, similarity)
    tuples covering every other interest whose similarity is greater than
    zero, most similar first.
    """
    ranked = []
    for other_interest_id, similarity in enumerate(interest_similarities[interest_id]):
        # skip the interest itself and anything with no shared users
        if other_interest_id != interest_id and similarity > 0:
            ranked.append((unique_interests[other_interest_id], similarity))

    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

#輸入興趣代號，只要其他興趣與它的 cos 值大於 0 都會被取入，並依 cos 值由大到小排列

In [24]:
most_similar_interests_to(0)

[('Hadoop', 0.8164965809277261),
 ('Java', 0.6666666666666666),
 ('MapReduce', 0.5773502691896258),
 ('Spark', 0.5773502691896258),
 ('Storm', 0.5773502691896258),
 ('Cassandra', 0.4082482904638631),
 ('artificial intelligence', 0.4082482904638631),
 ('deep learning', 0.4082482904638631),
 ('neural networks', 0.4082482904638631),
 ('HBase', 0.3333333333333333)]

In [26]:
most_similar_interests_to(1)

[('Haskell', 0.7071067811865475),
 ('programming languages', 0.7071067811865475),
 ('artificial intelligence', 0.5),
 ('deep learning', 0.5),
 ('Java', 0.4082482904638631),
 ('probability', 0.4082482904638631),
 ('Python', 0.35355339059327373),
 ('R', 0.35355339059327373)]

In [29]:
most_similar_interests_to(12)

[('MongoDB', 1.0),
 ('HBase', 0.8164965809277261),
 ('MySQL', 0.7071067811865475),
 ('NoSQL', 0.7071067811865475),
 ('databases', 0.7071067811865475),
 ('Cassandra', 0.5)]

In [27]:
def item_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests to a user based on interests similar to their own.

    For every interest flagged 1 in the user's row of user_interest_matrix,
    each entry returned by most_similar_interests_to is credited with its
    similarity; credits for the same interest add up.

    user_id                   -- index of the user in user_interest_matrix
                                 and users_interests
    include_current_interests -- when truthy, interests the user already has
                                 are kept in the result

    Returns a list of (interest, weight) tuples, heaviest weight first.
    """
    weights = defaultdict(float)
    for interest_id, is_interested in enumerate(user_interest_matrix[user_id]):
        if is_interested == 1:
            for interest, similarity in most_similar_interests_to(interest_id):
                weights[interest] += similarity

    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)

    if include_current_interests:
        return ranked
    # otherwise drop everything the user already lists
    return [(interest, weight)
            for interest, weight in ranked
            if interest not in users_interests[user_id]]

    
#輸入使用者代號   將其興趣相關的所有興趣列出   若有重複  權重累加計算

In [25]:
item_based_suggestions(0)

[('MapReduce', 1.861807319565799),
 ('MongoDB', 1.3164965809277263),
 ('Postgres', 1.3164965809277263),
 ('NoSQL', 1.2844570503761732),
 ('databases', 0.5773502691896258),
 ('MySQL', 0.5773502691896258),
 ('programming languages', 0.5773502691896258),
 ('Haskell', 0.5773502691896258),
 ('neural networks', 0.4082482904638631),
 ('deep learning', 0.4082482904638631),
 ('C++', 0.4082482904638631),
 ('artificial intelligence', 0.4082482904638631),
 ('Python', 0.2886751345948129),
 ('R', 0.2886751345948129)]

item_based_suggestions 是將使用者每個現有興趣的相似興趣權重累加起來，作為各推薦興趣的分數

In [30]:
item_based_suggestions(10)

[('Python', 1.3922285251880866),
 ('probability', 0.9553418012614796),
 ('pandas', 0.8535533905932737),
 ('scipy', 0.7071067811865475),
 ('numpy', 0.7071067811865475),
 ('regression', 0.6220084679281462),
 ('theory', 0.5773502691896258),
 ('mathematics', 0.5773502691896258),
 ('scikit-learn', 0.5),
 ('programming languages', 0.5),
 ('Haskell', 0.5),
 ('C++', 0.35355339059327373),
 ('Java', 0.2886751345948129)]

In [31]:
most_similar_interests_to(14)

[('Python', 0.75),
 ('statistics', 0.5773502691896258),
 ('Haskell', 0.5),
 ('programming languages', 0.5),
 ('C++', 0.35355339059327373),
 ('pandas', 0.35355339059327373),
 ('statsmodels', 0.35355339059327373),
 ('Java', 0.2886751345948129),
 ('probability', 0.2886751345948129),
 ('regression', 0.2886751345948129)]

In [32]:
most_similar_interests_to(32)

[('probability', 0.6666666666666666),
 ('R', 0.5773502691896258),
 ('mathematics', 0.5773502691896258),
 ('theory', 0.5773502691896258),
 ('statsmodels', 0.4082482904638631),
 ('regression', 0.3333333333333333),
 ('Python', 0.2886751345948129)]

In [33]:
most_similar_interests_to(33)

[('numpy', 0.7071067811865475),
 ('scipy', 0.7071067811865475),
 ('pandas', 0.5),
 ('scikit-learn', 0.5),
 ('statistics', 0.4082482904638631),
 ('Python', 0.35355339059327373),
 ('R', 0.35355339059327373)]

<h1>將以上函數整合</h1>

In [17]:
if __name__ == "__main__":
    # End-to-end demo: prints the result of each recommender defined above.

    print("Popular Interests")
    print(popular_interests)
    print()

    # popularity-based suggestions for two sample interest lists
    print("Most Popular New Interests")
    for already_liked in (
        ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
        ["R", "Python", "statistics", "regression", "probability"],
    ):
        print("already like:", already_liked)
        print(most_popular_new_interests(already_liked))
        print()

    # user-based collaborative filtering for user 0
    print("User based similarity")
    print("most similar to 0")
    print(most_similar_users_to(0))

    print("Suggestions for 0")
    print(user_based_suggestions(0))
    print()

    # item-based collaborative filtering (interest id 0 is 'Big Data')
    print("Item based similarity")
    print("most similar to 'Big Data'")
    print(most_similar_interests_to(0))
    print()

    print("suggestions for user 0")
    print(item_based_suggestions(0))

Popular Interests
[('R', 4), ('Python', 4), ('probability', 3), ('Java', 3), ('statistics', 3), ('HBase', 3), ('regression', 3), ('Big Data', 3), ('Cassandra', 2), ('neural networks', 2), ('Hadoop', 2), ('MongoDB', 2), ('deep learning', 2), ('pandas', 2), ('C++', 2), ('libsvm', 2), ('Postgres', 2), ('statsmodels', 2), ('scikit-learn', 2), ('machine learning', 2), ('artificial intelligence', 2), ('Mahout', 1), ('Storm', 1), ('scipy', 1), ('numpy', 1), ('Spark', 1), ('support vector machines', 1), ('databases', 1), ('theory', 1), ('MapReduce', 1), ('mathematics', 1), ('NoSQL', 1), ('MySQL', 1), ('decision trees', 1), ('programming languages', 1), ('Haskell', 1)]

Most Popular New Interests
already like: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
[('R', 4), ('Python', 4), ('probability', 3), ('Java', 3), ('statistics', 3)]

already like: ['R', 'Python', 'statistics', 'regression', 'probability']
[('Java', 3), ('HBase', 3), ('Big Data', 3), ('Cassandra', 2), ('neural networks',