In [5]:
%load_ext autoreload
%autoreload 2
%pylab inline

from collections import defaultdict, OrderedDict, Counter
from copy import deepcopy
import time
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from scipy.stats import poisson
from datetime import date, datetime
import uuid
import pandas as pd
from tputils import csv_load

pd.set_option('display.max_columns', None)
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (12, 5)
font = {'size'   : 12}
matplotlib.rc('font', **font)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
mpl_style had been deprecated and will be removed in a future version.
Use `matplotlib.pyplot.style.use` instead.

  exec(code_obj, self.user_global_ns, self.user_ns)


# Load data

In [6]:
doc_tags = csv_load("doc_tags.csv", delimiter=",")
tag_names = doc_tags[0][1:]
doc_names = [row[0] for row in doc_tags[1:]]
doc_tags = [row[1:] for row in doc_tags[1:]]
doc_tags = np.array(doc_tags, dtype=int)

nr_docs, nr_tags = doc_tags.shape

print(doc_names)
print(tag_names)
print(doc_tags)
print("nr_docs", nr_docs)
print("nr_tags", nr_tags)

assert len(tag_names) == nr_tags
assert len(doc_names) == nr_docs

['doc1', 'doc2', 'doc3', 'doc4', 'doc5', 'doc6', 'doc7', 'doc8', 'doc9', 'doc10', 'doc11', 'doc12', 'doc13', 'doc14', 'doc15', 'doc16', 'doc17', 'doc18', 'doc19', 'doc20']
['baseball', 'economics', 'politics', 'Europe', 'Asia', 'soccer', 'war', 'security', 'shopping', 'family']
[[1 0 1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 0 1 0 0]
 [0 0 0 1 1 1 0 0 0 0]
 [0 0 1 1 0 0 1 1 0 0]
 [0 1 0 0 0 0 0 0 1 1]
 [1 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 1]
 [0 0 1 1 0 0 1 0 0 1]
 [0 0 0 0 0 1 0 0 1 0]
 [0 1 0 0 1 0 1 0 0 0]
 [0 0 1 0 1 0 0 0 1 0]
 [1 0 0 0 0 1 1 0 0 0]
 [0 0 1 1 1 0 0 1 0 0]
 [0 1 1 1 0 0 0 0 1 0]
 [0 0 0 1 0 1 1 1 0 0]
 [1 0 0 0 0 1 0 0 1 0]
 [0 1 1 1 0 0 0 1 0 0]
 [0 0 0 1 0 0 0 0 1 0]
 [0 1 1 0 1 0 1 0 0 1]
 [0 0 1 1 0 0 1 0 1 0]]
nr_docs 20
nr_tags 10


In [7]:
user_ratings = csv_load("user_ratings.csv", delimiter=",")
user_names = user_ratings[0]
user_ratings = user_ratings[1:]
user_ratings = np.array(user_ratings, dtype=int).T

nr_users = user_ratings.shape[0]

print(user_names)
print(user_ratings)
print("nr_users", nr_users)

assert user_ratings.shape[1] == nr_docs

['User 1', 'User 2']
[[ 1 -1  0  0  0  1  0  0  0  0  0  0  0  0  0  1  0  0 -1  0]
 [-1  1  0  1  0  0  0  0  0  0  0 -1  0  0  0  0  1  0  0  0]]
nr_users 2


# Part 1. Build and use a very basic profile

First, you will build a very simple profile of user preferences for attributes. In this profile, you’ll count the total the number of positive and negative evaluations associated with each attribute, and create a profile with the total score for each attribute for each user. For example, user 1’s score for “Family” will get a +1 from doc1 (positive evaluation) and a -1 from doc 19 (negative evaluation) for a total profile value of 0 (neutral). In contrast, user 2’s score for Europe will be +3 (+1 each for doc2, doc4, and doc17).

You can compute the profiles and place them in the “User Profiles” section of the spreadsheet.

Now compute the predicted score for each user for each document (a simple dot-product). Type in the answers to the following questions (answers may include already-rated articles) as part of the Quiz:

 - Which document does the simple profile predict user 1 will like best?
 - What score does that prediction get?
 - How many documents does the model predict user 2 will dislike (prediction score that is negative)?

Notice that this model is consistent with the users’ ratings -- it predicts liking for all the positive documents and disliking for all the negative ones.


In [12]:
def get_user_profiles(user_ratings, doc_tags):
    user_profiles = []
    for user_rating in user_ratings:
        user_profiles.append(sum(user_rating.reshape(-1, 1) * doc_tags, axis=0))

    return np.stack(user_profiles)

user_profiles = get_user_profiles(user_ratings, doc_tags)
print(tag_names)
print(user_profiles)

['baseball', 'economics', 'politics', 'Europe', 'Asia', 'soccer', 'war', 'security', 'shopping', 'family']
[[ 3 -2 -1  0  0  2 -1 -1  1  0]
 [-2  2  2  3 -1 -2  0  3  0 -1]]


In [19]:
def get_user_doc_predictions(user_profiles, doc_tags):
    user_predictions_list = []
    user_doc_predictions= np.matmul(user_profiles, doc_tags.T)
    for user_doc_prediction in user_doc_predictions:
        user_predictions = sorted(zip(doc_names, user_doc_prediction), reverse=True, key=lambda x: x[1])
        user_predictions_list.append(user_predictions)
    return user_predictions_list
        
predictions = get_user_doc_predictions(user_profiles, doc_tags)
predictions

[[('doc16', 3.3335847797553408),
  ('doc12', 2.3090209150657572),
  ('doc1', 2.2562345105657151),
  ('doc6', 1.9387774270626785),
  ('doc9', 1.6019141338792098),
  ('doc3', 1.2316706458761313),
  ('doc18', 0.7844570503761733),
  ('doc15', 0.28445705037617341),
  ('doc11', 0.077350269189625842),
  ('doc20', -0.16275654512378457),
  ('doc5', -0.36986332631033203),
  ('doc7', -0.5),
  ('doc14', -0.66275654512378457),
  ('doc8', -0.74010681431341041),
  ('doc13', -0.79289321881345254),
  ('doc4', -1.2401068143134104),
  ('doc10', -1.3944271909999157),
  ('doc2', -1.7401068143134104),
  ('doc17', -1.7401068143134104),
  ('doc19', -1.8944271909999157)],
 [('doc2', 5.0527864045000417),
  ('doc17', 5.0527864045000417),
  ('doc4', 3.9754361353104164),
  ('doc13', 3.6055728090000843),
  ('doc14', 3.5527864045000421),
  ('doc20', 2.4754361353104164),
  ('doc8', 2.0282225398104585),
  ('doc15', 1.8980858661207902),
  ('doc18', 1.5),
  ('doc19', 1.0810089443105007),
  ('doc7', 1.0527864045000421),


## Part 2. Next, let’s treat all articles as having unit weight ...

You may have noticed that in our computation an article that had many attributes checked could have more influence on the overall profile than one that had only a few. doc 1 and doc 19 each have five attributes, while doc6, doc7, and doc18 only have 2 attributes each.

We want to explore whether our simple model may be counting these attribute-heavy documents too much. For example, we might conclude that liking doc6 says more about liking baseball (since it is one of only two attributes for the article along with Europe) than liking doc1 says (since doc1 is also about politics, Asia, soccer, and family).

To try this out, make a copy of the attributes matrix on another sheet. Then we’re going to have you normalize each row to be a unit length vector. We can do this in two steps:

1. Count the total number of items in the row (you can do this via SUM or COUNT function).
1. Divide each value by the square root of that number of items. If you do this right, doc1’s values will all change from 1 to 0.447214 (approx). Documents with 4 attributes will change to 0.5 (since 4 * .5^2 = 1), and so forth. Remember, don’t have the SUM or COUNT depend on the copy of the cells you’re changing or you’ll get a circular dependency. Have your new sheet depend on values on your old sheet.

Once you have the new values, compute your second set of user profiles and new predictions. If you did this right, you’ll see a prediction of 1.0090 (approx) for user1/doc1. Don’t worry about the scale of the numbers (they’ll all be smaller, in absolute value terms), but look at the order of them.

This time we’ll start with user2. With our simple profile, doc7 and doc19 both had similar “like” predictions (+2 each). Now they don’t. Let's see what values are predicted for the second model:

doc7: 0.7444 (plus or minus 0.01)

doc 19: 0.4834 (plus or minus 0.01)

The difference here can be seen by looking at the profile attribute values. Doc7 is 50% about one of user2’s favorite topics (security) which is now more heavily emphasized).

Now let’s look at user 1. While user 1's first-place document is the same in both models, that isn't true for other places. In our simple model, the second/third place recommendation was a tie between doc1 and doc 12. Neither of those is in second place with this new model.

Type in the answers to the following questions as part of the Quiz:
 
 - Which document is now in second with this new model?
 - What prediction score does it have?

In [28]:
doc_tags_normed = doc_tags / np.linalg.norm(doc_tags, ord=2, axis=1, keepdims=True)
user_profiles = user_profiles = get_user_profiles(user_ratings, doc_tags_normed)
predictions_normed = get_user_doc_predictions(user_profiles, doc_tags_normed)
predictions_normed

[[('doc16', 1.9246460699581855),
  ('doc6', 1.3709226658874269),
  ('doc12', 1.3331138468776909),
  ('doc9', 1.132724346944564),
  ('doc1', 1.0090187477611812),
  ('doc3', 0.71110537894954473),
  ('doc18', 0.55469489987058918),
  ('doc15', 0.14222852518808671),
  ('doc11', 0.044658198738520505),
  ('doc20', -0.081378272561892284),
  ('doc5', -0.21354069100864065),
  ('doc14', -0.33137827256189228),
  ('doc7', -0.35355339059327373),
  ('doc8', -0.37005340715670521),
  ('doc13', -0.39644660940672627),
  ('doc4', -0.62005340715670521),
  ('doc10', -0.80507291408913528),
  ('doc19', -0.84721359549995789),
  ('doc2', -0.87005340715670521),
  ('doc17', -0.87005340715670521)],
 [('doc2', 2.5263932022500208),
  ('doc17', 2.5263932022500208),
  ('doc4', 1.9877180676552082),
  ('doc13', 1.8027864045000421),
  ('doc14', 1.7763932022500211),
  ('doc20', 1.2377180676552082),
  ('doc18', 1.0606601717798212),
  ('doc8', 1.0141112699052293),
  ('doc15', 0.94904293306039511),
  ('doc7', 0.7444324057629

In [29]:
list(zip(predictions[0], predictions_normed[0]))

[(('doc16', 3.3335847797553408), ('doc16', 1.9246460699581855)),
 (('doc12', 2.3090209150657572), ('doc6', 1.3709226658874269)),
 (('doc1', 2.2562345105657151), ('doc12', 1.3331138468776909)),
 (('doc6', 1.9387774270626785), ('doc9', 1.132724346944564)),
 (('doc9', 1.6019141338792098), ('doc1', 1.0090187477611812)),
 (('doc3', 1.2316706458761313), ('doc3', 0.71110537894954473)),
 (('doc18', 0.7844570503761733), ('doc18', 0.55469489987058918)),
 (('doc15', 0.28445705037617341), ('doc15', 0.14222852518808671)),
 (('doc11', 0.077350269189625842), ('doc11', 0.044658198738520505)),
 (('doc20', -0.16275654512378457), ('doc20', -0.081378272561892284)),
 (('doc5', -0.36986332631033203), ('doc5', -0.21354069100864065)),
 (('doc7', -0.5), ('doc14', -0.33137827256189228)),
 (('doc14', -0.66275654512378457), ('doc7', -0.35355339059327373)),
 (('doc8', -0.74010681431341041), ('doc8', -0.37005340715670521)),
 (('doc13', -0.79289321881345254), ('doc13', -0.39644660940672627)),
 (('doc4', -1.240106814

In [30]:
list(zip(predictions[1], predictions_normed[1]))

[(('doc2', 5.0527864045000417), ('doc2', 2.5263932022500208)),
 (('doc17', 5.0527864045000417), ('doc17', 2.5263932022500208)),
 (('doc4', 3.9754361353104164), ('doc4', 1.9877180676552082)),
 (('doc13', 3.6055728090000843), ('doc13', 1.8027864045000421)),
 (('doc14', 3.5527864045000421), ('doc14', 1.7763932022500211)),
 (('doc20', 2.4754361353104164), ('doc20', 1.2377180676552082)),
 (('doc8', 2.0282225398104585), ('doc18', 1.0606601717798212)),
 (('doc15', 1.8980858661207902), ('doc8', 1.0141112699052293)),
 (('doc18', 1.5), ('doc15', 0.94904293306039511)),
 (('doc19', 1.0810089443105007), ('doc7', 0.74443240576298331)),
 (('doc7', 1.0527864045000421), ('doc19', 0.48344189675271271)),
 (('doc11', 0.60557280900008426), ('doc11', 0.34962762429011646)),
 (('doc5', 0.55278640450004213), ('doc6', 0.33618411529912018)),
 (('doc10', 0.47543613531041629), ('doc5', 0.31915137944246469)),
 (('doc6', 0.47543613531041617), ('doc10', 0.27449318070394418)),
 (('doc3', 0.028222539810458303), ('doc3'

## Part 3. Finally, let’s consider how common different terms are among our documents …

We’re going to do one more model -- one that accounts for the fact the the content attributes have vastly different frequencies.

We’re going to include an IDF (inverse document frequency) term into our equation. Start with your spreadsheet from part 2. Add a row that shows 1/DF where DF is the number of documents in which each content attribute occurs. For example, baseball occurs in 4 documents, so baseball’s entry will be 0.25. Politics occurs in 10 documents, so it will get an IDF score of 0.1 (1 / 10).

Note that this is far more dramatic a computation than is usually used with large datasets (more common is 1 / log(DF)), but we need a dramatic value to see differences with a small dataset.

Next, update your prediction formula to do a three-way dot product: document vector * profile * IDF (fortunately, SUMPRODUCT can handle a third array). If you did this right, you’ll see a prediction of about 0.2476 for user1/doc1.

Ok, now let’s look at the results.

Type in the answers to the following questions into the Quiz:

 - Compare doc1 and doc9 for user1. What’s user1’s prediction for doc9 in the new IDF weighted model? See how there’s a dramatic difference from the prior model?
 - Now let’s look at user 2. Look at doc6. It was moderately positive before and now is slightly negative. Why did that change?

In [31]:
idf = 1 / np.sum(doc_tags, axis=0, keepdims=True)
doc_tags_normed_idf = doc_tags_normed * idf
predictions_normed_idf = get_user_doc_predictions(user_profiles, doc_tags_normed_idf)
predictions_normed_idf

[[('doc16', 0.39615287985188608),
  ('doc6', 0.31943234224805939),
  ('doc12', 0.31164827655467259),
  ('doc1', 0.2476124657905287),
  ('doc9', 0.17906719376543057),
  ('doc3', 0.10945899074393546),
  ('doc18', 0.071634512479864618),
  ('doc15', 0.021183771740190184),
  ('doc11', 0.01875153415956634),
  ('doc20', -0.0062905787084923427),
  ('doc5', -0.043526623104614706),
  ('doc8', -0.047529883650608472),
  ('doc14', -0.05328121675015849),
  ('doc13', -0.057252722067278147),
  ('doc7', -0.058925565098878953),
  ('doc4', -0.089196550317275136),
  ('doc19', -0.12153324130475628),
  ('doc10', -0.12803122640182821),
  ('doc2', -0.13618718835894128),
  ('doc17', -0.13618718835894128)],
 [('doc2', 0.32915447174015361),
  ('doc17', 0.32915447174015361),
  ('doc4', 0.24029611917898988),
  ('doc13', 0.2085533387818238),
  ('doc14', 0.20415447174015361),
  ('doc20', 0.11529611917898988),
  ('doc7', 0.11353114209326928),
  ('doc15', 0.10227647689652244),
  ('doc18', 0.096423651979983732),
  ('do

In [32]:
list(zip(predictions_normed[0], predictions_normed_idf[0]))

[(('doc16', 1.9246460699581855), ('doc16', 0.39615287985188608)),
 (('doc6', 1.3709226658874269), ('doc6', 0.31943234224805939)),
 (('doc12', 1.3331138468776909), ('doc12', 0.31164827655467259)),
 (('doc9', 1.132724346944564), ('doc1', 0.2476124657905287)),
 (('doc1', 1.0090187477611812), ('doc9', 0.17906719376543057)),
 (('doc3', 0.71110537894954473), ('doc3', 0.10945899074393546)),
 (('doc18', 0.55469489987058918), ('doc18', 0.071634512479864618)),
 (('doc15', 0.14222852518808671), ('doc15', 0.021183771740190184)),
 (('doc11', 0.044658198738520505), ('doc11', 0.01875153415956634)),
 (('doc20', -0.081378272561892284), ('doc20', -0.0062905787084923427)),
 (('doc5', -0.21354069100864065), ('doc5', -0.043526623104614706)),
 (('doc14', -0.33137827256189228), ('doc8', -0.047529883650608472)),
 (('doc7', -0.35355339059327373), ('doc14', -0.05328121675015849)),
 (('doc8', -0.37005340715670521), ('doc13', -0.057252722067278147)),
 (('doc13', -0.39644660940672627), ('doc7', -0.0589255650988789

In [33]:
list(zip(predictions_normed[1], predictions_normed_idf[1]))

[(('doc2', 2.5263932022500208), ('doc2', 0.32915447174015361)),
 (('doc17', 2.5263932022500208), ('doc17', 0.32915447174015361)),
 (('doc4', 1.9877180676552082), ('doc4', 0.24029611917898988)),
 (('doc13', 1.8027864045000421), ('doc13', 0.2085533387818238)),
 (('doc14', 1.7763932022500211), ('doc14', 0.20415447174015361)),
 (('doc20', 1.2377180676552082), ('doc20', 0.11529611917898988)),
 (('doc18', 1.0606601717798212), ('doc7', 0.11353114209326928)),
 (('doc8', 1.0141112699052293), ('doc15', 0.10227647689652244)),
 (('doc15', 0.94904293306039511), ('doc18', 0.096423651979983732)),
 (('doc7', 0.74443240576298331), ('doc8', 0.07057475962899408)),
 (('doc19', 0.48344189675271271), ('doc10', 0.046812153896812612)),
 (('doc11', 0.34962762429011646), ('doc5', 0.044585266915505399)),
 (('doc6', 0.33618411529912018), ('doc19', 0.043342577813484461)),
 (('doc5', 0.31915137944246469), ('doc11', 0.017749503112534246)),
 (('doc10', 0.27449318070394418), ('doc3', -0.062892269975720907)),
 (('doc3'