In [1]:
# One dict per data scientist, with a numeric "id" and a "name".
# Ids are handed out in list order, so users[k]["id"] == k.
users = list(
    {"id": user_id, "name": name}
    for user_id, name in enumerate(
        ["Hero", "Dunn", "Sue", "Chi", "Thor",
         "Clive", "Hicks", "Devin", "Kate", "Klein"]
    )
)

# Friendship edges as (id, id) pairs, each pair listed once.
# For example (0, 1) says that Hero (id 0) and Dunn (id 1) are friends.
friendships = [
    (0, 1), (0, 2),
    (1, 2), (1, 3),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6), (5, 7),
    (6, 8),
    (7, 8),
    (8, 9),
]

# 1) Add a list of friends to each user

In [2]:
# Give every user a fresh, empty "friends" list. Running this cell again
# discards any friends that were appended to those lists earlier.
for user in users:
    user["friends"] = []

# Single-argument print(...) produces the same output on Python 2 and 3.
print(users)

[{'friends': [], 'id': 0, 'name': 'Hero'}, {'friends': [], 'id': 1, 'name': 'Dunn'}, {'friends': [], 'id': 2, 'name': 'Sue'}, {'friends': [], 'id': 3, 'name': 'Chi'}, {'friends': [], 'id': 4, 'name': 'Thor'}, {'friends': [], 'id': 5, 'name': 'Clive'}, {'friends': [], 'id': 6, 'name': 'Hicks'}, {'friends': [], 'id': 7, 'name': 'Devin'}, {'friends': [], 'id': 8, 'name': 'Kate'}, {'friends': [], 'id': 9, 'name': 'Klein'}]


In [3]:
# Hero's friend list is still empty at this point.
print(users[0]["friends"])

[]


In [4]:
# Populate the friend lists from the friendships edge list. Every edge is
# recorded on both of its endpoints, because friendship is mutual.
# NOTE: this appends on every run, so re-running the cell without first
# resetting the "friends" lists adds every friend a second time.

for i, j in friendships:
    # this works because users[i] is the user whose id is i
    users[i]["friends"].append(users[j])  # add j as a friend of i
    users[j]["friends"].append(users[i])  # add i as a friend of j

# The friend lists hold the user dicts themselves, so the structure is
# cyclic: the repr is deeply nested and uses {...} for repeated dicts.
print(users[0])

{'friends': [{'friends': [{...}, {'friends': [{...}, {...}, {'friends': [{...}, {...}, {'friends': [{...}, {'friends': [{...}, {'friends': [{...}, {'friends': [{...}, {'friends': [{...}, {...}], 'id': 7, 'name': 'Devin'}, {'friends': [{...}], 'id': 9, 'name': 'Klein'}], 'id': 8, 'name': 'Kate'}], 'id': 6, 'name': 'Hicks'}, {'friends': [{...}, {'friends': [{'friends': [{...}, {...}], 'id': 6, 'name': 'Hicks'}, {...}, {'friends': [{...}], 'id': 9, 'name': 'Klein'}], 'id': 8, 'name': 'Kate'}], 'id': 7, 'name': 'Devin'}], 'id': 5, 'name': 'Clive'}], 'id': 4, 'name': 'Thor'}], 'id': 3, 'name': 'Chi'}], 'id': 2, 'name': 'Sue'}, {'friends': [{...}, {'friends': [{...}, {...}, {...}], 'id': 2, 'name': 'Sue'}, {'friends': [{...}, {'friends': [{...}, {'friends': [{...}, {'friends': [{...}, {'friends': [{...}, {...}], 'id': 7, 'name': 'Devin'}, {'friends': [{...}], 'id': 9, 'name': 'Klein'}], 'id': 8, 'name': 'Kate'}], 'id': 6, 'name': 'Hicks'}, {'friends': [{...}, {'friends': [{'friends': [{...},

# 2) What’s the average number of connections?

Once each user dict contains a list of friends, we can easily ask questions of our
graph, like “what’s the average number of connections?”
First we find the total number of connections, by summing up the lengths of all the
friends lists:

In [5]:
def number_of_friends(user):
    """Return how many entries are in ``user["friends"]``."""
    friends = user["friends"]
    return len(friends)

# Sum of every user's friend count. Each friendship pair is stored on both
# of its endpoints, so the 12 pairs contribute 24 connections in total.
total_connections = sum(number_of_friends(user) for user in users)  # 24
print(total_connections)

24


In [6]:
# And then we just divide by the number of users:
from __future__ import division # integer division is lame

num_users = len(users)  # number of entries in the users list
print(num_users)
# "/" has to be true division here (24 / 10 -> 2.4); on Python 2 that
# relies on the `from __future__ import division` at the top of this cell.
avg_connections = total_connections / num_users  # 2.4
print(avg_connections)

10
2.4


It’s also easy to find the most connected people—they’re the people who have the largest
number of friends.

Since there aren’t very many users, we can sort them from “most friends” to “least
friends”:

In [7]:
# Build one (user_id, number_of_friends) pair per user, in users order.
num_friends_by_id = [(user["id"], number_of_friends(user))
                     for user in users]
print(num_friends_by_id)

# Sort the pairs from most friends to fewest. The key indexes the pair
# instead of unpacking it in the lambda signature, because tuple
# parameters are Python-2-only syntax (removed by PEP 3113).
# sorted() is stable, so users with equal counts stay in id order.
sorted(num_friends_by_id,
       key=lambda pair: pair[1],  # pair is (user_id, num_friends)
       reverse=True)              # largest to smallest


# each pair is (user_id, num_friends)
# [(1, 3), (2, 3), (3, 3), (5, 3), (8, 3),
# (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]

[(0, 2), (1, 3), (2, 3), (3, 3), (4, 2), (5, 3), (6, 2), (7, 2), (8, 3), (9, 1)]


[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

# 3) Data Scientists You May Know

Your first instinct is to suggest that a user might know the friends of friends. These
are easy to compute: for each of a user’s friends, iterate over that person’s friends, and
collect all the results:

In [8]:
def friends_of_friend_ids_bad(user):
    """Return the ids of every friend of each of ``user``'s friends.

    Deliberately naive: nothing is filtered or de-duplicated, so the
    result can contain the user's own id, ids of people who are already
    direct friends, and the same id more than once.
    """
    # "foaf" is short for "friend of a friend"
    foaf_ids = []
    for friend in user["friends"]:
        for foaf in friend["friends"]:
            foaf_ids.append(foaf["id"])
    return foaf_ids

In [9]:
# Friend-of-friend ids for Hero (id 0); the returned list is the cell output.
friends_of_friend_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

It includes user 0 (twice), since Hero is indeed friends with both of his friends. It
includes users 1 and 2, although they are both friends with Hero already. And it
includes user 3 twice, as Chi is reachable through two different friends:

In [10]:
# Friend ids of Hero (0) and of his two friends (1 and 2).
# Single-argument print(...) produces the same output on Python 2 and 3.
print([friend["id"] for friend in users[0]["friends"]])  # [1, 2]
print([friend["id"] for friend in users[1]["friends"]])  # [0, 2, 3]
print([friend["id"] for friend in users[2]["friends"]])  # [0, 1, 3]

[1, 2]
[0, 2, 3]
[0, 1, 3]


Knowing that people are friends-of-friends in multiple ways seems like interesting
information, so maybe instead we should produce a count of mutual friends. And we
definitely should use a helper function to exclude people already known to the user:

In [11]:
from collections import Counter # not loaded by default

def not_the_same(user, other_user):
    """Return True when the two user dicts carry different "id" values."""
    first_id, second_id = user["id"], other_user["id"]
    return first_id != second_id

def not_friends(user, other_user):
    """Return True when other_user is absent from user["friends"].

    other_user counts as absent when not_the_same() holds between it and
    every entry of the friend list; an empty friend list yields True.
    """
    for friend in user["friends"]:
        if not not_the_same(friend, other_user):
            # found a friend with the same id as other_user
            return False
    return True

def friends_of_friend_ids(user):
    """Count, per friend-of-a-friend id, how many of user's friends lead to it.

    Returns a Counter keyed by user id. The user's own id and the ids of
    people who are already in user["friends"] are left out.
    """
    mutual_counts = Counter()
    for friend in user["friends"]:          # for each of my friends
        for foaf in friend["friends"]:      # look at *their* friends
            # skip myself and anyone who is already my friend
            if not_the_same(user, foaf) and not_friends(user, foaf):
                mutual_counts[foaf["id"]] += 1
    return mutual_counts

# Mutual-friend counts for Chi (id 3).
print(friends_of_friend_ids(users[3]))  # Counter({0: 2, 5: 1})

Counter({0: 2, 5: 1})


This correctly tells Chi (id 3) that she has two mutual friends with Hero (id 0) but
only one mutual friend with Clive (id 5).

As a data scientist, you know that you also might enjoy meeting users with similar
interests. (This is a good example of the “substantive expertise” aspect of data science.)
After asking around, you manage to get your hands on this data, as a list of
pairs (user_id, interest):

In [12]:
# (user_id, interest) pairs: one tuple per interest that a user has.
# A user with several interests appears in several tuples.
interests = [
(0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
(0, "Spark"), (0, "Storm"), (0, "Cassandra"),
(1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
(1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
(2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
(3, "statistics"), (3, "regression"), (3, "probability"),
(4, "machine learning"), (4, "regression"), (4, "decision trees"),
(4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
(5, "Haskell"), (5, "programming languages"), (6, "statistics"),
(6, "probability"), (6, "mathematics"), (6, "theory"),
(7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
(7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
(8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
(9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

For example, Thor (id 4) and Devin (id 7) are not friends (their only mutual friend
is Clive, id 5), but they share an interest in machine learning.

It’s easy to build a function that finds users with a certain interest:

In [13]:
def data_scientists_who_like(target_interest):
    """Return the user_ids paired with ``target_interest`` in ``interests``.

    Scans the whole module-level ``interests`` list on every call and
    compares interest strings exactly (case-sensitive). Ids come back in
    the order their pairs appear in the list.
    """
    matching_ids = []
    for user_id, user_interest in interests:
        if user_interest == target_interest:
            matching_ids.append(user_id)
    return matching_ids

# Ids of the users whose interests include "Java"; shown as the cell output.
data_scientists_who_like('Java')

[0, 5, 9]

This works, but it has to examine the whole list of interests for every search. If we
have a lot of users and interests (or if we just want to do a lot of searches), we’re probably
better off building an index from interests to users:

In [14]:
from collections import defaultdict

# Index from interest to user_ids, so a lookup does not rescan every pair.
# keys are interests, values are lists of user_ids with that interest
user_ids_by_interest = defaultdict(list)

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

# Single-argument print(...) produces the same output on Python 2 and 3.
print(user_ids_by_interest)

defaultdict(<type 'list'>, {'Java': [0, 5, 9], 'neural networks': [7, 8], 'NoSQL': [1], 'Hadoop': [0, 9], 'Mahout': [7], 'Storm': [0], 'regression': [3, 4], 'statistics': [3, 6], 'probability': [3, 6], 'programming languages': [5], 'Python': [2, 3, 5], 'deep learning': [8], 'Haskell': [5], 'mathematics': [6], 'Spark': [0], 'numpy': [2], 'pandas': [2], 'artificial intelligence': [8], 'theory': [6], 'libsvm': [4], 'C++': [5], 'R': [3, 5], 'HBase': [0, 1], 'Postgres': [1], 'decision trees': [4], 'Big Data': [0, 8, 9], 'MongoDB': [1], 'scikit-learn': [2, 7], 'MapReduce': [9], 'machine learning': [4, 7], 'scipy': [2], 'statsmodels': [2], 'Cassandra': [0, 1]})


In [15]:
# And another index, from users to interests:
# keys are user_ids, values are lists of interests for that user_id
interests_by_user_id = defaultdict(list)

for user_id, interest in interests:
    interests_by_user_id[user_id].append(interest)

# Single-argument print(...) produces the same output on Python 2 and 3.
print(interests_by_user_id)

defaultdict(<type 'list'>, {0: ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra'], 1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'], 2: ['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas'], 3: ['R', 'Python', 'statistics', 'regression', 'probability'], 4: ['machine learning', 'regression', 'decision trees', 'libsvm'], 5: ['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages'], 6: ['statistics', 'probability', 'mathematics', 'theory'], 7: ['machine learning', 'scikit-learn', 'Mahout', 'neural networks'], 8: ['neural networks', 'deep learning', 'Big Data', 'artificial intelligence'], 9: ['Hadoop', 'Java', 'MapReduce', 'Big Data']})


Now it’s easy to find who has the most interests in common with a given user:
- Iterate over the user’s interests.
- For each interest, iterate over the other users with that interest.
- Keep count of how many times we see each other user.

In [16]:
def most_common_interests_with(user):
    """Count, per other user, how many interests they share with ``user``.

    Returns a Counter mapping user_id -> number of shared interests. The
    user's own id is excluded. A user with no recorded interests yields
    an empty Counter.

    The lookups use .get() rather than indexing: interests_by_user_id and
    user_ids_by_interest are defaultdicts, so indexing a missing key
    would insert an empty list into the shared index as a side effect of
    what should be a read-only query.
    """
    own_id = user["id"]
    return Counter(interested_user_id
                   for interest in interests_by_user_id.get(own_id, [])
                   for interested_user_id in user_ids_by_interest.get(interest, [])
                   if interested_user_id != own_id)

# 4) Salaries and Experience

In [17]:
# Anonymized (salary in dollars, tenure as a data scientist in years)
# pairs, one per user. Salary data is sensitive, so the pairs carry no
# user id.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

It seems pretty clear that people with more experience tend to earn more. How can
you turn this into a fun fact? Your first idea is to look at the average salary for each
tenure:

In [18]:
# keys are years, values are lists of the salaries for each tenure
salary_by_tenure = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

# Single-argument print(...) produces the same output on Python 2 and 3.
print(salary_by_tenure)

# keys are years, each value is average salary for that tenure
average_salary_by_tenure = {
    tenure: sum(salaries) / len(salaries)
    for tenure, salaries in salary_by_tenure.items()
}

print(average_salary_by_tenure)

defaultdict(<type 'list'>, {6.5: [69000], 7.5: [76000], 6: [76000], 10: [83000], 8.1: [88000], 4.2: [63000], 0.7: [48000], 8.7: [83000], 1.9: [48000], 2.5: [60000]})
{6.5: 69000.0, 7.5: 76000.0, 6: 76000.0, 10: 83000.0, 8.1: 88000.0, 4.2: 63000.0, 8.7: 83000.0, 0.7: 48000.0, 1.9: 48000.0, 2.5: 60000.0}


This turns out to be not particularly useful, as none of the users have the same tenure, which means we’re just reporting the individual users’ salaries.

In [19]:
# It might be more helpful to bucket the tenures:
def tenure_bucket(tenure):
    """Map a tenure in years to one of three bucket labels.

    Below 2 gives "less than two", from 2 up to (not including) 5 gives
    "between two and five", and anything else gives "more than five"
    (so exactly 5 lands in the last bucket).
    """
    # (exclusive upper bound, label), checked in ascending order
    for upper_bound, label in ((2, "less than two"),
                               (5, "between two and five")):
        if tenure < upper_bound:
            return label
    return "more than five"

In [20]:
# Then group together the salaries corresponding to each bucket:

# keys are tenure buckets, values are lists of salaries for that bucket
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

# And finally compute the average salary for each group:

# keys are tenure buckets, values are average salary for that bucket.
# .items() exists on both Python 2 and 3 (.iteritems() is Python-2-only),
# and the loop variable is called bucket_name so that it does not reuse
# the name of the tenure_bucket() function.
average_salary_by_bucket = {
    bucket_name: sum(salaries) / len(salaries)
    for bucket_name, salaries in salary_by_tenure_bucket.items()
}

print(average_salary_by_bucket)

{'more than five': 79166.66666666667, 'between two and five': 61500.0, 'less than two': 48000.0}


# 5) Paid Accounts

In [21]:
def predict_paid_or_unpaid(years_experience):
    """Return "paid" or "unpaid" from years of experience alone.

    Values below 3.0 and values of 8.5 or more give "paid"; values from
    3.0 up to (not including) 8.5 give "unpaid".
    """
    in_unpaid_band = (not years_experience < 3.0) and years_experience < 8.5
    if in_unpaid_band:
        return "unpaid"
    return "paid"

# 6) Topics of Interest

One simple (if not particularly exciting) way to find the most popular interests is
to count the words:
1. Lowercase each interest (since different users may or may not capitalize their
interests).
2. Split it into words.
3. Count the results.

In [23]:
# Count how often each lowercased, whitespace-separated word occurs
# across all interest strings. The user id of each pair is not needed.
words_and_counts = Counter(word
                           for _, interest in interests
                           for word in interest.lower().split())

# Single-argument print(...) produces the same output on Python 2 and 3.
print(words_and_counts)

Counter({'learning': 3, 'java': 3, 'python': 3, 'big': 3, 'data': 3, 'hbase': 2, 'regression': 2, 'cassandra': 2, 'statistics': 2, 'probability': 2, 'hadoop': 2, 'networks': 2, 'machine': 2, 'neural': 2, 'scikit-learn': 2, 'r': 2, 'nosql': 1, 'programming': 1, 'deep': 1, 'haskell': 1, 'languages': 1, 'decision': 1, 'artificial': 1, 'storm': 1, 'mongodb': 1, 'intelligence': 1, 'mathematics': 1, 'numpy': 1, 'pandas': 1, 'postgres': 1, 'libsvm': 1, 'trees': 1, 'scipy': 1, 'spark': 1, 'mapreduce': 1, 'c++': 1, 'theory': 1, 'statsmodels': 1, 'mahout': 1})


This makes it easy to list out the words that occur more than once:

In [29]:
# most_common() yields (word, count) pairs from highest count to lowest.
for word, count in words_and_counts.most_common():
    if count > 1:
        # One pre-formatted string prints as "word count" on both
        # Python 2 and 3; print(word, count) would print a tuple on 2.
        print("{0} {1}".format(word, count))

learning 3
java 3
python 3
big 3
data 3
hbase 2
regression 2
cassandra 2
statistics 2
probability 2
hadoop 2
networks 2
machine 2
neural 2
scikit-learn 2
r 2
