# DataSciencester

### Finding Key Connectors

In [5]:
# Members of the network; each user's id is its position in the name list.
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate([
        "Hero", "Dunn", "Sue", "Chi", "Thor",
        "Clive", "Hicks", "Devin", "Kate", "Klein",
    ])
]

# Friendships, each stored once as a pair of user ids.
friendship_pairs = [
    (0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
    (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9),
]

# Friend lists keyed by user id; every user starts with its own empty list.
friendships = dict((member["id"], []) for member in users)

In [6]:
# Friendship is mutual: register every pair in both users' friend lists.
for first_id, second_id in friendship_pairs:
    friendships[first_id].append(second_id)
    friendships[second_id].append(first_id)

In [7]:
def number_of_friends(user):
    """Return the length of the friend list stored for ``user["id"]``."""
    return len(friendships[user["id"]])

In [8]:
num_users = len(users)

# Each friendship is counted once per participant, i.e. twice overall.
total_connections = sum(map(number_of_friends, users))
avg_connections = total_connections / num_users

In [9]:
# Pair every user id with its friend count, ordered from most to fewest
# friends (ties keep the original user order because the sort is stable).
num_friends_by_id = sorted(
    ((user["id"], number_of_friends(user)) for user in users),
    key=lambda pair: pair[1],
    reverse=True,
)

print(num_friends_by_id)

[(1, 3), (2, 3), (3, 3), (5, 3), (8, 3), (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]


### Data Scientists You May Know

In [10]:
def foaf_ids_bad(user):
    """Naively list friend-of-a-friend ("foaf") ids for ``user``.

    Walks each friend of ``user`` and collects that friend's friends.
    Nothing is filtered or de-duplicated, so the result can contain the
    user's own id, ids of direct friends, and repeated ids.
    """
    foaf_ids = []
    for friend_id in friendships[user["id"]]:
        foaf_ids.extend(friendships[friend_id])
    return foaf_ids

In [11]:
foaf_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

* Includes user 0 twice, since Hero is indeed a friend of both of his friends.
* Includes user 3 twice, since Chi is reachable through two different friends.

In [12]:
from collections import Counter

def friend_of_friends(user):
    """Count mutual friends between ``user`` and each friend-of-a-friend.

    Args:
        user: a user dict with an ``"id"`` key present in ``friendships``.

    Returns:
        A ``Counter`` mapping each candidate user id to the number of
        ``user``'s friends who also list that candidate as a friend.  The
        user's own id and the ids of existing direct friends are excluded.
    """
    user_id = user["id"]
    friend_ids = friendships[user_id]
    # Build the exclusion set once so each candidate is checked in constant
    # time instead of re-scanning the friend list for every candidate.
    excluded_ids = set(friend_ids)
    excluded_ids.add(user_id)
    return Counter(
        foaf_id
        for friend_id in friend_ids
        for foaf_id in friendships[friend_id]
        if foaf_id not in excluded_ids
    )

In [14]:
print(friend_of_friends(users[3]))

Counter({0: 2, 5: 1})


* this tells Chi (id 3) she has two mutual friends with Hero (id 0) and only one mutual friend with Clive (id 5)

In [27]:
# (user_id, interest) pairs, one row per user.
# NOTE(review): user 3 lists "python" in lowercase while users 2 and 5 list
# "Python"; exact string comparisons treat these as different interests.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"), (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"), (1, "Postgres"),
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"), (2, "numpy"), (2, "statsmodels"), (2, "pandas"),
    (3, "R"), (3, "python"), (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"), (4, "libsvm"),
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"), (5, "Haskell"), (5, "programming languages"),
    (6, "statistics"), (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"), (7, "neural networks"),
    (8, "neural networks"), (8, "deep learning"), (8, "Big Data"), (8, "artificial intelligence"),
    (9, "Hadoop"), (9, "Java"), (9, "MapReduce"), (9, "Big Data"),
]

In [28]:
def data_scientists_who_like(target_interest):
    """Return the ids of users whose listed interest equals ``target_interest``.

    The comparison is an exact string match, and ids come back in the order
    the matching entries appear in ``interests``.
    """
    return [uid for uid, liked in interests if liked == target_interest]

* This works, but has to examine the whole list of interests for every search.
* instead, we can build an index from interests to users

In [29]:
from collections import defaultdict

# Index from interest -> list of the user ids that list that interest.
user_ids_by_interest = defaultdict(list)

for uid, topic in interests:
    user_ids_by_interest[topic].append(uid)

* and another from users to interests:

In [30]:
# Index from user id -> list of the interests listed for that user.
interests_by_user_id = defaultdict(list)

for uid, topic in interests:
    interests_by_user_id[uid].append(topic)

##### Now we can find who has the most interests in common with a given user:
* iterate over the user's interests
* for each interest, iterate over the other users with that interest
* keep count of how many times we see each other user

In [31]:
def most_common_interests_with(user):
    """Count how many interests each other user shares with ``user``.

    For every interest listed for ``user``, every other user id listed under
    that interest is tallied once.  Returns a ``Counter`` keyed by user id;
    ``user``'s own id is never included.
    """
    own_id = user["id"]
    return Counter(
        other_id
        for interest in interests_by_user_id[own_id]
        for other_id in user_ids_by_interest[interest]
        if other_id != own_id
    )

In [33]:
print(most_common_interests_with(users[5]))

Counter({2: 1, 3: 1, 0: 1, 9: 1})


### Salaries and Experience

In [34]:
# (salary, tenure) pairs, one per line.
salaries_and_tenures = [
    (83000, 8.7),
    (80000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

In [38]:
# Group salaries by exact tenure: tenure (years) -> list of salaries.
salary_by_tenure = defaultdict(list)

for pay, years in salaries_and_tenures:
    salary_by_tenure[years].append(pay)

In [40]:
# Map each distinct tenure (years) to the mean of the salaries recorded for it.
average_salary_by_tenure = {
    years: sum(pay_list) / len(pay_list)
    for years, pay_list in salary_by_tenure.items()
}

In [41]:
print(average_salary_by_tenure)

{8.7: 83000.0, 8.1: 80000.0, 0.7: 48000.0, 6: 76000.0, 6.5: 69000.0, 7.5: 76000.0, 2.5: 60000.0, 10: 83000.0, 1.9: 48000.0, 4.2: 63000.0}


* not very useful since nobody has exact same tenure
* might be helpful to bucket the tenures

In [42]:
def tenure_bucket(tenure):
    """Return the label of the experience bucket that ``tenure`` falls into.

    Values below 2 map to "less than two", values from 2 up to (but not
    including) 5 map to "between two and five", and everything else maps to
    "more than five".
    """
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"

In [45]:
# Group salaries by bucket label: tenure bucket -> list of salaries in it.
salary_by_tenure_bucket = defaultdict(list)
for pay, years in salaries_and_tenures:
    salary_by_tenure_bucket[tenure_bucket(years)].append(pay)

In [46]:
# Mean salary per tenure bucket.  The loop variable is named ``bucket`` so it
# does not shadow the ``tenure_bucket`` function.
average_salary_by_bucket = {
    bucket: sum(bucket_salaries) / len(bucket_salaries)
    for bucket, bucket_salaries in salary_by_tenure_bucket.items()
}

In [47]:
print(average_salary_by_bucket)

{'more than five': 77833.33333333333, 'less than two': 48000.0, 'between two and five': 61500.0}


* "Data Scientists with more than 5 years' experience earn 65% more than data scientists with little or no experience"

### Paid Accounts

* We notice there seems to be a correspondence between years of experience and paid accounts:<br>
    0.7 paid<br>
    1.9 unpaid<br>
    2.5 paid<br>
    4.2 unpaid<br>
    6.0 unpaid<br>
    6.5 unpaid<br>
    7.5 unpaid<br>
    8.1 unpaid<br>
    8.7 paid<br>
    10.0 paid<br>

In [50]:
#very basic prediction function
def predict_paid_or_unpaid(years_experience):
    """Predict "paid" or "unpaid" from years of experience.

    Returns "unpaid" for values from 3.0 up to (but not including) 8.5, and
    "paid" for anything below 3.0 or at/above 8.5.
    """
    in_unpaid_band = 3.0 <= years_experience < 8.5
    return "unpaid" if in_unpaid_band else "paid"

### Topics of Interest

One simple way to find the most popular interests is to count the words:
* lowercase each interest
* split it into words
* count the results

In [52]:
# Tally every whitespace-separated, lowercased word across all interests.
words_and_counts = Counter(
    word
    for _, interest in interests
    for word in interest.lower().split()
)

In [53]:
# Print only the words that occur more than once, most frequent first.
for word, count in words_and_counts.most_common():
    if count <= 1:
        continue
    print(word, count)

big 3
data 3
java 3
python 3
learning 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
machine 2
neural 2
networks 2
