In [94]:
import babypandas as bpd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from notebook.services.config import ConfigManager

# Store display settings (width, height, scrolling) under the "livereveal" config section.
# NOTE(review): "livereveal" looks like the section read by the RISE slideshow
# extension, so this only affects slideshow presentation — confirm.
cm = ConfigManager()
cm.update(
    "livereveal", {
        'width': 1500,
        'height': 700,
        "scroll": True,
})

{'width': 1500, 'height': 700, 'scroll': True}

# DSC 10 Discussion Week 4
---

# Practice With Merge

Let's create two tables to practice merge on

In [155]:
# A small table with one row per person.
# `city` is the column we will later match against the `cities` table.
people = bpd.DataFrame().assign(
    name = ["kyle","jill","cole","alex"],
    age  = [24,22,21,24],
    city = ["San Diego","LA","San Francisco","Irvine"]
    )
people

Unnamed: 0,name,age,city
0,kyle,24,San Diego
1,jill,22,LA
2,cole,21,San Francisco
3,alex,24,Irvine


In [156]:
# A table of cities with a popular food for each.
# Careful: the city is stored in a column labelled `name`, the same label
# that `people` uses for a person's name.
cities = bpd.DataFrame().assign(
    name = ["San Diego", "LA", "San Francisco","Denver","New York"],
    popular_food  = ["California Burrito", "Tacos", "Sourdough", "Denver Omelete", "Cheesecake"]
    )
cities

Unnamed: 0,name,popular_food
0,San Diego,California Burrito
1,LA,Tacos
2,San Francisco,Sourdough
3,Denver,Denver Omelete
4,New York,Cheesecake


## How to merge people with cities? How many rows will there be?

In [152]:
# Merging on `name` in both tables compares people's names with city names.
# No value appears in both columns, so the result has 0 rows.
people.merge(cities, left_on='name', right_on='name')

Unnamed: 0,name,age,city,popular_food


In [157]:
# Match each person's `city` against the `name` column of `cities`.
# Only cities present in both tables survive (Irvine, Denver and New York drop out) -> 3 rows.
# Both inputs have a `name` column, so the result labels them
# `name_x` (from people) and `name_y` (from cities).
people.merge(cities, left_on='city', right_on='name')

Unnamed: 0,name_x,age,city,name_y,popular_food
0,kyle,24,San Diego,San Diego,California Burrito
1,jill,22,LA,LA,Tacos
2,cole,21,San Francisco,San Francisco,Sourdough


In [102]:
# After the merge `name_y` just repeats `city`, so drop the redundant column.
people.merge(cities, left_on='city', right_on='name').drop(columns = "name_y")

Unnamed: 0,name_x,age,city,popular_food
0,kyle,24,San Diego,California Burrito
1,jill,22,LA,Tacos
2,cole,21,San Francisco,Sourdough


## How to join people with birthdays? How many rows will there be?

In [162]:
people

Unnamed: 0,name,age,city
0,kyle,24,San Diego
1,jill,22,LA
2,cole,21,San Francisco
3,alex,24,Irvine


In [159]:
# A table of ages and why each one is notable.
# Age 21 appears twice here, which affects the row count when merging on `age`.
birthdays = bpd.DataFrame().assign(
    age = [21,21,22,24],
    importance = ["Legal Drinking Age", "Officially an Adult", "Taylor Swift Song", "Kyle's Age"]
    )
birthdays

Unnamed: 0,age,importance
0,21,Legal Drinking Age
1,21,Officially an Adult
2,22,Taylor Swift Song
3,24,Kyle's Age


In [161]:
# Merge on the shared `age` column; every matching pair of rows is kept:
#   21 -> 2 rows in birthdays x 1 row in people = 2
#   22 -> 1 x 1                                 = 1
#   24 -> 1 row in birthdays x 2 rows in people = 2
# for 5 rows in total.
birthdays.merge(people, left_on='age', right_on='age')

Unnamed: 0,age,importance,name,city
0,21,Legal Drinking Age,cole,San Francisco
1,21,Officially an Adult,cole,San Francisco
2,22,Taylor Swift Song,jill,LA
3,24,Kyle's Age,kyle,San Diego
4,24,Kyle's Age,alex,Irvine


In [104]:
# Swapping which table is on the left gives the same 5 matches;
# only the order of the rows and columns changes.
people.merge(birthdays, left_on='age', right_on='age')

Unnamed: 0,name,age,city,importance
0,kyle,24,San Diego,Kyle's Age
1,alex,24,Irvine,Kyle's Age
2,jill,22,LA,Taylor Swift Song
3,cole,21,San Francisco,Legal Drinking Age
4,cole,21,San Francisco,Officially an Adult


# Groupby & conditionals, loops on dataset

# Olympic Athletes
---

From kaggle user Randi H Griffin:
>This is a historical dataset on the modern Olympic Games, including all the Games from Athens 1896 to Rio 2016. I scraped this data from www.sports-reference.com in May 2018. The R code I used to scrape and wrangle the data is on GitHub. I recommend checking my kernel before starting your own analysis.
>
>Note that the Winter and Summer Games were held in the same year up until 1992. After that, they staggered them such that Winter Games occur on a four year cycle starting with 1994, then Summer in 1996, then Winter in 1998, and so on. A common mistake people make when analyzing this data is to assume that the Summer and Winter Games have always been staggered.
Content
>
>The file athlete_events.csv contains 271116 rows and 15 columns. Each row corresponds to an individual athlete competing in an individual Olympic event (athlete-events). The columns are:
>
>1. ID - Unique number for each athlete  
>2. Name - Athlete's name  
>3. Sex - M or F  
>4. Age - Integer  
>5. Height - In centimeters  
>6. Weight - In kilograms  
>7. Team - Team name  
>8. NOC - National Olympic Committee 3-letter code  
>9. Games - Year and season  
>10. Year - Integer  
>11. Season - Summer or Winter  
>12. City - Host city  
>13. Sport - Sport  
>14. Event - Event  
>15. Medal - Gold, Silver, Bronze, or NA  


In [177]:
# Load the athlete-events table from a CSV file (path is relative to the notebook)
# and list its column labels.
data = bpd.read_csv("data/athlete_events.csv")
data.columns

Index(['ID', 'Name', 'Sex', 'Age', 'Height', 'Weight', 'Team', 'NOC', 'Games',
       'Year', 'Season', 'City', 'Sport', 'Event', 'Medal'],
      dtype='object')

# Something familiar, let's assign points to each country
---

Let's say we're assigning points to each country based on the number of Golds, Silvers, and Bronzes they've won.

Medals are worth the following number of points:

<pre>
  Gold    +5 pts
  Silver  +3 pts
  Bronze  +2 pts
  nan     0 pts
</pre>

Let's do it a different way than we did in Homework 3.

In [183]:
def medal_to_points(medal):
    """Return the number of points a medal is worth.

    Gold is worth 5, Silver 3 and Bronze 2. Any other value
    (a missing medal, an unrecognised string, ...) is worth 0.
    """
    point_table = (("Gold", 5), ("Silver", 3), ("Bronze", 2))
    for medal_name, value in point_table:
        if medal == medal_name:
            return value
    # Nothing matched: not a medal we award points for.
    return 0

In [184]:
print(medal_to_points("Gold"))
print(medal_to_points("Bronze"))
print(medal_to_points("Arda"))

5
2
0


In [185]:
def medal_to_points_v2(medal):
    """Return the points for a medal using two parallel lists.

    The medal's position in the list of medal names is used to look up
    its value in the list of point values. Anything that is not a known
    medal is worth 0.
    """
    medal_names = ["Gold", "Silver", "Bronze"]
    point_values = [5, 3, 2]
    if medal not in medal_names:
        return 0
    # Same position in both lists: names[i] is worth values[i].
    return point_values[medal_names.index(medal)]

In [186]:
print(medal_to_points_v2("Gold"))
print(medal_to_points_v2("Bronze"))
print(medal_to_points_v2("Arda"))

5
2
0


Okay, now we need to apply that function to our table.

What does `apply` return again?  And how will we use what it returns?

In [187]:
# Keep only the rows whose Year is 2010 or later.
data_after = data[data.get("Year") >= 2010]

In [188]:
# `apply` calls medal_to_points on every entry of the Medal column and
# collects the results (one point value per row).
points_column = data_after.get("Medal").apply(medal_to_points)
# `assign` returns a new table with those values attached as a column named Points;
# data_after itself is not modified.
data_with_points = data_after.assign(Points = points_column)
data_with_points

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,Points
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,0
40,16,Juhamatti Tapio Aaltonen,M,28.0,184.0,85.0,Finland,FIN,2014 Winter,2014,Winter,Sochi,Ice Hockey,Ice Hockey Men's Ice Hockey,Bronze,2
80,22,Andreea Aanei,F,22.0,170.0,125.0,Romania,ROU,2016 Summer,2016,Summer,Rio de Janeiro,Weightlifting,Weightlifting Women's Super-Heavyweight,,0
98,34,Jamale (Djamel-) Aarrass (Ahrass-),M,30.0,187.0,76.0,France,FRA,2012 Summer,2012,Summer,London,Athletics,"Athletics Men's 1,500 metres",,0
134,48,Abdelhak Aatakni,M,24.0,,64.0,Morocco,MAR,2012 Summer,2012,Summer,London,Boxing,Boxing Men's Light-Welterweight,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271090,135557,Dominik ycki,M,38.0,192.0,95.0,Poland,POL,2012 Summer,2012,Summer,London,Sailing,Sailing Men's Two Person Keelboat,,0
271091,135558,ukasz Tomasz ygado,M,32.0,200.0,89.0,Poland,POL,2012 Summer,2012,Summer,London,Volleyball,Volleyball Men's Volleyball,,0
271110,135568,Olga Igorevna Zyuzkova,F,33.0,171.0,69.0,Belarus,BLR,2016 Summer,2016,Summer,Rio de Janeiro,Basketball,Basketball Women's Basketball,,0
271112,135570,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",,0


If we only care about the country and the points, do we need to work with this entire table?

In [189]:
# Select relevant columns (country code and points),
# then sort so the rows with the most points come first.
country_points = data_with_points.get(["NOC", "Points"]).sort_values(by = "Points", ascending = False)
country_points

Unnamed: 0,NOC,Points
108902,GER,5
139368,CHN,5
34313,AUS,5
243221,CAN,5
9417,USA,5
...,...,...
99460,AUS,0
99453,USA,0
99452,USA,0
99433,CAN,0


Now, how do we find the total amount of points scored by each country?

In [190]:
# Group by country: add up the Points of all rows that share an NOC.
scores = country_points.groupby("NOC").sum()
# Show the largest totals first. The sorted table is only displayed here;
# it is not stored back into `scores`.
scores.sort_values(by = "Points", ascending=False)

Unnamed: 0_level_0,Points
NOC,Unnamed: 1_level_1
USA,2468
RUS,1213
GER,1164
CAN,1055
GBR,995
...,...
MRI,0
MOZ,0
MON,0
GAM,0


# Something familiar, top 5 countries entries count?
---

Let's choose 5 countries and only work with their data. We should use the NOC column. We are looking to see how many entries these 5 countries have in total.

In [195]:
included_countries = ["USA", "CHN", "RUS", "GBR", "GER"]

We have already added points to the entire dataset based on the Medal placement, so let's just get our countries from that `data_with_points` table.

In [196]:
def in_included(country):
    """Report whether `country` is one of the codes in `included_countries`."""
    is_selected = country in included_countries
    return is_selected
print(in_included("USA"))
print(in_included("CAN"))

True
False


## Solution #1 with apply

In [200]:
# Solution #1
# apply(in_included) gives one True/False per row (is this row's NOC in our list?);
# indexing the table with those booleans keeps only the True rows.
countries = data_with_points[data_with_points.get("NOC").apply(in_included)]
countries

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,Points
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,0
164,67,Mariya Vasilyevna Abakumova (-Tarabina),F,26.0,179.0,80.0,Russia,RUS,2012 Summer,2012,Summer,London,Athletics,Athletics Women's Javelin Throw,,0
276,145,Jeremy Abbott,M,24.0,175.0,70.0,United States,USA,2010 Winter,2010,Winter,Vancouver,Figure Skating,Figure Skating Men's Singles,,0
277,145,Jeremy Abbott,M,28.0,175.0,70.0,United States,USA,2014 Winter,2014,Winter,Sochi,Figure Skating,Figure Skating Men's Singles,,0
278,145,Jeremy Abbott,M,28.0,175.0,70.0,United States,USA,2014 Winter,2014,Winter,Sochi,Figure Skating,Figure Skating Mixed Team,Bronze,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270943,135489,Anastasiya Valeryevna Zuyeva-Fesikova,F,26.0,182.0,71.0,Russia,RUS,2016 Summer,2016,Summer,Rio de Janeiro,Swimming,Swimming Women's 4 x 100 metres Medley Relay,,0
270988,135508,Vera Igorevna Zvonaryova,F,27.0,172.0,59.0,Russia,RUS,2012 Summer,2012,Summer,London,Tennis,Tennis Women's Singles,,0
271019,135525,Martin Zwicker,M,29.0,175.0,64.0,Germany,GER,2016 Summer,2016,Summer,Rio de Janeiro,Hockey,Hockey Men's Hockey,Bronze,2
271023,135528,Marc Zwiebler,M,28.0,181.0,75.0,Germany,GER,2012 Summer,2012,Summer,London,Badminton,Badminton Men's Singles,,0


In [201]:
countries.groupby("NOC").count().get("Points")

NOC
CHN    1214
GBR    1318
GER    1576
RUS    1587
USA    2116
Name: Points, dtype: int64

## Solution #2 with merge

In [202]:
# A one-column table holding the NOC codes we want to keep,
# so that the filtering can be expressed as a merge.
inc_countries = bpd.DataFrame().assign(NOC = included_countries)
inc_countries

Unnamed: 0,NOC
0,USA
1,CHN
2,RUS
3,GBR
4,GER


In [205]:
# Merging on NOC keeps only the rows whose NOC also appears in inc_countries.
# Note: this rebinds `countries` from Solution #1. The row order and index differ,
# but the per-NOC counts computed below come out the same.
countries = data_with_points.merge(inc_countries, left_on='NOC', right_on='NOC')
countries

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,Points
0,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,0
1,1464,Ai Yanhan,F,14.0,168.0,54.0,China,CHN,2016 Summer,2016,Summer,Rio de Janeiro,Swimming,Swimming Women's 200 metres Freestyle,,0
2,1464,Ai Yanhan,F,14.0,168.0,54.0,China,CHN,2016 Summer,2016,Summer,Rio de Janeiro,Swimming,Swimming Women's 4 x 200 metres Freestyle Relay,,0
3,6376,Ba Dexin,M,23.0,185.0,80.0,China,CHN,2014 Winter,2014,Winter,Sochi,Curling,Curling Men's Curling,,0
4,6847,Bai Anqi,F,19.0,164.0,59.0,China,CHN,2012 Summer,2012,Summer,London,Swimming,Swimming Women's 200 metres Backstroke,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7806,133187,Andrew Young,M,17.0,193.0,83.0,Great Britain,GBR,2010 Winter,2010,Winter,Vancouver,Cross Country Skiing,Cross Country Skiing Men's 15 kilometres,,0
7807,133187,Andrew Young,M,17.0,193.0,83.0,Great Britain,GBR,2010 Winter,2010,Winter,Vancouver,Cross Country Skiing,Cross Country Skiing Men's Team Sprint,,0
7808,133187,Andrew Young,M,21.0,193.0,83.0,Great Britain,GBR,2014 Winter,2014,Winter,Sochi,Cross Country Skiing,Cross Country Skiing Men's Sprint,,0
7809,133187,Andrew Young,M,21.0,193.0,83.0,Great Britain,GBR,2014 Winter,2014,Winter,Sochi,Cross Country Skiing,Cross Country Skiing Men's 15 kilometres,,0


In [206]:
countries.groupby("NOC").count().get("Points")

NOC
CHN    1214
GBR    1318
GER    1576
RUS    1587
USA    2116
Name: Points, dtype: int64

## Groupby with multiple columns

For each country (among all countries), for each sport played by that country, count how many data points we have.

|NOC|Sport|# Entries|
|--------|--------|--------|
|USA|Basketball |10 |
|USA|Swimming |5 | 
|USA|Curling |0 |
|CAN|Swimming |0 | 
|CAN|Curling |6|
|...|... |...|

etc.

In [212]:
# For every (NOC, Sport) pair that occurs in the data, count the rows in that group.
# After count() each remaining column holds a per-group count; ID is simply
# the column we read the count from.
# The result is a Series labelled by both NOC and Sport.
( 
    data_with_points.groupby(["NOC", "Sport"])
                  .count()
                  .get("ID")
)

NOC  Sport        
AFG  Athletics         4
     Boxing            1
     Judo              2
     Taekwondo         2
ALB  Alpine Skiing     4
                      ..
ZIM  Football         17
     Rowing            4
     Shooting          1
     Swimming          6
     Triathlon         1
Name: ID, Length: 2626, dtype: int64

In [216]:
# Same counts as above, but reset_index() turns the NOC/Sport group labels back
# into ordinary columns, giving a flat table. The ID column holds the row count.
( 
    data_with_points.groupby(["NOC", "Sport"])
                  .count()
                  .reset_index()
                  .get(["NOC", "Sport", "ID"])
)

Unnamed: 0,NOC,Sport,ID
0,AFG,Athletics,4
1,AFG,Boxing,1
2,AFG,Judo,2
3,AFG,Taekwondo,2
4,ALB,Alpine Skiing,4
...,...,...,...
2621,ZIM,Football,17
2622,ZIM,Rowing,4
2623,ZIM,Shooting,1
2624,ZIM,Swimming,6


# Probability and Experiments

## Prob. Problem 1

An easy question, probability-wise: we roll a fair six-sided die (fair meaning all outcomes are equally likely). What is the probability that the result is divisible by 3?

In [218]:
# probability = favorable outcomes / all outcomes
# A six-sided die shows the faces 1 through 6 (there is no 0 face).
# favorable outcomes (divisible by 3): 3, 6
# all outcomes: 1, 2, 3, 4, 5, 6
div3_probability_d6 = 2 / 6
div3_probability_d6

0.3333333333333333

How about a d20 (a 20-sided die, with sides ranging from 1 to 20)? Let's count the outcomes with Python:

In [219]:
# range() stops one short of its end value, hence 20 + 1 to include the 20 face.
die_values = range(1, 20 + 1)
# Materialise the range as a list so every face is displayed.
list(die_values)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [220]:
# Walk over every face of the die, echoing and tallying the multiples of 3.
count = 0
for side in die_values:
    if side % 3 != 0:
        continue  # not divisible by 3: nothing to record
    print(side, end=', ')
    count += 1
count

3, 6, 9, 12, 15, 18, 

6

Let's find the probability according to our counts

In [221]:
total_count = len(die_values)
div3_probability = count / total_count
div3_probability

0.3

For many problems we face, we can't plausibly count all the outcomes this way. 

In such cases we have to run experiments to estimate empirical probability values.

 Let's try to find the result we just obtained experimentally, without counting every possible outcome.

In [241]:
import random

die_values = range(1,20 + 1)
def div_by3_experiment(runs):
    """Simulate `runs` rolls of the die and count the rolls divisible by 3.

    The faces are taken from the notebook-level `die_values` (a d20 here),
    and each roll is drawn independently with replacement.

    Parameters
    ----------
    runs : int
        Number of simulated rolls.

    Returns
    -------
    int
        How many of the simulated rolls were divisible by 3. Divide by
        `runs` to get the empirical probability.
    """
    samples = np.random.choice(die_values, runs, replace=True)
    positive_outcomes = samples % 3 == 0
    # Count the True entries inside NumPy. The built-in sum() would walk the
    # array one element at a time in Python, which is slow for a million rolls.
    return int(np.count_nonzero(positive_outcomes))

In [242]:
div_by3_experiment(1)

0

In [243]:
div_by3_experiment(10)

2

In [244]:
div_by3_experiment(100)

31

In [245]:
runs = 10_000
empirical_prob = div_by3_experiment(runs) / runs
empirical_prob

0.3069

In [246]:
runs = 1_000_000
empirical_prob = div_by3_experiment(runs) / runs
empirical_prob

0.300756

As you can see, we are getting closer and closer to the correct value of 0.3

## Prob. Problem 2

Different problem, we are rolling two D6 dice (each has 6 sides). 

What is the probability that the sum of the numbers will be even?

Let's first think mathematically and count the results. 

To have an even number, we should either add two even numbers, or two odd numbers.

Two possibilities:
* Both are even: 3 * 3 = 9 possibilities (each die shows 2, 4, or 6)
* Both are odd:  3 * 3 = 9 possibilities (each die shows 1, 3, or 5)


9 + 9 = 18 possibilities (half of the total 36)

Let's use python to confirm by counting

In [247]:
die_values = range(1,6+1)

def even_count(die_values):
    """Count the ordered pairs (die1, die2) from `die_values` whose sum is even.

    Every face is paired with every face (including itself), so a die with
    k faces produces k * k pairs to examine.
    """
    return sum(
        1
        for first in die_values
        for second in die_values
        if (first + second) % 2 == 0
    )

even_count(die_values)

18

In [248]:
total_outcomes = len(die_values) ** 2
probability = even_count(die_values) / total_outcomes
probability

0.5

What if both dice were D20 (20 sides):

We are looking for sums that are even and greater than or equal to 30

In [267]:
def even_count_modified(die_values, lower_lim):
    """Count the ordered pairs from `die_values` whose sum is even and at least `lower_lim`.

    Every face is paired with every face (including itself); a pair is
    counted only when its sum is both >= lower_lim and divisible by 2.
    """
    pair_sums = (first + second for first in die_values for second in die_values)
    return sum(
        1
        for total in pair_sums
        if total >= lower_lim and total % 2 == 0
    )

die_values = range(1, 20+1)
lower_limit = 30
e_count = even_count_modified(die_values, lower_limit)
e_count

36

In [268]:
total_outcomes = len(die_values) ** 2
probability = e_count / total_outcomes
probability

0.09

Let's try to get this result experimentally

In [270]:
def even_exp(die_values, runs, lower_lim):
    """Simulate rolling two dice `runs` times and count the qualifying sums.

    A roll qualifies when the sum of the two dice is even and is at least
    `lower_lim`. Both dice are drawn independently, with replacement, from
    `die_values`.

    Parameters
    ----------
    die_values : sequence of int
        The faces of each die.
    runs : int
        Number of simulated two-dice rolls.
    lower_lim : int
        Smallest sum that counts as a success.

    Returns
    -------
    int
        Number of qualifying rolls. Divide by `runs` to get the empirical
        probability.
    """
    die1_samples = np.random.choice(die_values, runs, replace=True)
    die2_samples = np.random.choice(die_values, runs, replace=True)
    dice_sum = die1_samples + die2_samples
    # `&` combines the two boolean arrays element by element; `and` would
    # raise an error on arrays.
    positive_outcomes = (dice_sum % 2 == 0) & (dice_sum >= lower_lim)
    # Count the True entries inside NumPy. The built-in sum() would walk the
    # array one element at a time in Python, which is slow for a million rolls.
    return int(np.count_nonzero(positive_outcomes))

In [271]:
runs = 1000000
e_count = even_exp(die_values, runs, lower_limit)
probability = e_count / runs
probability

0.090167

That's pretty close!

# One last thing before we go...

Warning: Be careful of the difference between:
* logical: and, bitwise: &
* logical: or,  bitwise: |

Let's go over their difference

In [272]:
True and False

False

In [273]:
# `or` needs a single True/False from its left operand, but an array with several
# elements has no single truth value, so NumPy raises a ValueError.
# The error is caught and printed so that the demonstration does not stop
# "Run All" before the next cell.
try:
    np.array([True, False, False]) or np.array([False, True, False])
except ValueError as err:
    print("ValueError:", err)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [274]:
np.array([True, False, False]) | np.array([False, True, False])

array([ True,  True, False])

* Bitwise operations (`&`, `|`) compare two boolean arrays element by element
* Logical operations (`and`, `or`) compare two single boolean values (not arrays)