In [54]:
import babypandas as bpd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from notebook.services.config import ConfigManager

# Update the "livereveal" config section (slideshow settings; presumably the
# RISE extension) with the slide width, height, and scrolling flag.
cm = ConfigManager()
cm.update(
    "livereveal",
    dict(width=1500, height=700, scroll=True),
)

{'width': 1500, 'height': 700, 'scroll': True}

# DSC 10 Discussion Week 4
---

### Merge, GroupBy, Conditionals, Iteration, Simulation

# College Scorecard

http://collegescorecard.ed.gov

In [66]:
# Read the basic College Scorecard table and index it by UNITID.
colleges = (
    bpd.read_csv('data/csc_basic.csv')
    .set_index('UNITID')
)
colleges

Unnamed: 0_level_0,INSTNM,CITY,STABBR,PREDDEG,CONTROL,UGDS,RELAFFIL,DISTANCEONLY,UGDS_NRA,PCTPELL,PCTFLOAN
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100654,Alabama A & M University,Normal,AL,3,1,4616.0,0,0.0,0.0065,0.7039,0.7667
100663,University of Alabama at Birmingham,Birmingham,AL,3,1,12047.0,0,0.0,0.0179,0.3525,0.5179
100690,Amridge University,Montgomery,AL,3,2,293.0,1,1.0,0.0000,0.6971,0.8436
100706,University of Alabama in Huntsville,Huntsville,AL,3,1,6346.0,0,0.0,0.0303,0.2949,0.4312
100724,Alabama State University,Montgomery,AL,3,1,4704.0,0,0.0,0.0159,0.7815,0.8113
...,...,...,...,...,...,...,...,...,...,...,...
489830,Arthur's Beauty College-Jonesboro,Jonesboro,AR,1,3,71.0,0,0.0,0.0000,0.7525,0.7228
489900,Palm Beach Academy of Health & Beauty-Distinct...,Lauderhill,FL,1,3,27.0,0,0.0,0.0000,0.0000,0.0000
489937,Piedmont International University,Winston-Salem,NC,3,2,295.0,1,0.0,0.0034,0.3634,0.3343
490009,Spartan College of Aeronautics and Technology,Westminster,CO,2,3,85.0,0,0.0,0.0000,0.5754,0.4804


# Question (review)

Which state has the most colleges/universities?

In [56]:
#...
# Group by state abbreviation; count() tallies the non-missing entries of every
# column per state. CITY is kept as the count column, and the ascending sort
# puts the state with the most colleges in the last row.
colleges_per_state = (
    colleges
    .groupby('STABBR')
    .count()
    .sort_values(by='CITY')
    .get(['CITY'])
)
colleges_per_state

Unnamed: 0_level_0,CITY
STABBR,Unnamed: 1_level_1
MP,1
VI,1
PW,1
AS,1
MH,1
...,...
PA,339
FL,358
TX,400
NY,412


# Question

Which state has the largest number of colleges *per person*?

In [57]:
# Load the population table: one row per (state/region, ages, year) combination.
pops = bpd.read_csv('data/state-population.csv')
pops

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0
...,...,...,...,...
2539,USA,total,2010,309326295.0
2540,USA,under18,2011,73902222.0
2541,USA,total,2011,311582564.0
2542,USA,under18,2012,73708179.0


In [58]:
#... pops_by_state
# Keep only the rows holding the 2012 total-population figure.
pops_by_state = pops[
    (pops.get('year') == 2012)
    & (pops.get('ages') == 'total')
]
pops_by_state

Unnamed: 0,state/region,ages,year,population
1,AL,total,2012,4817528.0
95,AK,total,2012,730307.0
97,AZ,total,2012,6551149.0
191,AR,total,2012,2949828.0
193,CA,total,2012,37999878.0
...,...,...,...,...
2304,WV,total,2012,1856680.0
2399,WI,total,2012,5724554.0
2400,WY,total,2012,576626.0
2495,PR,total,2012,3651545.0


In [59]:
#... with_per_person
# Pair each state's college count with its population: the left table is
# matched on its index (STABBR), the right table on its 'state/region' column.
with_pops = colleges_per_state.merge(
    pops_by_state,
    left_index=True,
    right_on='state/region',
)

# Colleges per resident, sorted ascending so the largest ratio is the last row.
per_person = (
    with_pops
    .assign(per_person=with_pops.get('CITY') / with_pops.get('population'))
    .sort_values(by='per_person')
)
per_person

Unnamed: 0,CITY,state/region,ages,year,population,per_person
95,9,AK,total,2012,730307.0,0.000012
960,86,MD,total,2012,5884868.0,0.000015
1344,41,NV,total,2012,2754354.0,0.000015
2111,400,TX,total,2012,26060796.0,0.000015
2303,108,WA,total,2012,6895318.0,0.000016
...,...,...,...,...,...,...
1728,130,OK,total,2012,3815780.0,0.000034
2207,22,VT,total,2012,625953.0,0.000035
2495,139,PR,total,2012,3651545.0,0.000038
2304,73,WV,total,2012,1856680.0,0.000039


# Question

What if we had set the index of `pops_by_state`?

In [223]:
# Same rows as pops_by_state, but with 'state/region' moved into the index.
pops_with_index = (
    pops_by_state
    .set_index('state/region')
)
pops_with_index

Unnamed: 0_level_0,ages,year,population
state/region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AL,total,2012,4817528.0
AK,total,2012,730307.0
AZ,total,2012,6551149.0
AR,total,2012,2949828.0
CA,total,2012,37999878.0
...,...,...,...
WV,total,2012,1856680.0
WI,total,2012,5724554.0
WY,total,2012,576626.0
PR,total,2012,3651545.0


# Question

The `PREDDEG` column specifies the primary degree type offered (Associate's, Bachelor's, etc.). How many institutions of each type does each state have?

In [61]:
# Display the colleges table again; STABBR and PREDDEG are the columns grouped on next.
colleges

Unnamed: 0_level_0,INSTNM,CITY,STABBR,PREDDEG,CONTROL,UGDS,RELAFFIL,DISTANCEONLY,UGDS_NRA,PCTPELL,PCTFLOAN
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100654,Alabama A & M University,Normal,AL,3,1,4616.0,0,0.0,0.0065,0.7039,0.7667
100663,University of Alabama at Birmingham,Birmingham,AL,3,1,12047.0,0,0.0,0.0179,0.3525,0.5179
100690,Amridge University,Montgomery,AL,3,2,293.0,1,1.0,0.0000,0.6971,0.8436
100706,University of Alabama in Huntsville,Huntsville,AL,3,1,6346.0,0,0.0,0.0303,0.2949,0.4312
100724,Alabama State University,Montgomery,AL,3,1,4704.0,0,0.0,0.0159,0.7815,0.8113
...,...,...,...,...,...,...,...,...,...,...,...
489830,Arthur's Beauty College-Jonesboro,Jonesboro,AR,1,3,71.0,0,0.0,0.0000,0.7525,0.7228
489900,Palm Beach Academy of Health & Beauty-Distinct...,Lauderhill,FL,1,3,27.0,0,0.0,0.0000,0.0000,0.0000
489937,Piedmont International University,Winston-Salem,NC,3,2,295.0,1,0.0,0.0034,0.3634,0.3343
490009,Spartan College of Aeronautics and Technology,Westminster,CO,2,3,85.0,0,0.0,0.0000,0.5754,0.4804


In [63]:
#...
# One row per (STABBR, PREDDEG) pair; every remaining column holds the number
# of non-missing values in that group.
(
    colleges
    .groupby(['STABBR', 'PREDDEG'])
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,INSTNM,CITY,CONTROL,UGDS,RELAFFIL,DISTANCEONLY,UGDS_NRA,PCTPELL,PCTFLOAN
STABBR,PREDDEG,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AK,1,5,5,5,5,5,5,5,5,5
AK,2,1,1,1,1,1,1,1,1,1
AK,3,3,3,3,3,3,3,3,3,3
AL,1,31,31,31,31,31,31,31,31,31
AL,2,21,21,21,21,21,21,21,21,21
...,...,...,...,...,...,...,...,...,...,...
WV,2,16,16,16,16,16,16,16,16,16
WV,3,21,21,21,21,21,21,21,21,21
WY,1,2,2,2,2,2,2,2,2,2
WY,2,8,8,8,8,8,8,8,8,8


# Question

Suppose that a college is considered "large" if it has more than 15000 undergrads, "medium" if it has more than 5000 but <= 15000, "small" if it has more than 100 but <= 5000, and "tiny" if it has <= 100 undergrads. Write a function `college_size` which takes in a number of undergrads and returns a string ("tiny", "small", "medium", or "large").

In [69]:
#-
def college_size(n):
    """Classify a college by its number of undergraduates.

    Returns 'tiny' when n <= 100, 'small' when n <= 5,000, 'medium' when
    n <= 15,000, and 'large' for anything that fails all three comparisons
    (which includes NaN).
    """
    # Inclusive upper bounds, checked from smallest to largest.
    size_limits = ((100, 'tiny'), (5_000, 'small'), (15_000, 'medium'))
    for upper_bound, label in size_limits:
        if n <= upper_bound:
            return label
    return 'large'

# College Scorecard with Earnings


In [73]:
# Load the scorecard table that adds financial columns such as MD_EARN_WNE_P10.
# NOTE(review): the -1 values in those columns look like missing-data
# placeholders -- confirm before aggregating over them.
with_earnings = bpd.read_csv('data/csc_financials.txt')
with_earnings

Unnamed: 0,UNITID,INSTNM,CITY,STABBR,PREDDEG,CONTROL,UGDS,RELAFFIL,DISTANCEONLY,UGDS_NRA,PCTPELL,PCTFLOAN,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP,RPY_3YR_RT_SUPP,GT_25K_P6,NPT4
0,100654,Alabama A & M University,Normal,AL,3,1,4616.0,0,0.0,0.0065,0.7039,0.7667,29900,35000.0,0.245850,0.453,15567.0
1,100663,University of Alabama at Birmingham,Birmingham,AL,3,1,12047.0,0,0.0,0.0179,0.3525,0.5179,40200,21500.0,0.519911,0.669,16475.0
2,100690,Amridge University,Montgomery,AL,3,2,293.0,1,1.0,0.0000,0.6971,0.8436,40100,23000.0,0.233100,0.658,10155.0
3,100706,University of Alabama in Huntsville,Huntsville,AL,3,1,6346.0,0,0.0,0.0303,0.2949,0.4312,45600,23500.0,0.549003,0.685,19423.0
4,100724,Alabama State University,Montgomery,AL,3,1,4704.0,0,0.0,0.0159,0.7815,0.8113,26700,32091.0,0.196354,0.393,15037.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6294,489760,Milan Institute-Bakersfield West,Bakersfield,CA,2,3,78.0,0,0.0,0.0000,0.6506,0.6265,-1,-1.0,-1.000000,-1.000,23291.0
6295,489812,Pima Medical Institute-Dillon,Dillon,MT,1,3,35.0,0,0.0,0.0000,0.1053,0.2632,-1,-1.0,-1.000000,-1.000,18671.0
6296,489830,Arthur's Beauty College-Jonesboro,Jonesboro,AR,1,3,71.0,0,0.0,0.0000,0.7525,0.7228,-1,-1.0,-1.000000,-1.000,10263.0
6297,489937,Piedmont International University,Winston-Salem,NC,3,2,295.0,1,0.0,0.0034,0.3634,0.3343,-1,-1.0,-1.000000,-1.000,16669.0


# Question

What is the median earnings amount (the `MD_EARN_WNE_P10` column) for UCSD graduates?

In [81]:
#-
# Only the last expression in a cell is displayed, so a preliminary search for
# the school's exact name (e.g. filtering INSTNM with .str.contains('Diego'))
# has to run in its own cell to be seen. Here the value is looked up directly
# by that exact name.
(
    with_earnings
    .set_index('INSTNM')
    .get('MD_EARN_WNE_P10')
    .loc['University of California-San Diego']
)

58600

# Cards

In [145]:
# Card values (2 through 10, then the face cards and ace) and the four suits.
values = list(range(2, 11)) + ['J', 'Q', 'K', 'A']
suits = ['hearts', 'diamonds', 'clubs', 'spades']

# Build every "<value> of <suit>" string, grouped by value.
ALL_CARDS = []
for value in values:
    for suit in suits:
        card = f'{value} of {suit}'
        ALL_CARDS.append(card)

# Question

Simulate drawing 5 cards *without replacement*

In [147]:
#-
# replace=False: the same card cannot be drawn twice.
np.random.choice(ALL_CARDS, size=5, replace=False)

array(['J of clubs', '6 of hearts', '7 of hearts', '8 of clubs',
       '9 of hearts'], dtype='<U14')

# Question

Simulate drawing 5 cards *with* replacement

In [222]:
#-
# replace=True: a card may be drawn more than once.
np.random.choice(ALL_CARDS, size=5, replace=True)

array(['2 of spades', '9 of spades', 'A of spades', '3 of hearts',
       'A of spades'], dtype='<U14')

# Question

How do we check if a card is a Club?

In [149]:
# A sample card string to experiment with when checking its suit.
card = 'Q of clubs'

# Question


Make a function `number_of_suit(cards, suit)` which, given a list of cards, counts the number of cards matching the suit.

In [221]:
#-
def number_of_suit(cards, suit):
    """Return how many strings in `cards` end with `suit`."""
    return sum(1 for card in cards if card.endswith(suit))

# Question

What is the probability that a hand of 5 cards, drawn *without* replacement, has 2 or more clubs?

1. Figure out how to run one experiment (put it in a function).
2. Run the experiment a bunch of times.
3. Calculate the proportion of times that the thing is true.

### 1. Run one experiment

In [163]:
def experiment():
    """Draw a 5-card hand without replacement and return its number of clubs."""
    return number_of_suit(
        np.random.choice(ALL_CARDS, size=5, replace=False),
        'clubs',
    )

experiment()

1

### 2. Run the experiment a bunch of times

Start by running it 1000 times

In [199]:
times = 1_000

# Run the experiment `times` times. The outcomes are collected in a list and
# converted once, because np.append inside a loop copies the whole array on
# every iteration. dtype=float keeps the same float64 array that appending to
# an empty np.array([]) produces.
results = np.array([experiment() for _ in range(times)], dtype=float)

### 3. Calculate the probability

That is, what proportion of times did we see >= 2 clubs?

In [200]:
# Fraction of the simulated hands whose club count was at least 2.
np.count_nonzero(results >= 2) / times

0.363

# Question

What is the probability that all of the cards are clubs?

In [201]:
#-
# Fraction of the simulated hands in which all 5 cards were clubs. A rare event
# can occur zero times in a finite number of trials, so an estimate of 0.0
# does not by itself mean the event is impossible.
np.count_nonzero(results == 5) / times

0.0

# Question

What is the probability of getting all red cards when drawing 5 cards without replacement?

In [218]:
#-
def all_red(hand):
    """Return True when no card in `hand` ends with 'clubs' or 'spades'."""
    return not any(card.endswith(('clubs', 'spades')) for card in hand)

In [219]:
#-
# NOTE: this redefinition replaces the clubs-counting experiment() defined
# earlier in the notebook.
def experiment():
    """Draw a 5-card hand without replacement and report whether it is all red."""
    return all_red(np.random.choice(ALL_CARDS, size=5, replace=False))

experiment()

False

In [220]:
#-
times = 1_000

# Run the experiment `times` times. The outcomes are collected in a list and
# converted once, because np.append inside a loop copies the whole array on
# every iteration. dtype=float keeps the same float64 array that appending to
# an empty np.array([]) produces, so True/False outcomes are stored as 1.0/0.0.
results = np.array([experiment() for _ in range(times)], dtype=float)