In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Group

In [None]:
# Create a small table from scratch: each column is given as a label followed by
# an array holding that column's five values.
cones = Table().with_columns(
    'Flavor', make_array('strawberry', 'chocolate', 'chocolate', 'strawberry', 'chocolate'),
    'Price', make_array(3.55, 4.75, 6.55, 5.25, 5.25),
    'Calories', make_array(200, 345, 230, 500, 245)
)
cones  # last expression of the cell, so the table is displayed

In [None]:
#group by flavor
cones.group('Flavor')

In [None]:
cones.group('Flavor', sum)

In [None]:
# What happens if a column is not numerical?
# A text-valued 'Awards' column is attached for this one expression only (the result
# is not assigned back to cones), so sum is also asked to aggregate strings.
cones.with_column('Awards', make_array('None', 'None', 'Five Stars', 'Four Stars', 'None')).group('Flavor', sum)

In [None]:
cones.group('Flavor', sum)

In [None]:
#could you get the same result using where?

In [None]:
# One answer using where: keep only the chocolate rows, pull out their prices,
# and add them up. This gives a single number for one flavor and one column.
cones.where('Flavor', 'chocolate').column('Price').sum()

In [None]:
#let's try a different function instead of sum. What do you think will happen?
cones.group('Flavor', max)

In [None]:
cones.group('Flavor', np.mean)

In [None]:
# Load the salary data and give the salary column a label that is easier to type.
nba = (
    Table.read_table('nba_salaries.csv')
    .relabeled("'15-'16 SALARY", 'SALARY')
)
nba

In [None]:
# Which team spent the most on its players?
# The whole table is grouped here, so sum is applied to every column other than
# TEAM, not only to SALARY.
nba.group('TEAM', sum)


In [None]:
nba.select('TEAM', 'SALARY').group('TEAM', sum)

In [None]:
# Total salary per team, largest payroll first.
# sort(1, ...) orders by the second column, i.e. the summed salaries.
(
    nba.select('TEAM', 'SALARY')
    .group('TEAM', sum)
    .sort(1, descending=True)
)

In [None]:
# The same per-team totals, sorted from largest to smallest and drawn as a
# horizontal bar chart with one bar per team.
(
    nba.select('TEAM', 'SALARY')
    .group('TEAM', sum)
    .sort(1, descending=True)
    .barh('TEAM')
)

In [None]:
# We could group by the POSITION column as well.
 

## Group with multiple columns

In [None]:
all_cones = Table.read_table('cones.csv')
all_cones

In [None]:
#again, we can use group on the Flavor
all_cones.group('Flavor')

In [None]:
#but we can group by Flavor and Color if we want to
all_cones.group(['Flavor', 'Color'])

In [None]:
#columns can be given as list or array
all_cones.group(make_array('Flavor', 'Color'))

In [None]:
all_cones.group(make_array('Flavor', 'Color'), min)

In [None]:
#Back to NBA. I want to find the most expensive player for each team and position (we have 5 positions in total)
nba

In [None]:
nba.group(make_array('TEAM', 'POSITION'), max)

In [None]:
# Does Walter Tavares make 12 million dollars? What happened?

nba.where('TEAM', 'Atlanta Hawks').where('POSITION', 'C')

In [None]:
nba.select('TEAM', 'POSITION', 'SALARY').group(make_array('TEAM', 'POSITION'), max)

In [None]:
starters = nba.select('TEAM', 'POSITION', 'SALARY').group(make_array('TEAM', 'POSITION'), max)
starters 

## Discussion Question

In [None]:
starters.group('TEAM', max)

In [None]:
starters.drop('POSITION').group('TEAM', max).sort(1, descending = True)

In [None]:
starters.select('TEAM', 'SALARY max').group('TEAM', max).sort(1, descending = True)

## Joining tables

In [None]:
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([
    ['Milk Tea', 'Tea One', 4],
    ['Espresso', 'Nefeli',  2],
    ['Latte',    'Nefeli',  3],
    ['Espresso', "Abe's",   2]
])
drinks

In [None]:
discounts = Table().with_columns(
    'Coupon % off', make_array(25, 50, 5),
    'Location', make_array('Tea One', 'Nefeli', 'Tea One')
)
discounts

In [None]:
# Pair each drink with the coupons offered at its cafe: rows are matched where the
# 'Cafe' value in drinks equals the 'Location' value in discounts.
t = drinks.join('Cafe', discounts, "Location")
t

In [None]:
# Price after the coupon: each drink's price reduced by its coupon percentage.
# The coupon column is looked up by its label rather than by position 3, so the
# expression does not depend on the column order that join happens to produce.
t.column('Price') * (100 - t.column('Coupon % off')) / 100


In [None]:
# Price after the coupon, shown as an extra 'Discounted' column next to the original data.
# The coupon percentage is selected by its label instead of by position 3, so the
# calculation keeps working if the columns of t are ever arranged differently.
t.with_column(
    'Discounted',
    t.column('Price') * (100 - t.column('Coupon % off')) / 100
)


## Random Selection

In [None]:
# A coin with two possible outcomes.
coin = make_array("Heads", "Tails")
np.random.choice(coin)  # no size argument is given, so a single toss is drawn

In [None]:
#biased_coin (not fair)
biased_coin = make_array("Heads", "Heads", "Heads", "Tails")

In [None]:
# Toss the biased coin ten times and keep the outcomes under the name `tosses`;
# the following cell counts the heads in `tosses`, so the result must be stored
# rather than only displayed.
tosses = np.random.choice(biased_coin, 10)
tosses

In [None]:
#now we can check how many heads are there
np.count_nonzero(tosses == "Heads")

In [None]:
#let's have even more tosses
more_tosses = np.random.choice(coin, 100)
np.count_nonzero(more_tosses == "Heads")

## Comparison

In [None]:
# Think of a comparison as a question with a True/False answer, not as a mathematical inequality.
2+2 > 1

In [None]:
# What do you expect to see?
# NOTE(review): this line is presumably invalid on purpose. A single = means assignment,
# and 2+1 is not something that can be assigned to, so Python reports a SyntaxError
# instead of comparing. The next cell uses == for the comparison. Be aware that this
# cell stops a "Run All" at this point.
2+1 = 3

In [None]:
2+1 == 3

In [None]:
# And now? A chained comparison is read pairwise:
# (6 == 12/2) and (12/2 == 4+3). The first part holds, the second does not.
6 == 12/2 == 4+3

In [None]:
# we can compare strings as well
'nighthouse' > 'lighthouse'

In [None]:
#what do you expect to see?
'nighthouse' > 'lighthouse' > 'light'

In [None]:
# Assignment: takes the value on the right and assigns it to the name on the left.
x = 2
y = 3

In [None]:
x > y

In [None]:
# checking if x and y have the same value. Note, it is ==, not =.
x == y

In [None]:
# The opposite check: is x not equal to 4?
x != 4

In [None]:
# Combining comparisons with `and`: the result is True only when both sides are True.
a, b = True, False
a and b

In [None]:
b and b

## Discussion Question

In [None]:
(not True) and True

In [None]:
a = True
b = True
not(( (not a) and b) or ( (not b) or a))

In [None]:
def sign(x):
    """Return "Positive" when x is greater than zero.

    This first version handles no other case: for zero or a negative
    number the result is None.
    """
    label = None
    if x > 0:
        label = "Positive"
    return label

In [None]:
sign(3)

In [None]:
#what will happen?
sign(-3)

In [None]:
def sign(x):
    """Describe the sign of x as "Positive" or "Negative".

    Zero matches neither branch, so for x == 0 the result is None.
    """
    label = None
    if x > 0:
        label = "Positive"
    elif x < 0:
        label = "Negative"
    return label

In [None]:
sign(-3)

In [None]:
def sign(x):
    """Describe the sign of x, with an explicit check for zero.

    Returns "Positive", "Negative", or "Neither positive, nor negative".
    A value for which all three comparisons are false (for example NaN)
    matches no branch and gives None.
    """
    label = None
    if x > 0:
        label = "Positive"
    elif x < 0:
        label = "Negative"
    elif x == 0:
        label = "Neither positive, nor negative"
    return label

In [None]:
sign(3)

In [None]:
sign(0)

In [None]:
#we can simplify the last check a bit:
def sign(x):
    """Describe the sign of x, using else as the catch-all case.

    Returns "Positive" when x > 0 and "Negative" when x < 0. Every other
    value, zero included, gives "Neither positive, nor negative".
    """
    if x > 0:
        label = "Positive"
    elif x < 0:
        label = "Negative"
    else:
        label = "Neither positive, nor negative"
    return label


## Discussion Question

In [None]:
def func(a, b):
    """Return one of three labels depending on a and b.

    The checks run in order and the first one that holds decides the result:
      1. a + b is above 4 and b is positive      -> 'print 1'
      2. a * b is at least 4, or b is negative   -> 'print 2'
      3. anything else                           -> 'print 3'
    """
    if a + b > 4 and b > 0:
        return 'print 1'
    if a * b >= 4 or b < 0:
        return 'print 2'
    return 'print 3'

In [None]:
func(2, 2)

In [None]:
# Develop a function that:
# takes a 2-element array (say, moon and sun)
# also takes another value (should be one of the array entries, either moon or sun)
# if this value matches the first array entry, returns the second array entry, and vice versa

def other_one(array2, x):
    """Return the entry of a two-entry array that is not x.

    array2 must provide .item(i), as NumPy arrays do. Only its first two
    entries are read. If x equals the first entry the second is returned,
    if x equals the second entry the first is returned, and if x equals
    neither the string "Invalid input" is returned.
    """
    first, second = array2.item(0), array2.item(1)
    if x == first:
        return second
    if x == second:
        return first
    return "Invalid input"

In [None]:
other_one(make_array(4,5), 4)

## Iterative statements

In [None]:
# The loop variable's name (i, here) is up to you, but i is used very often, as in i(tem).
for i in np.arange(5):
    print(i)

In [None]:
for i in np.arange(10):
    print(np.random.choice(coin))

In [None]:
# Goal: save the outputs of repeated steps in an array.
# Plan: start with an empty array and add one new output each time.
# First, practice with np.append on its own.
a = make_array("Goofy", "Donald", "Mickey")  # note: rebinds `a`, which held True in an earlier cell
np.append(a, "Minnie")  # the result is only displayed; it is not stored and `a` is not reassigned



In [None]:
# To keep the appended value, assign the result of np.append back to the same name.
disney = make_array()  # start from an empty array
disney = np.append(disney, "Goofy")
disney

In [None]:
#add the rest of them

disney = make_array()
disney = np.append(disney, "Goofy")
disney = np.append(disney, "Donald")
disney = np.append(disney, "Mickey")
disney = np.append(disney, "Minnie")
disney

In [None]:
# Now use a for loop to simulate five tosses of a coin
# and collect the results in an array.

tosses = make_array()  # start with no recorded tosses

for i in np.arange(5):
    # one toss per pass; the appended result is assigned back so the new toss is kept
    tosses = np.append(tosses, np.random.choice(coin))

tosses

In [None]:
tosses = make_array()

for i in np.arange(1000):
    tosses = np.append(tosses, np.random.choice(coin))

np.count_nonzero(tosses == 'Heads')


In [None]:
# Example: repeat an experiment 10000 times.
#   - each repetition tosses the coin 100 times
#   - record how many of those 100 tosses came up heads

N = 10000  # number of repetitions

head_count = make_array()  # starts empty; gains one entry (a heads count) per repetition

for i in np.arange(N):   # one pass per repetition, N passes in total
    tosses = np.random.choice(coin, 100)  # 100 fresh tosses; replaces whatever `tosses` held before
    head_count = np.append(head_count, np.count_nonzero(tosses == 'Heads'))  # count the heads and record the count

head_count

In [None]:
#Create a table and then plot the results

results = Table().with_columns(
    'Repetition', np.arange(1, N+1),
    'Number of Heads', head_count
)

results

In [None]:
# Histogram of the heads counts. The bin edges run 30.5, 31.5, ..., 70.5, so every
# whole number from 31 to 70 sits in the middle of its own bin of width 1.
results.select('Number of Heads').hist(bins=np.arange(30.5, 70.6, 1))
