# Import pandas and read csv file into a Pandas table (DataFrame)

In [None]:
import pandas as pd

In [None]:
penguins = pd.read_csv('palmer_penguins.csv')
penguins.head()

In [None]:
penguins.shape

# Arrays and pd.Series

In [None]:
type(penguins['Species'])

In [None]:
type(penguins.Island)

In [None]:
penguins.Island.unique()

# Individual indexing

In [None]:
penguins.loc[2]

In [None]:
penguins.iloc[2]

In [None]:
penguins.drop(index=[2, 3])

# Review - comparisons

In [None]:
x = 2

In [None]:

(x > 0 or x < 10) and not ( x == 2 )

In [None]:
( x > 0 and x < 10) or not (x == 2)

# Boolean indexing

In [None]:
# Without boolean indexing, the same filter needs an explicit loop:
# visit every row, test the condition, and append matches one at a time.
df = pd.DataFrame(columns=penguins.columns)

for row_num, row in penguins.iterrows():
    # A missing culmen length compares as False, so such rows are not copied.
    if row['Culmen Length (mm)'] < 40:
        # Appending at label len(df) renumbers the kept rows 0, 1, 2, ...
        df.loc[len(df)] = row
df

In [None]:
culm = (penguins['Culmen Length (mm)'] < 40)
culm

In [None]:
# The parentheses around the comparison in the previous cell are optional;
# they are often kept purely for readability.
# This cell builds the same boolean mask without them.
culm = penguins['Culmen Length (mm)'] < 40
culm

In [None]:
type(culm)

In [None]:
penguins[culm]

In [None]:
sum(culm)

In [None]:
torg = (penguins['Island'] == 'Torgersen')
torg

In [None]:
sum(torg)

In [None]:
penguins[torg].head()

In [None]:
sum(culm & torg)

In [None]:
culm_and_torg = penguins[culm & torg]

In [None]:
culm_and_torg.head()

In [None]:
culm_and_torg.iloc[2] # position-based: 3rd row from the top of the filtered table, whatever its label

In [None]:
culm_and_torg.loc[2] # label-based: looks for the original row label 2; raises KeyError if that row was filtered out

In [None]:
sum(culm | torg)

In [None]:
penguins[culm | torg]

In [None]:
chonk = (penguins['Body Mass (g)'] > 5000)
sum(chonk)

In [None]:
# `&` binds tighter than `|`, so without parentheses this is (culm & torg) | chonk
sum(culm & torg | chonk)

In [None]:
sum((culm & torg) | chonk)

In [None]:
sum(culm & (torg | chonk))

In [None]:
sum(culm & torg & chonk)

# Groupby

In [None]:
nba = pd.read_csv('nba_salaries.csv')
nba

In [None]:
# The manual alternative to groupby: build one boolean mask per position
# and average the salaries of the rows that mask selects.
for pos in nba['position'].unique():
    pos_bool = nba['position'] == pos
    print(pos, nba.loc[pos_bool, 'salary'].mean())

In [None]:
# easy living with groupby for easy/common operations like mean
nba.groupby('position').salary.mean()

In [None]:
# you can add even more stuff
nba.groupby('position').salary.mean().round()

In [None]:
# for loop + group by
for pos, sub_nba in nba.groupby('position'):
    print(pos, sub_nba.salary.mean())

# Sort

In [None]:
nba.sort_values(by='salary')

In [None]:
nba.sort_values(by='salary', ascending=False)

In [None]:
nba.sort_values(by=['salary', 'season'])

In [None]:
nba.sort_values(by=['season', 'salary'])

# NAs

In [None]:
penguins.isna()

In [None]:
penguins.notna()

In [None]:
penguins.dropna().shape

In [None]:
penguins.dropna(subset=['Culmen Length (mm)', 'Island']).shape

# Discussion questions

## Penguins

a) What is the average `Body Mass (g)` of `Female` penguins on `Dream` island?

b) What percentage of the penguins of the `Adelie` species are `Female`?

c) What is the percentage of chonky penguins per species? Chonky is defined as a `Body Mass (g)` over 5000 g (5 kg).


In [None]:
# (a) Mean body mass of the female penguins living on Dream island.
# Build one mask per condition, combine them, then average the selected rows.
F = penguins['Sex'] == 'FEMALE'
dream = penguins['Island'] == 'Dream'
sub_pens = penguins.loc[F & dream]
sub_pens['Body Mass (g)'].mean()


In [None]:
# (b) Percentage of Adelie penguins that are female.
adelie = (penguins.Species == 'Adelie Penguin (Pygoscelis adeliae)')
# The ratio of the two counts is a fraction; scale by 100 because the question
# asks for a percentage. Rows with a missing `Sex` are not counted as female
# but still contribute to the Adelie total in the denominator.
100 * sum(F & adelie) / sum(adelie)

In [None]:
for species, sub_pens in penguins.groupby('Species'):
    print(species, ':', sub_pens.shape)

In [None]:
# (c) Percentage of chonky penguins (body mass over 5000 g) within each species.
# The loop uses its own names so it does not overwrite the notebook-wide
# `chonk` mask or the `sub_pens` table defined in earlier cells; reusing them
# would leave per-species leftovers behind and break re-running those cells.
for species, species_pens in penguins.groupby('Species'):
    species_chonk = (species_pens['Body Mass (g)'] > 5000)
    # The mean of a boolean mask is the share of True values; format it as a
    # percentage. Rows with a missing body mass count as not chonky.
    print(species, ':', f'{species_chonk.mean():.1%}')

## NBA

a) Add a new column with the salary in millions of dollars, rounded to 2 decimal places.

b) Find the names of all point guards (PG) who made more than $15M

c) After evaluating these expressions in order, what's the result?

```python
nba = pd.read_csv('nba_salaries.csv')
nba.drop(columns='position')
nba.shape
```


In [None]:
# (a) Salary expressed in millions of dollars, rounded to two decimal places.
nba['salary (in M)'] = (nba['salary'] / 1000000).round(2)
nba

In [None]:
# (b) Names of the point guards who earned more than $15M.
PG = (nba.position == 'PG')
# `salary` holds raw dollar amounts (the millions column is derived from it by
# dividing by 1,000,000), so the threshold is 15,000,000 rather than 15, and
# "more than" calls for a strict comparison.
rich = (nba['salary'] > 15_000_000)
rich_PGs = nba[PG & rich]
rich_PGs['name']

In [None]:
nba = pd.read_csv('nba_salaries.csv')
# drop() returns a new DataFrame by default (inplace=False). The result is not
# assigned here, so `nba` itself still contains the 'position' column.
nba.drop(columns='position')
# The shape is therefore that of the freshly loaded file: no column was removed.
nba.shape