# Part 1 -- Intro


Load the tips dataset from seaborn


In [5]:
import seaborn as sns # type: ignore
import numpy as np

tips = sns.load_dataset('tips')
tips


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


### Filter rows by `smoker == 'No'` and `total_bill >= 10`


In [6]:
tips.loc[(tips['smoker']== 'No') & (tips['total_bill']>= 10)]


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
235,10.07,1.25,Male,No,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
242,17.82,1.75,Male,No,Sat,Dinner,2


### What is the average `total_bill` for each value of `smoker`, `day`, and `time`


In [7]:
tips.groupby(['smoker','day','time'])['total_bill'].agg(np.mean)


  tips.groupby(['smoker','day','time'])['total_bill'].agg(np.mean)
  tips.groupby(['smoker','day','time'])['total_bill'].agg(np.mean)


smoker  day   time  
Yes     Thur  Lunch     19.190588
              Dinner          NaN
        Fri   Lunch     12.323333
              Dinner    19.806667
        Sat   Lunch           NaN
              Dinner    21.276667
        Sun   Lunch           NaN
              Dinner    24.120000
No      Thur  Lunch     17.075227
              Dinner    18.780000
        Fri   Lunch     15.980000
              Dinner    19.233333
        Sat   Lunch           NaN
              Dinner    19.661778
        Sun   Lunch           NaN
              Dinner    20.506667
Name: total_bill, dtype: float64

# Part 2 -- Tidy


Taken from the r4ds "Tidy Data" Chapter: https://r4ds.had.co.nz/exploratory-data-analysis.html


In [8]:
import pandas as pd


In [9]:
tbl1 = pd.read_csv('./data/table1.csv')


In [10]:
tbl2 = pd.read_csv('./data/table2.csv')


In [11]:
tbl3 = pd.read_csv('./data/table3.csv')


In [12]:
tbl1


Unnamed: 0,country,year,cases,population
0,Afghanistan,1999,745,19987071
1,Afghanistan,2000,2666,20595360
2,Brazil,1999,37737,172006362
3,Brazil,2000,80488,174504898
4,China,1999,212258,1272915272
5,China,2000,213766,1280428583


### Tidy the `tbl2` dataset


In [13]:
tbl2


Unnamed: 0,country,year,type,count
0,Afghanistan,1999,cases,745
1,Afghanistan,1999,population,19987071
2,Afghanistan,2000,cases,2666
3,Afghanistan,2000,population,20595360
4,Brazil,1999,cases,37737
5,Brazil,1999,population,172006362
6,Brazil,2000,cases,80488
7,Brazil,2000,population,174504898
8,China,1999,cases,212258
9,China,1999,population,1272915272


In [14]:
tbl2.pivot_table(index=['country','year'],
                 columns='type',
                 values='count').reset_index()


type,country,year,cases,population
0,Afghanistan,1999,745.0,19987070.0
1,Afghanistan,2000,2666.0,20595360.0
2,Brazil,1999,37737.0,172006400.0
3,Brazil,2000,80488.0,174504900.0
4,China,1999,212258.0,1272915000.0
5,China,2000,213766.0,1280429000.0


### Tidy the `tbl3` dataset


In [15]:
# just give me the population
tbl3


Unnamed: 0,country,year,rate
0,Afghanistan,1999,745/19987071
1,Afghanistan,2000,2666/20595360
2,Brazil,1999,37737/172006362
3,Brazil,2000,80488/174504898
4,China,1999,212258/1272915272
5,China,2000,213766/1280428583


In [16]:
tbl3_split= tbl3['rate'].str.split('/', expand=True)


In [17]:
tbl3[['rate_split', 'population']] = tbl3_split
# tbl3.drop(['number'], axis='columns', inplace=True)
tbl3


Unnamed: 0,country,year,rate,rate_split,population
0,Afghanistan,1999,745/19987071,745,19987071
1,Afghanistan,2000,2666/20595360,2666,20595360
2,Brazil,1999,37737/172006362,37737,172006362
3,Brazil,2000,80488/174504898,80488,174504898
4,China,1999,212258/1272915272,212258,1272915272
5,China,2000,213766/1280428583,213766,1280428583


In [18]:
#another method
pop = tbl3['rate'].str.split('/').str.get(1)
pop


0      19987071
1      20595360
2     172006362
3     174504898
4    1272915272
5    1280428583
Name: rate, dtype: object

# Part 3 -- Apply functions


Look at the `table3` dataset


In [19]:
tbl3 = pd.read_csv('./data/table3.csv')
tbl3


Unnamed: 0,country,year,rate
0,Afghanistan,1999,745/19987071
1,Afghanistan,2000,2666/20595360
2,Brazil,1999,37737/172006362
3,Brazil,2000,80488/174504898
4,China,1999,212258/1272915272
5,China,2000,213766/1280428583


### Write a function that takes a value of `rate` and parses out the total population.


In [20]:
tbl3.dtypes
tbl3['rate']


0         745/19987071
1        2666/20595360
2      37737/172006362
3      80488/174504898
4    212258/1272915272
5    213766/1280428583
Name: rate, dtype: object

In [None]:
def extract_pop(rate, delim='/', position=1):
    return int(rate.split(delim)[position])


In [27]:
assert extract_pop('123/546678') == 546678


In [25]:
tbl3['rate'].apply(extract_pop)


0      19987071
1      20595360
2     172006362
3     174504898
4    1272915272
5    1280428583
Name: rate, dtype: int64

### Set the population to a new column


In [28]:
tbl3['population'] = tbl3['rate'].apply(extract_pop)
tbl3


Unnamed: 0,country,year,rate,population
0,Afghanistan,1999,745/19987071,19987071
1,Afghanistan,2000,2666/20595360,20595360
2,Brazil,1999,37737/172006362,172006362
3,Brazil,2000,80488/174504898,174504898
4,China,1999,212258/1272915272,1272915272
5,China,2000,213766/1280428583,1280428583


# Part 4 -- Plots


In [None]:
titanic = sns.load_dataset('titanic')
titanic.head()


### Create a figure with 2 axes

### distplot of `fare` in one axes

### boxplot of `class` and `fare` on the other axes


In [110]:
import matplotlib.pyplot as plt # type: ignore
import seaborn as sns # type: ignore


# Part 5 -- Models


In [None]:
titanic = sns.load_dataset('titanic')
titanic.head()


In [None]:
titanic.info()


### Subset `survived`, `class`, `who`


### Create dummy encoded dataset


### Fit a logistic regression on `survived`


In [113]:
# from sklearn.linear_model import LogisticRegression
