## Definitions

In [2]:
import seaborn as sns
import pandas as pd
import zipfile
import numpy as np

# Shorthand for label-based slicing on a MultiIndex via `.loc[idx[...]]`.
idx = pd.IndexSlice

# Keyword arguments shared by the `.plot(**plotconfig)` calls in this notebook.
plotconfig = {
    'style':'.',
    'grid':True,
    'markersize':5,
    'figsize':(20,6)
}

def load_and_transform_data(zip_file):
    """Load the Covid CSV from a zip archive and split it into tidy pieces.

    Parameters
    ----------
    zip_file
        Path of the zip archive (anything ``zipfile.ZipFile`` accepts) that
        contains ``Covid data/CovidDeaths.csv``.

    Returns
    -------
    tuple
        ``(new_cases, new_deaths, icu_patients, hosp_patients, countries)``.
        The first four are Series sharing a sorted ``(iso_code, date)``
        MultiIndex. ``countries`` is a DataFrame indexed by ``iso_code`` with
        the columns ``continent`` (categorical), ``location`` and
        ``population``, taken from the first row of each country in the file.

    Notes
    -----
    Only rows whose ``iso_code`` is exactly three characters long are kept
    (presumably to drop aggregate rows such as world/continent totals --
    confirm against the dataset).
    """
    country_columns = ['continent', 'location', 'population']
    keep_covid_columns = ['new_cases', 'new_deaths', 'icu_patients', 'hosp_patients']

    with zipfile.ZipFile(zip_file) as z:
        # open the csv file in the dataset
        with z.open("Covid data/CovidDeaths.csv") as f:
            raw = pd.read_csv(f)

    # Parse dates explicitly instead of through read_csv's `date_parser`,
    # which is deprecated in pandas 2.x; this form works on old and new pandas.
    raw['date'] = pd.to_datetime(raw['date'], format="%d-%m-%y")
    covid = raw.set_index(['iso_code', 'date'])

    # Vectorised length check on the index level (missing codes compare False).
    is_country = covid.index.get_level_values('iso_code').str.len() == 3
    covid = covid[is_country]

    # One row per country: the first occurrence of each iso_code in file order.
    first_seen = ~covid.index.get_level_values('iso_code').duplicated()
    countries = covid.loc[first_seen, country_columns].droplevel('date').sort_index()
    countries['continent'] = countries['continent'].astype('category')

    covid = covid[keep_covid_columns].sort_index()
    return covid.new_cases, covid.new_deaths, covid.icu_patients, covid.hosp_patients, countries

# Load every series once; the cells below reuse these objects.
new_cases, new_deaths, icu_patients, hosp_patients, countries = load_and_transform_data('./data/covid.zip')

# Take independent copies: a later cell increments a value of `czech_cases`
# in place, and that write should not reach back into `new_cases`.
czech_cases = new_cases.loc[idx['CZE']].copy()
slovak_cases = new_cases.loc[idx['SVK']].copy()

# Lecture 7 - Pandas

by Vítek Macháček, inspired by Martin Hronec

* Pan(el) Da(ta)
* go-to package for data analysis in Python
* designed for working with "labeled" (relational) data
* developed by Wes McKinney in 2008 while at AQR Capital Management out of the need for a tool to perform quantitative analysis on financial data (convinced AQR to open-source it)

* two primary data structures:
    * *Series* (1D labeled homogeneously-typed array)
    * *DataFrame* (2D labeled, tabular structure with potentially heterogeneously-typed columns)

* built on top of NumPy
* Vast majority of your python data work should start and end in Pandas.
* A huge ecosystem of packages has been built around it

Here are just a few of the things that pandas does well:

* Easy handling of missing data (represented as NaN) in floating point as well as non-floating point data
* Size mutability: columns can be inserted and deleted from DataFrame and higher dimensional objects
* Automatic and explicit data alignment: objects can be explicitly aligned to a set of labels, or the user can simply ignore the labels and let Series, DataFrame, etc. automatically align the data for you in computations
* Powerful, flexible group by functionality to perform split-apply-combine operations on data sets, for both aggregating and transforming data
* Make it easy to convert ragged, differently-indexed data in other Python and NumPy data structures into DataFrame objects
* Intelligent label-based slicing, fancy indexing, and subsetting of large data sets
* Intuitive merging and joining data sets
* Flexible reshaping and pivoting of data sets
* Hierarchical labeling of axes (possible to have multiple labels per tick)
* Robust IO tools for loading data from flat files (CSV and delimited), Excel files, databases, and saving / loading data from the ultrafast HDF5 format
* Time series-specific functionality: date range generation and frequency conversion, moving window statistics, date shifting and lagging.


## `pd.Series` - labelled 1D object

###  Construction

Series can be created using:
* `dict`


```python 
pd.Series({'Vítek':31,'Martin':30, 'Honza':30})
```

* `list`, `tuple` or `np.array`


```python 
pd.Series(np.random.randint(100,size=(10,)),index=[f'Person {i}' for i in range(10)],name='Age')
```

* scalar

```python
pd.Series(np.nan,index=['Vítek','Martin','Honza'],name='NANs')
```

Main attributes:

* `values` - collection of measurements, categories etc.
* `name` - column name
* `dtype` - datatype - `int`,`float`,`pd.Timestamp`,`object`
* `shape`

In [None]:
czech_cases.name

In [None]:
czech_cases.values

In [None]:
czech_cases.index

In [None]:
czech_cases.dtype

In [None]:
czech_cases.shape

### Args / Kwargs

In [None]:
plotconfig

In [None]:
czech_cases.plot(**plotconfig)

In [None]:
czech_cases.add(slovak_cases)# czech_cases + slovak_cases

## Indexing data
### Using `loc` - selecting based on index labels

In [None]:
# Label-based lookup of a single day, written with `.loc` to match this section.
czech_cases.loc['2020-12-24']

In [None]:
czech_cases.loc['2020-09-01':'2020-11-15'].plot()

Values can also be modified in place through `loc`, but do this only in exceptional cases:

In [None]:
czech_cases.loc['2021-03-28'] += 1 

### Using `iloc` - selecting by integer position
* generally not recommended

In [None]:
czech_cases.iloc[0]

In [None]:
czech_cases.iloc[-1]


### subsetting using `mask` - conditional on value of series

In [None]:
# Grey backdrop with every observation; the masked subsets are drawn on top.
ax = czech_cases.plot(color = 'lightgrey',label='other values',legend=True,**plotconfig)
# Combine boolean masks with `&`; each comparison needs its own parentheses.
# The legend label now states the thresholds the mask really uses.
czech_cases[(czech_cases >= 5000) & (czech_cases < 15000)].plot(ax=ax,label='Values between 5000 and 15000',legend=True,**plotconfig)
# `weekday` counts from Monday=0, so 6 is Sunday and 5 is Saturday.
czech_cases[czech_cases.index.weekday == 6].plot(ax=ax,label='Sunday',legend=True,**plotconfig)
czech_cases[czech_cases.index.weekday == 5].plot(ax=ax,label='Saturday',legend=True,**plotconfig)

In [None]:
czech_cases >= 5000

In [None]:
(czech_cases >= 5000) & (czech_cases < 15000)

In [None]:
czech_cases.index.weekday

In [None]:
czech_cases[czech_cases.index.weekday == 6]

## `pd.DataFrame()` - 2D, tabular version of `pd.Series`

In [None]:
slovak_cases

In [None]:
df = pd.DataFrame({'Czechia':czech_cases,'Slovakia':slovak_cases})
df.plot(**plotconfig)


### DataFrame construction from raw data:
#### row-wise: list into pd.DataFrame

Each member of the list is one row

* iterable with values
* pd.Series
* dict

#### column-wise dict into pd.DataFrame

**key** is the column name

**value** is the content (should be iterable)

## Input-Output


In [None]:
df.sample(5)

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.iloc[0]

In [None]:
df['Czechia'] # or df.Czechia

In [None]:
df.index

## Deeper look at the `index`

In [None]:
df.index

In [None]:
df.index.values

In [None]:
df.index.dtype

## Dtypes

* **Numeric** - `int64` and `float64`

* **Datetime** - `datetime64[ns]`
    - Datetimes can be timezone-aware
    - variable precision
    - see `pd.to_datetime`

* **Categorical**
    - set of values specified in advance
    - efficient performance
    - possible to specify ordering (`strongly agree` vs. `agree`)

* **Object** - everything else
    - most often str
    - But also list or virtually anything

* **Custom datatypes**
    - geometry in geopandas


In [None]:
df.columns

## Path to `MultiIndex`

In [None]:
df

In [None]:
new_cases

In [None]:
new_cases.index

In [None]:
new_cases.loc[('CZE','2020-12-24')]

When slicing or selecting multiple labels on a `MultiIndex`, use `idx = pd.IndexSlice` (defined in the Definitions section):

In [None]:
czechoslovak_christmas = new_cases.loc[idx[['CZE','SVK'],'2020-12-24':'2020-12-27']] #
czechoslovak_christmas

## Reshaping and pivoting

https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

### Reshape `pd.Series` into `pd.DataFrame`: `.unstack`

In [None]:
czechoslovak_christmas.unstack()

In [None]:
new_cases.head()

In [None]:
new_cases.unstack('iso_code')

### Stack `pd.DataFrame` to `pd.Series`


In [None]:
df.stack()

## Applying functions

### on `pd.Series`

#### Aggregation
- decreasing dimensionality

In [None]:
czech_cases.mean()

In [None]:
czech_cases.min()

In [None]:
czech_cases.sum()

In [None]:
# Several aggregations at once. String names avoid the deprecation of passing
# NumPy callables to `agg`; the duplicated mean is replaced by the minimum.
czech_cases.agg(['mean', 'max', 'min', 'median', 'std'])

### Transforming
* preserves dimensionality and shape

In [None]:
czech_cases.diff()

In [None]:
czech_cases.apply(np.log)

In [None]:
czech_cases.cumsum()

In [None]:
czech_cases.pct_change()

#### Custom functions

In [None]:
def custom_transforming_function(x):
    """Return the square of `x` shifted down by 25."""
    shift = 25
    squared = x ** 2
    return squared - shift

czech_cases.apply(custom_transforming_function)

In [None]:
czech_cases.apply(lambda x: (x**2) -25)

## Applying functions on `pd.DataFrame`
### Aggregating

In [None]:
df

In [None]:
df.sum()

In [None]:
df.sum(axis=1)

### Custom function on `pd.DataFrame`

In [None]:
def describe_day(row):
    """Summarise one day of case counts as a sentence.

    `row` must expose `Czechia` and `Slovakia` values, and its `name` must be
    a timestamp-like label providing `day_name()`, `month_name()`, `day` and
    `year`.
    """
    day = row.name
    weekday = day.day_name()
    month = day.month_name()
    czech = row.Czechia
    slovak = row.Slovakia
    return f'On {weekday} of {month} {day.day}, {day.year}, there were {czech} cases in Czechia and {slovak} cases in Slovakia'
df.apply(describe_day,axis=1)

In [None]:
def describe_country(col):
    """Summarise one column as a sentence.

    Reports the earliest and latest index labels (formatted as dates) together
    with the column's mean and standard deviation.
    """
    first_day = col.index.min()
    last_day = col.index.max()
    average = col.mean()
    spread = col.std()
    return f'Between {first_day:%Y/%m/%d} and {last_day:%Y/%m/%d} there were on average {average} with standard deviation {spread}'

df.apply(describe_country,axis=0)

### Combine `.apply` with reshaping

In [None]:
cntr_cases = new_cases.unstack('iso_code')
cntr_cases

In [None]:
cntr_cases.median()

In [None]:
cntr_cases.median(axis=1)

In [None]:
# Several row-wise statistics at once. String names replace the NumPy
# callables, which newer pandas deprecates as aggregation arguments.
cntr_cases.apply(['mean', 'median', 'std'], axis=1)

In [None]:
def share_of_missing(col):
    """Return the fraction of entries in `col` that are missing."""
    missing = col.isna().sum()
    total = col.shape[0]
    return missing / total
cntr_cases.apply(share_of_missing).sort_values().plot.bar(figsize=(12,6))

### Row-wise

- "best in covid"

In [None]:
def get_best_in_covid(row):
    """Return the label and value of the largest entry in `row`.

    The result is a Series with the keys ``'best'`` (label of the maximum) and
    ``'value'`` (the maximum itself). For a row with no valid values, both
    entries are NaN.
    """
    top = row.max()
    # `idxmax` on an all-NaN row is deprecated and raises in newer pandas, so
    # only look up the label when a maximum exists.
    label = row.idxmax() if pd.notna(top) else float('nan')
    return pd.Series({'best': label, 'value': top})
cntr_cases.apply(get_best_in_covid,axis=1).best.value_counts().sort_values()

## Group By

**Split-Apply-Combine Logic**

https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html

* Splitting the data into groups based on some criteria.
* Applying a function to each group independently.
* Combining the results into a data structure.


In [None]:
covid = pd.DataFrame({
    'new_cases':new_cases,
    'new_deaths':new_deaths
}).join(countries)
covid

In [None]:
# `covid` also holds the string column `location`; restrict the median to
# numeric columns so the aggregation does not fail on it in pandas >= 2.0.
covid.groupby('continent').median(numeric_only=True)

In [None]:
# Group by continent and calendar month (the `date` index level); the median is
# restricted to numeric columns because `location` holds strings.
covid.groupby(['continent', pd.Grouper(level='date',freq='M')]).median(numeric_only=True)

### Vítek's messy analytics one-liners

In [None]:
covid.groupby(['continent', pd.Grouper(level='date',freq='M')]).new_cases.median().unstack('continent').plot(style='-',figsize=(12,6),title='Median cases in month in continent')

### Group By + Apply

In [None]:
interesting_countries = ['Austria', 'Poland', 'Germany', 'Czechia', 'Slovakia', 'Hungary', 'France', 'Denmark', 'Sweden']

# Copy the filtered rows so the new column is added to an independent frame
# rather than to a slice of `covid` (avoids SettingWithCopyWarning).
some_countries = covid[covid.location.isin(interesting_countries)].copy()
some_countries['deaths_per_case'] = some_countries.new_deaths/some_countries.new_cases
some_countries

In [None]:
some_countries.groupby(['location',pd.Grouper(level='date',freq='M')]).apply(lambda g: g.new_cases.median()/g.population.iloc[0]).unstack('location').plot(title='Median cases per capita in Europe monthly',figsize=(12,6))

In [None]:
some_countries.groupby(['location',pd.Grouper(level='date',freq='M')]).deaths_per_case.median().unstack('location').plot(title='Median deaths per case monthly',figsize=(12,6))

In [None]:
# Median daily cases per capita for every country and weekday. Each group key
# is built from the index tuple; element 1 of that tuple is the date.
sns.heatmap(
    some_countries
    .groupby(['iso_code', lambda key: key[1].weekday()])
    .apply(lambda grp: grp.new_cases.median()/grp.population.iloc[0])
    .unstack(1)
)

## Merging and joining datasets

https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

* `pd.concat` - alignment (along index or columns)
* `pd.merge` - combining data (along columns, by values)
    * `df.join` - merge on index


### Concatenate
![concatenate](./img/concatenate.png)

### Merge
![merge](./img/merge.png)



## Rolling object

In [None]:
# Plot the raw series first, then overlay rolling means on the same axes.
ax = czech_cases.plot(label='original',**plotconfig,legend=True)
# `rolling(n)` builds a window of n consecutive observations; `.mean()` averages each window.
czech_cases.rolling(3).mean().plot(label='3 days rolling',ax=ax,legend=True)
czech_cases.rolling(5).mean().plot(label='5 days rolling',ax=ax,legend=True)
czech_cases.rolling(10).mean().plot(label='10 days rolling',ax=ax,legend=True)