In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
idx = pd.IndexSlice

# Keyword arguments shared by the `.plot(**plotconfig)` calls in this notebook.
plotconfig={
    'style':'.',
    'ylim':(0,2000),
    'xlim':('2018-01-01','2021-01-01'),
    'grid':True,
    'markersize':5,
    'figsize':(20,6)
}

# Lookup tables, both indexed by their `name` column.
locations = pd.read_csv('csv/locations.csv',index_col='name')
directions = pd.read_csv('csv/directions.csv',index_col='name')

# Read the detections with a three-level index (location, direction, ts), keep only the
# `value` column as a Series and sort it by index. `parse_dates=True` asks pandas to try
# parsing the index as dates.
detections = pd.read_csv('csv/detections.csv',index_col=['location','direction','ts'],parse_dates=True).value.sort_index()

def series_for_direction(detections,location,direction):
    """Select the observations of one counter direction.

    Parameters
    ----------
    detections : pd.Series
        Series whose index starts with the levels (location, direction).
    location, direction : hashable
        Labels of the first two index levels to select.

    Returns
    -------
    pd.Series
        The selected observations, indexed by the remaining index level(s)
        and named with the ``(location, direction)`` tuple.
    """
    key = (location, direction)
    selected = detections.loc[key]
    return selected.rename(key)

# Two example series from the same location, one per direction.
series = series_for_direction(detections,'Podolské nábřeží - stezka','do centra (Výtoň)')
another = series_for_direction(detections,'Podolské nábřeží - stezka','z centra (Braník)')

# Lecture 7 - Pandas

by Vítek Macháček, inspired by Martin Hronec

* Pan(el) Da(ta)
* go-to package for data analysis in Python
* designed for working with "labeled" (relational) data
* developed by Wes McKinney in 2008 while at AQR Capital Management out of the need for a tool to perform quantitative analysis on financial data (convinced AQR to open-source it)

* two primary data structures:
    * *Series* (1D labeled homogeneously-typed array)
    * *DataFrame* (2D labeled, tabular structure with potentially heterogeneously-typed column)

* built on top of NumPy
* Most of your Python data work will start, and much of it will end, in pandas.
* A huge ecosystem has been built around it

Here are just a few of the things that pandas does well:

* Easy handling of missing data (represented as NaN) in floating point as well as non-floating point data

* Size mutability: columns can be inserted and deleted from DataFrame and higher dimensional objects

* Automatic and explicit data alignment: objects can be explicitly aligned to a set of labels, or the user can simply ignore the labels and let Series, DataFrame, etc. automatically align the data for you in computations

* Powerful, flexible group by functionality to perform split-apply-combine operations on data sets, for both aggregating and transforming data

* Make it easy to convert ragged, differently-indexed data in other Python and NumPy data structures into DataFrame objects

* Intelligent label-based slicing, fancy indexing, and subsetting of large data sets

* Intuitive merging and joining data sets

* Flexible reshaping and pivoting of data sets

* Hierarchical labeling of axes (possible to have multiple labels per tick)

* Robust IO tools for loading data from flat files (CSV and delimited), Excel files, databases, and saving / loading data from the ultrafast HDF5 format

* Time series-specific functionality: date range generation and frequency conversion, moving window statistics, date shifting and lagging.



## `pd.Series` - labelled 1D object

###  Construction

Series can be created using:
* `dict`


```python 
pd.Series({'Vítek':31,'Martin':30, 'Honza':30})
```

* `list`, `tuple` or `np.array`


```python 
pd.Series(np.random.randint(100,size=(10,)),index=[f'Person {i}' for i in range(10)],name='Age')
```

* scalar

```python
pd.Series(np.nan,index=['Vítek','Martin','Honza'],name='NANs')
```

Main features:

* `values` - collection of measurements, categories etc.
* `index` - labels of the individual values
* `name` - column name
* `dtype` - datatype - `int`,`float`,`datetime64[ns]`,`object`
* `shape`

In [None]:
series

`name`

In [None]:
series.name

`values`

In [None]:
series.values

`index`

In [None]:
series.index

`dtype`

In [None]:
series.dtype

`shape`

In [None]:
series.shape

In [None]:
series.plot(**plotconfig);

In [None]:
series.median()

## Indexing data
### Using `loc` - select using `index`

In [None]:
series.loc['2020-12-24']

In [None]:
series.loc[['2019-01-01','2019-12-31']]

In [None]:
series.loc[[f'{year}-{month:02}-01' for year in range(2018,2021) for month in range(1,13)]]

In [None]:
series.loc['2020-03-01':'2020-06-01']

In [None]:
series.loc[pd.Timestamp('2021-03-28')] = 450

### Using `iloc` - select by integer position in the series. Try to avoid it.

In [None]:
series.iloc[0]

In [None]:
series.iloc[-1]

### subsetting using `mask` - conditional on value of series

In [None]:
series

In [None]:
series.index

In [None]:
# Base layer: every observation in light grey.
ax = series.plot(color = 'lightgrey',label='other values',legend=True,**plotconfig)
# Overlay the observations selected by a boolean mask on the values (500 <= value < 750).
series[(series >= 500) & (series < 750)].plot(ax=ax,label='Values between 500 and 750',legend=True,**plotconfig)
# Overlay the observations selected by a mask on the index; weekday 6 is Sunday (Monday is 0).
series.loc[series.index.weekday == 6].plot(ax=ax,label='Sunday observations',legend=True,**plotconfig)

In [None]:
series>=500

In [None]:
(series >= 500) & (series < 750)

In [None]:
series.index.weekday

In [None]:
series.index.weekday == 6

## `pd.DataFrame()` - 2D, more `pd.Series`

In [None]:
# A list of Series is read row-wise (one row per Series), so transpose to get one column per Series.
df = pd.DataFrame([series,another]).T
# The column labels are the Series names, i.e. (location, direction) tuples; keep only the direction.
df.columns = [col[1] for col in df.columns]
df.plot(**plotconfig)

In [None]:
df.head()

## DataFrame construction from raw data:

### row-wise: `list` into `pd.DataFrame`
Each member of the list is one row

* iterable with values
* `pd.Series`
* `dict`

### column-wise `dict` into `pd.DataFrame`
**key** is the column name

**value** is the content

expected types of the values:
* `list`
* `pd.Series`
* `1D np.array` etc.

In [None]:
df.tail()

In [None]:
df.shape

In [None]:
df.index

In [None]:
df.columns

In [None]:
df.dtypes

Rows are `pd.Series`

In [None]:
df.iloc[0]

as well as columns

In [None]:
df['do centra (Výtoň)']

## Deeper look at the index

In [None]:
df.index

In [None]:
df.index.values

In [None]:
df.index.dtype

## Dtypes

* Numeric - `int64` and `float64`
* Datetime - `datetime64[ns]`
    * Datetimes can be timezone-aware
    * variable precision

    * see `pd.to_datetime`
* Object - everything else
    * most often `str`
    * But also `list` or virtually anything

* Custom specialized datatypes 
    * `geometry` in `geopandas`

## Path to `MultiIndex`
https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html

In [None]:
df

In [None]:
detections

In [None]:
detections.index

In [None]:
detections.loc[('Anděl (Plzeňská)', 'Plzeňská (z centra)', '2019-01-20')]

use `pd.IndexSlice` to slice multiindexes

In [None]:
idx = pd.IndexSlice
detections.loc[idx[:,:, '2020-03-01':'2020-06-01']]

## Reshaping and pivoting

https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

### Reshape `Series` into `DataFrame`: `unstack`

In [None]:
detections.unstack(['location','direction'])

### `stack` to `Series`

Columns are index-like 

In [None]:
df.stack()

## Apply functions on `pd` objects

### Column-wise

In [None]:
detcols = detections.unstack(['location','direction'])
detcols.head()

In [None]:
detcols.median()

In [None]:
detcols.apply(np.mean)

In [None]:
def share_of_missing(col):
    """Return the fraction of missing (NaN) entries in ``col``.

    Parameters
    ----------
    col : pd.Series
        Column to inspect.

    Returns
    -------
    float
        Number of missing entries divided by the total number of entries.
        An empty column yields NaN instead of raising ``ZeroDivisionError``.
    """
    # The mean of a boolean mask is the share of True values, so no filtered
    # copy of the column is needed just to count rows.
    return col.isna().mean()
detcols.apply(share_of_missing)

### Row-wise

In [None]:
def get_max_counter(row):
    """Return the (location, direction) label of the largest value in ``row``.

    Parameters
    ----------
    row : pd.Series
        One row whose labels are (location, direction) tuples.

    Returns
    -------
    pd.Series
        Series with the entries ``location`` and ``direction`` of the label
        holding the maximum. Both entries are NaN when the row has no valid
        value (empty or all NaN).
    """
    # idxmax has no label to return for a row without valid values; depending
    # on the pandas version it yields NaN or raises, so handle that case here.
    if row.isna().all():
        return pd.Series({'location':np.nan,'direction':np.nan})
    recordman = row.idxmax()
    return pd.Series({'location':recordman[0],'direction':recordman[1]})
detcols.apply(get_max_counter,axis=1)

## Group By

**Split-Apply-Combine Logic**

https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html


* Splitting the data into groups based on some criteria.

* Applying a function to each group independently.

* Combining the results into a data structure.



In [None]:
detections

In [None]:
detections.groupby(['location','direction']).median()

In [None]:
# Median per location and 1-week bin of the `ts` index level, reshaped to one column
# per location and plotted with dashed lines.
detections.groupby(['location',pd.Grouper(level='ts',freq='1W')]).median().unstack('location').plot(ls='--',legend=False,**plotconfig);

In [None]:
# The lambda is called with each index entry, a (location, direction, ts) tuple, so idx[2]
# is the timestamp; dayofweek numbers Monday as 0. unstack(0) moves `location` into the
# columns, which are then ordered by their value in row 0 (Monday).
detections.groupby(['location',lambda idx:idx[2].dayofweek]).median().unstack(0).sort_values(by=0,axis=1)

In [None]:
# Median per location and day of week (0 = Monday), one column per location with the
# columns ordered by their Monday value, drawn as a bar chart.
detections.groupby(['location',lambda idx:idx[2].dayofweek]).median().unstack(0).sort_values(by=0,axis=1).plot.bar(legend=False,figsize=(20,6),title='Median daily count across day ');

In [None]:
# Median over all counters per (year, week of year), one line per year,
# restricted to observations from March to October.
(
    detections[detections.index.get_level_values('ts').month.isin([3,4,5,6,7,8,9,10])]
    .dropna()
    # both group keys are computed from each (location, direction, ts) index tuple
    .groupby([lambda idx: idx[2].year,lambda idx: idx[2].week])
    .median()
    # move the first group key (the year) into the columns
    .unstack(0)
    .plot(
        title='Mediánový počet průjezdů v daném týdnu v roce (pouze březen - říjen)',
        figsize=(20,6),
        grid=True,
    )
)

### Group by and Apply - Number of missing values per group and per month

In [None]:
def share_of_missing_values(g):
    """Return the fraction of missing (NaN) entries in the group ``g``.

    Parameters
    ----------
    g : pd.Series
        Values of one group.

    Returns
    -------
    float
        Number of missing entries divided by the group size. An empty group
        yields NaN instead of raising ``ZeroDivisionError``.
    """
    # The mean of the boolean NaN mask is the share of missing values; this
    # avoids copying the group with dropna() only to count its rows.
    return g.isna().mean()

detections.groupby(['location']).apply(share_of_missing_values).sort_values().plot.bar(figsize=(20,6),grid=True)

In [None]:
# Share of missing values per location and monthly bin of `ts`;
# after the unstack, rows are months and columns are locations.
missing = detections.groupby(['location',pd.Grouper(level='ts',freq='1M')]).apply(share_of_missing_values).unstack('location')

fig, ax = plt.subplots(1,1,figsize=(10,10))
# Draw the table as a heatmap.
img = ax.matshow(missing,cmap='Reds')

# One tick per row, labelled with the month formatted as YYYY-MM.
ax.set_yticks(range(len(missing.index)))
ax.set_yticklabels(missing.index.strftime('%Y-%m'))

# One tick per column, labelled with the location name rotated by 90 degrees.
ax.set_xticks(range(len(missing.columns)))
ax.set_xticklabels(missing.columns,rotation=90)
fig.colorbar(img)
fig.tight_layout()

## Merging and joining datasets
https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

* `pd.concat` - vertical alignment (along index)
* `pd.merge` - horizontal alignment (along columns, by key columns)
    * `df.join` - merge on index


### Concatenate

![Concatenating schema](./img/concat.png)

### Merge
![Merging schema](./img/merge.png)

## Join

## Time-series 

In [None]:
# Relative change between consecutive observations, computed separately for every
# (location, direction) counter and reshaped to one column per counter.
# The built-in GroupBy.pct_change keeps the original (location, direction, ts) index.
# groupby(...).apply(lambda g: g.pct_change()) prepends the group keys on pandas >= 2.0,
# which duplicates the `location`/`direction` level names and makes the unstack fail.
detections.groupby(['location','direction']).pct_change().unstack(['location','direction'])

## Rolling

## Input-Output

In [None]:
?pd.read_csv

In [None]:
?pd.read_*

## Plotting with Matplotlib