# Reading and Visualising Data with Pandas

Tamás Gál (tamas.gal@fau.de)


In [None]:
%matplotlib inline
import pandas as pd

import matplotlib as ml
import sys
# Import pyplot explicitly: `import matplotlib` alone does not load the
# `pyplot` submodule, so `ml.pyplot` only resolves if something else has
# already imported it.
import matplotlib.pyplot as plt

# Default size (width, height) used for every figure in this notebook.
ml.rcParams['figure.figsize'] = (10.0, 5.0)

# Report the versions in use to make the notebook output reproducible.
print("Python version: {0}\n"
      "Pandas version: {1}\n"
      "Matplotlib version: {2}\n"
      .format(sys.version, pd.__version__, ml.__version__))

In [None]:
from IPython.core.magic import register_line_magic

@register_line_magic
def shorterr(line):
    """Evaluate ``line`` and report a raised exception as a single line.

    The text is handed to ``eval``, so it must be a trusted expression.
    On success the evaluated value is returned.  If evaluation raises an
    ``Exception``, ``<ExceptionClass>: <message>`` is printed wrapped in
    ANSI escape codes (red, bold) and ``None`` is returned implicitly.
    """
    try:
        return eval(line)
    except Exception as exc:
        summary = "\x1b[31m\x1b[1m{e.__class__.__name__}: {e}\x1b[0m".format(e=exc)
        print(summary)
    
# Remove the plain function name from the notebook namespace; the
# `@register_line_magic` decorator was already applied when it was defined.
del shorterr

In [None]:
import warnings

# Silence every warning from here on. This was added to hide the noisy
# UserWarnings coming from Jupyter which are not fixed yet.
warnings.filterwarnings(action='ignore')

In [None]:
## If working on Google Colab, uncomment these lines and run the cell

# import requests
# from pathlib import Path
# import shutil
# Path('data').mkdir(exist_ok=True)

# # Use the raw file URLs (the "blob" URLs return an HTML page, not the CSV).
# base_url = 'https://raw.githubusercontent.com/vuillaut/info801/bc5b468f3dcbe6c0b015a6f1302b60c98a773154/pandas/data/'
# for name in ('neutrinos.csv', 'reco.csv'):
#     response = requests.get(base_url + name)
#     response.raise_for_status()
#     Path(name).write_bytes(response.content)  # requests.get() alone does not save a file
#     shutil.move(name, 'data')


## Exercise 1

Use the `pd.read_csv()` function to create a `DataFrame` from the dataset `data/neutrinos.csv`.

### Problems encountered

- the first few lines represent a plain header and need to be skipped
- comments are indicated with `$` at the beginning of the line
- the column separator is `:`
- the decimal delimiter is `,`
- the index column is the first one
- there is a footer to be excluded
- footer exclusion (`skipfooter`) only works with the Python engine (`engine='python'`)

In [None]:
# `skipfooter` is not supported by the default C parser, so request the
# Python engine explicitly instead of relying on pandas' fallback (which
# emits a ParserWarning).
neutrinos = pd.read_csv(
    'data/neutrinos.csv',
    engine='python',
    skiprows=4,      # plain header lines at the top of the file
    comment='$',     # comment lines start with `$`
    delimiter=':',   # column separator
    decimal=',',     # decimal delimiter
    index_col=0,     # the first column is the index
    skipfooter=3,    # footer lines to exclude
)

In [None]:
# Display the freshly parsed DataFrame to inspect the result.
neutrinos

In [None]:
# neutrinos = pd.read_csv('data/neutrinos.csv', skiprows=6, delimiter=':', decimal=',', comment ='$', index_col=0, skipfooter=1)

In [None]:
# Show only the first rows as a quick sanity check of the parsing.
neutrinos.head()

### Check the dtypes to make sure everything is parsed correctly (and is not an `object`-array)

In [None]:
# List the dtype of every column; an `object` dtype means the column was
# not parsed as a numeric type.
neutrinos.dtypes

In [None]:
# Try a direct float conversion of the `bjorkeny` column; the result is only
# displayed, not stored back into the DataFrame.
neutrinos.bjorkeny.astype(float)

In [None]:
# Swap the decimal comma for a dot in the text values, convert them to
# floats and write the column back into the DataFrame.
neutrinos['bjorkeny'] = neutrinos['bjorkeny'].str.replace(',', '.').astype(float)

In [None]:
# Re-check the column dtypes after the `bjorkeny` conversion.
neutrinos.dtypes

## Exercise 2

Create a histogram of the neutrino energies.

In [None]:
# Histogram of the neutrino energies drawn directly with matplotlib;
# the `label` is what the legend shows.
plt.hist(neutrinos.energy, label='energy', color='red', bins=100);
plt.legend()

In [None]:
# Same histogram through the pandas plotting shortcut on the column itself,
# this time with coarser binning.
neutrinos['energy'].hist(legend=True, bins=10);
plt.legend()

## Exercise 3

Use the `pd.read_csv()` function to create a `DataFrame` from the dataset `data/reco.csv`.

In [None]:
# Load the reconstruction dataset, using the first CSV column as the index,
# and display it.
reco = pd.read_csv(
    'data/reco.csv',
    index_col=0,
)
reco

## Exercise 4

Combine the `neutrinos` and `reco` `DataFrame`s using `pd.concat()`.

In [None]:
# Look at the `neutrinos` table once more before combining it with `reco`.
neutrinos

In [None]:
# (rows, columns) of the neutrinos table.
neutrinos.shape

In [None]:
# (rows, columns) of the reco table, for comparison with `neutrinos.shape`.
reco.shape

In [None]:
# Prefix every reco column with `reco_` so that no column name appears twice,
# then place the two tables side by side (concatenation along the columns).
df = pd.concat(
    [neutrinos, reco.add_prefix('reco_')],
    axis='columns',
)

In [None]:
# (rows, columns) of the combined table.
df.shape

In [None]:
# Display the combined DataFrame.
df

### Problems encountered

- need to define the right axis
- identical column names should be avoided

## Exercise 5

Make a scatter plot to visualise the zenith reconstruction quality.



In [None]:
# The `zenith` column that came from the `neutrinos` table.
df.zenith

In [None]:
# The `zenith` column of the `reco` table, renamed by the `reco_` prefix.
df.reco_zenith

In [None]:
# Both zenith columns side by side (a list of names selects a sub-DataFrame).
df[['zenith', 'reco_zenith']]

In [None]:
# `zenith` against `reco_zenith`; the very low alpha keeps dense regions
# readable when many points overlap.
df.plot(kind='scatter', x='zenith', y='reco_zenith', alpha=0.01)
# Red diagonal from (0, 0) to (4, 4): points on it have reco_zenith == zenith.
plt.plot((0, 4), (0, 4), color='red')
plt.xlim(left=-1, right=4)

## Exercise 6


1. Create a histogram of `proba_cscd`

2. Create a histogram of `proba_cscd` for energies between 1 and 5 GeV

3. Create a histogram of the cascade probabilities (__`neutrinos`__ dataset: `proba_cscd` column) for the energy ranges 1-5 GeV, 5-10 GeV, 10-20 GeV and 20-100 GeV.

In [None]:
# The two columns needed for this exercise: cascade probability and energy.
df[['proba_cscd', 'energy']]

In [None]:
# None of the earlier cells shown here imports numpy, so `np` would be
# undefined at this point; import it where it is first needed.
import numpy as np

# Create the `energy_bin` column with every row set to NaN ("no bin assigned
# yet"). Assigning the scalar broadcasts it to all rows directly.
df['energy_bin'] = np.nan

In [None]:
# Label every event with the number of its energy range (in GeV):
# 1: [1, 5], 2: (5, 10], 3: (10, 20], 4: (20, 100].
# `.loc[mask, column]` writes into `df` itself. The chained form
# `df[column][mask] = value` may assign to a temporary copy and leave `df`
# unchanged.
df.loc[(df.energy >= 1) & (df.energy <= 5), 'energy_bin'] = 1
df.loc[(df.energy > 5) & (df.energy <= 10), 'energy_bin'] = 2
df.loc[(df.energy > 10) & (df.energy <= 20), 'energy_bin'] = 3
df.loc[(df.energy > 20) & (df.energy <= 100), 'energy_bin'] = 4

In [None]:
# Display the DataFrame including the `energy_bin` column.
df

In [None]:
# Let pandas do the binning: every energy is mapped to the interval between
# two consecutive edges (1, 5, 10, 20, 100), overwriting `energy_bin`.
df['energy_bin'] = pd.cut(df['energy'], bins=[1, 5, 10, 20, 100])
df

In [None]:
# One `proba_cscd` histogram per `energy_bin` value.
df.hist(column='proba_cscd', by='energy_bin', alpha=0.6, bins=100, figsize=(15, 10));

## Exercise 7

Create a 2D histogram showing the distribution of the `x` and `y` values of the starting positions (`pos_x` and `pos_y`) of the neutrinos, using the `plt.hist2d` method. This is basically a 2D plane of the starting positions.

In [None]:
# 2D histogram of the starting positions, restricted to -200..200 on both
# axes with 100 bins each.
plt.hist2d(df['pos_x'], df['pos_y'], range=((-200, 200), (-200, 200)), bins=100);
# Same scale on both axes.
plt.axis('equal')

In [None]:
# The same starting-position plane as a hexagonal-bin plot via pandas.
df.plot(kind='hexbin', x='pos_x', y='pos_y', gridsize=100)
# Limit the view to -200..200 on both axes and use the same scale for both.
plt.xlim(-200, 200)
plt.ylim(-200, 200)
plt.axis('equal')

## Acknowledgements
![](images/eu_asterics.png)

This tutorial was supported by the H2020-Astronomy ESFRI and Research Infrastructure Cluster (Grant Agreement number: 653477).