In [3]:
import pandas as pd
import numpy as np
from numpy.random import default_rng
rng = default_rng()

## Series

Given the Series below:

In [89]:
s = pd.Series(np.arange(5),index=list("abcde"))
s

a    0
b    1
c    2
d    3
e    4
dtype: int64

without entering the statements:

- predict the values and the type of object returned for each statement:

In [90]:
s['d']        # 3, int
s['b':'d']    # 1,2,3, int
s[2::2][::-1] # [::-1] = reverse & [2::2] = start at 2 and step by 2
s[['b', 'a']] # df: b:1, a:0, object is int

b    1
a    0
dtype: int64

In [91]:
s['d']

np.int64(3)

In [92]:
s['b':'d'] 

b    1
c    2
d    3
dtype: int64

In [93]:
s[2::2][::-1] 

e    4
c    2
dtype: int64

- predict the contents of `s`, `s1` and `lst`:

In [94]:
lst, idx = np.arange(5), list("abcde")
s = pd.Series(lst,idx)
s[-1:] = 10               # ...
lst[0] = 5                # ...
s1 = pd.Series(s.copy())  # ...
s1[0] = -1                # ...

  s1[0] = -1                # ...


In [95]:
s

a     5
b     1
c     2
d     3
e    10
dtype: int64

In [96]:
s1

a    -1
b     1
c     2
d     3
e    10
dtype: int64

- predict the result of the operations

In [97]:
s1 = pd.Series({'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4})
s2 = pd.Series({'d': 0, 'e': 1, 'f': 2, 'g': 3})

s1 + s2           # a:NAN, b: NAN, c: NaN, d: 3, e: 5, f: NaN, g: NaN
s1[3:] * s2[:-2]  # d: 0, e: 4

d    0
e    4
dtype: int64

In [98]:
s1 + s2   

a    NaN
b    NaN
c    NaN
d    3.0
e    5.0
f    NaN
g    NaN
dtype: float64

## DataFrame

Given the DataFrame `df` below:

In [99]:
rng = default_rng(1234)
df = pd.DataFrame(np.array(rng.standard_normal(25)).reshape(5,5),
             index=[1, 0, 4, 3, 2], columns=list("abcde"))
df

Unnamed: 0,a,b,c,d,e
1,-1.603837,0.0641,0.740891,0.152619,0.863744
0,2.913099,-1.478823,0.945473,-1.666135,0.343745
4,-0.512444,1.323759,-0.86028,0.519493,-1.265144
3,-2.159139,0.434734,1.733289,0.520134,-1.002166
2,0.268346,0.767175,1.191272,-1.157411,0.696279


retrieve:
- 2nd row as a Series
- 3rd row as a DataFrame
- rows on even positions
- rows with even indices
- 3d column
- odd (index) rows and columns 'b' to 'd'

In [100]:
df.loc[0]

a    2.913099
b   -1.478823
c    0.945473
d   -1.666135
e    0.343745
Name: 0, dtype: float64

In [101]:
df.iloc[::2]

Unnamed: 0,a,b,c,d,e
1,-1.603837,0.0641,0.740891,0.152619,0.863744
4,-0.512444,1.323759,-0.86028,0.519493,-1.265144
2,0.268346,0.767175,1.191272,-1.157411,0.696279


In [102]:
df[df.index % 2 == 0]

Unnamed: 0,a,b,c,d,e
0,2.913099,-1.478823,0.945473,-1.666135,0.343745
4,-0.512444,1.323759,-0.86028,0.519493,-1.265144
2,0.268346,0.767175,1.191272,-1.157411,0.696279


In [103]:
df[["c"]]

Unnamed: 0,c
1,0.740891
0,0.945473
4,-0.86028
3,1.733289
2,1.191272


In [104]:
df.iloc[1::2, df.columns.get_loc('b'):df.columns.get_loc('d') + 1]

Unnamed: 0,b,c,d
0,-1.478823,0.945473,-1.666135
3,0.434734,1.733289,0.520134


### Merge DataFrames

Given `df1`, `df2` and `df3` apply the following:

- merge df1 and df2 side by side
- merge df1 and df3 stacked
- merge all and reset index

In [105]:
df1 = pd.DataFrame({'name': ['ants', 'bees','wasps'] , 'order':['Hymenoptera']*3})
df2 = pd.DataFrame({'name': ['beetles', 'weevils'] , 'order':['Coleoptera']*2})
df3 = pd.DataFrame({'name': ['butterflies', 'moths'], 'order':['Lepidoptera']*2 })

In [106]:
pd.concat([df1, df2], axis = 1, join = 'outer')

Unnamed: 0,name,order,name.1,order.1
0,ants,Hymenoptera,beetles,Coleoptera
1,bees,Hymenoptera,weevils,Coleoptera
2,wasps,Hymenoptera,,


In [107]:
pd.concat([df1, df2], axis=0, join ='outer')

Unnamed: 0,name,order
0,ants,Hymenoptera
1,bees,Hymenoptera
2,wasps,Hymenoptera
0,beetles,Coleoptera
1,weevils,Coleoptera


In [108]:
pd.concat([df1, df2, df3], axis=0, join='inner').reset_index(drop=True)

Unnamed: 0,name,order
0,ants,Hymenoptera
1,bees,Hymenoptera
2,wasps,Hymenoptera
3,beetles,Coleoptera
4,weevils,Coleoptera
5,butterflies,Lepidoptera
6,moths,Lepidoptera


### Missing values

Given the following DataFrame

In [109]:
df = pd.DataFrame(np.arange(25).reshape(5,5))
df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


set the values to NaN as such to reproduce the following DataFrame:

In [4]:
df = pd.DataFrame({"0":[np.nan, 5, np.nan, 15, 20], "1":[1, 6, np.nan, 16, 21], "2":[np.nan, 7, np.nan, 17, np.nan], "3":[3, 8, np.nan, 17, np.nan], "4":[np.nan, np.nan, np.nan, np.nan, np.nan]})
df

Unnamed: 0,0,1,2,3,4
0,,1.0,,3.0,
1,5.0,6.0,7.0,8.0,
2,,,,,
3,15.0,16.0,17.0,17.0,
4,20.0,21.0,,,


Apply the following on the dataframe with missing values created in the previous step.

Drop missing:
- rows with missing values
- columns with missing values
- rows where all values are missing
- columns where all values are missing

Fill missing:
- with 0
- with mean based on column values
- with median based on row values

In [111]:
df.isna()
df.dropna(axis=0)

Unnamed: 0,0,1,2,3,4


In [5]:
df.dropna(axis=1)

0
1
2
3
4


In [86]:
df.dropna(axis=0, how="all")

Unnamed: 0,0,1,2,3,4
0,,1.0,,3.0,
1,5.0,6.0,7.0,8.0,
3,15.0,16.0,17.0,17.0,
4,20.0,21.0,,,


In [87]:
df.dropna(axis=1, how='all')

Unnamed: 0,0,1,2,3
0,,1.0,,3.0
1,5.0,6.0,7.0,8.0
2,,,,
3,15.0,16.0,17.0,17.0
4,20.0,21.0,,


In [6]:
df.fillna(0)

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,3.0,0.0
1,5.0,6.0,7.0,8.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,15.0,16.0,17.0,17.0,0.0
4,20.0,21.0,0.0,0.0,0.0


In [21]:
df.fillna(df.mean(numeric_only=True))

Unnamed: 0,0,1,2,3,4
0,13.333333,1.0,12.0,3.0,
1,5.0,6.0,7.0,8.0,
2,13.333333,11.0,12.0,9.333333,
3,15.0,16.0,17.0,17.0,
4,20.0,21.0,12.0,9.333333,


In [22]:
df.fillna(df.median(numeric_only=True, axis=0))

Unnamed: 0,0,1,2,3,4
0,15.0,1.0,12.0,3.0,
1,5.0,6.0,7.0,8.0,
2,15.0,11.0,12.0,8.0,
3,15.0,16.0,17.0,17.0,
4,20.0,21.0,12.0,8.0,


### Natural gas consumption in the Netherlands

The dataset can be downloaded from [CBS Open data StatLine](https://opendata.cbs.nl/statline/portal.html?_la=en&_catalog=CBS). A version is already included in the data directory of this session's git repository. We will be using this dataset in the exercises to prepare for visualisation later on in the course.

We first read the data with `pd.read_csv`. Here we only select the columns `Periods` and `TotalSupply_1`:

In [23]:
cbs = pd.read_csv("data/00372eng_UntypedDataSet_17032023_161051.csv",sep=";")
df0 = cbs[['Periods','TotalSupply_1']].copy()

The column `Periods`has the year (yyyy) followed by a tag {JJ,KW,MM} representing the yearly, quarterly and monthly terms respectively, and finally ending with two digits `00..12`. The two digit followed by the tags have different meaning per tag. For JJ it is always `00`, MM with `00..12` for 12 months and `KW`  with  `01..04` for four quarters. The column `TotaalAanbod_1` holds the natural gas consumption (MCM).

In order to get more control over the date ranges we will need to split the string based on the pattern `YYYY{MM,KW,JJ}{00,...,12}`. The Series class has a comprehensive set of submodules, one of which being `pandas.Series.str` with the method `split`. The `split` method takes a [regular expression](https://docs.python.org/3/library/re.html) describing the pattern and  splits the string based on the pattern. Regular expressions fall beyond the scope of this course, therefore the solution is given here for the exercise.

In [487]:
df = df0.Periods.str.split(r'(JJ|MM|KW)', regex=True, expand=True)  # expand=True forces the result into
                                                                        # a DataFrame
df = pd.DataFrame({'year': df[0].astype(int),                 # Create DataFrame {year,term,idx}
                        'term': df[1],
                        'idx': df[2].astype(int)})

df = pd.concat([df,cbs[['TotalSupply_1']]],axis=1)
df

Unnamed: 0,year,term,idx,TotalSupply_1
0,1946,JJ,0,0
1,1947,JJ,0,1
2,1948,JJ,0,5
3,1949,JJ,0,7
4,1950,JJ,0,5
...,...,...,...,...
730,2022,MM,12,3812
731,2022,KW,4,8521
732,2022,JJ,0,31227
733,2023,MM,1,3386


1) Write a function given a Series with {year,term,idx} returns a timestamp according to the following specification:

```
JJ : yyyyJJ00 => 31-12-yyyy
KW : yyyyKWmm => where mm in {1,2,3,4}
                 01: 1-1-yyyy to 31-3-yyyy
                 02: 1-4-yyyy to 30-6-yyyy
                 03: 1-7-yyyy to 30-9-yyyy
                 04: 1-10-yyyy to 31-12-yyyy
MM : yyyyMMmm => dd-mm-yyyy where dd is the last day of the month and
                 mm in {1,..,12}
```

2) Create a new DataFrame called `ngc` (natural gas consumption) with three columns {term, date, consumption} :
- term : {JJ,KW,MM}
- date : timestamps as specified in the previous exercise
- consumption: which is `TotalSupply_1` only renamed

In [None]:
# ...

Validate entries in the ngc DataFrame from the previous step:
- whether sum of 3 months consumptions are equal to the corresponding quarterly entries(KW)
- whether sum of 4 quarters addup to the yearly (JJ) entries

In [None]:
# ...