### Preliminary code

In [2]:
from pandas import DataFrame, Series
import numpy as np
from io import StringIO
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import Image
IMG_PATH, DATA_PATH = "./img", "./data"


def describe(a):
    if type(a) is np.ndarray:
        print("data:\n{}\nshape:{}\ndtype:{}\ntype: {}".format(a, a.shape, a.dtype, type(a)))
    elif type(a) is pd.Series:
        print("data:\n{}\nshape:{}\ndtype:{}\nname:{}\nindex-name:{}\nindex-type:{}\ntype:{}".format(a, a.shape, a.dtype, a.name, a.index.name,type(a.index), type(a)))
    elif type(a) is pd.DataFrame:
        print("data:\n{}\nshape:{}\ntype:{}".format(a, a.shape,type(a)))
    else:
        print("{}, type:{}".format(a, type(a)))


hrule = lambda x : "="*x
Hrule = lambda x,y: "="*(x//2)+y+"="*(x//2)
Data = lambda file : os.path.join(DATA_PATH, file)
Img  = lambda img : os.path.join(IMG_PATH, img)

# Data Structures

### Series

One-dimensional array-like object containing an array of data (of any NumPy data type) and an associated array of data labels, called its index.

In [None]:
Series(np.arange(10))

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

The difference with numpy.ndarray is the index could be anything, when not specified is an an array of integer like you would expect from an array but it can be of any type, is very similar to a fixed-length.

In [None]:
Series([1,2,3,4], index=list('abcd'))

a    1
b    2
c    3
d    4
dtype: int64

In fact we can construct it from a python dictionary

In [None]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
Series(sdata)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

But the index is not unique, several values can be associated with the same index.

In [None]:
Series(
    [1,2,3,4,5,6,7],
    index=np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe']),
)

Bob     1
Joe     2
Will    3
Bob     4
Will    5
Joe     6
Joe     7
dtype: int64

The index is ordered by default. However we can also specify the exact order we want.

In [None]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj = Series(sdata, index=states)

California is not included in the original dict sdata. For this reason in the Series object appears a NaN corresponding to the *California* index

---
We can obtain a mask for selecting all the (not)null values from the Series as

In [None]:
obj.isnull() # or pd.isnull(obj)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

Name, index name and index can be altered with assignments

In [None]:
obj.name = "population"
obj.index.name = "states"
obj.index = ['Italy', 'France', 'Germany', 'Spain']
obj

Italy          NaN
France     35000.0
Germany    16000.0
Spain      71000.0
Name: population, dtype: float64

## DataFrame
A DataFrame represents a tabular, spreadsheet-like data structure containing an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.).

A DataFrame has both a row and column index; it can be
seen as a dict of Series sharing the same index.

### Creation

In [None]:
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9]
}
DataFrame(data)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


You can rearrange the columns

In [None]:
DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


Passing a non existent column name will result in NaN values

In [None]:
df = DataFrame(data, columns=['year', 'state', 'pop', 'dept'])

### Access
#### Direct access to columns

In [None]:
df.state

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

In [None]:
df['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

In place column adding

In [None]:
df['density'] = 10 # all rows with same value
df

Unnamed: 0,year,state,pop,dept,density
0,2000,Ohio,1.5,,10
1,2001,Ohio,1.7,,10
2,2002,Ohio,3.6,,10
3,2001,Nevada,2.4,,10
4,2002,Nevada,2.9,,10


And modifying (providing a single value like the former case, or an array of values)

In [None]:
df['dept'] = np.arange(df.shape[0])
df

Unnamed: 0,year,state,pop,dept,density
0,2000,Ohio,1.5,0,10
1,2001,Ohio,1.7,1,10
2,2002,Ohio,3.6,2,10
3,2001,Nevada,2.4,3,10
4,2002,Nevada,2.9,4,10


It can be done even with a Series (but they have to share the same index and therefore the same length)

In [None]:
df['pil'] = Series(np.arange(df.shape[0]))

#### Access with iloc and loc

**iloc** gets rows (or columns) at particular positions in the index (so it takes integers or boolean arrays)


In [None]:
df1 = DataFrame(data, columns=['year', 'state', 'pop'], index=np.flip(np.arange(0, len(next(iter(data)))))) # LOOK: the index is in reverse order
df1

Unnamed: 0,year,state,pop
a,2000,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,3.6
d,2001,Nevada,2.4
e,2002,Nevada,2.9


In [None]:
df.iloc[0]

year     2000
state    Ohio
pop       1.5
Name: 4, dtype: object

In [None]:
df.iloc[1]

year     2001
state    Ohio
pop       1.7
Name: 3, dtype: object

**loc** gets rows (or columns) with particular labels from the index.

In [None]:
df2 = DataFrame(data, columns=['year', 'state', 'pop'], index=[chr(x) for x in range(97, 97+len(next(iter(data))))])
df2

Unnamed: 0,year,state,pop
a,2000,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,3.6
d,2001,Nevada,2.4
e,2002,Nevada,2.9


In [None]:
df2.loc['a']

year     2000
state    Ohio
pop       1.5
Name: a, dtype: object

In [None]:
df2.loc['a']['pop'] # or df2.loc['a', 'pop']

1.5

# Data Operations

## Re-indexing

New object from the old one just changing the index

In [10]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

obj2 = obj.reindex(['a','a', 'b','b', 'd','c','d'])
obj2

a   -5.3
a   -5.3
b    7.2
b    7.2
d    4.5
c    3.6
d    4.5
dtype: float64

For each entry whose corresponding index is not already included in the original frame pandas will create a new entry with a given default value, which is NaN if it is not provided.

In [12]:
obj.reindex(['a', 'z'])


a   -5.3
z    NaN
dtype: float64

In [13]:
obj.reindex(['a', 'z'], fill_value=-1)

a   -5.3
z   -1.0
dtype: float64

Same thing for DataFrames, but it returns a **copy** of the original object. It can reindex rows, columns or both.

In [17]:
df = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California'])
df

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [18]:
df.reindex(['a', 'b', 'c', 'd'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [19]:
df.reindex(columns=['Texas', 'Ohio', 'Ohio'])

Unnamed: 0,Texas,Ohio,Ohio.1
a,1,0,0
c,4,3,3
d,7,6,6


In [20]:
df.reindex(index=['a','a','b'], columns=['Texas', 'Ohio', 'Ohio'])

Unnamed: 0,Texas,Ohio,Ohio.1
a,1.0,0.0,0.0
a,1.0,0.0,0.0
b,,,


Dropping an axis

In [22]:
df.drop(['a', 'c'], axis=0)

Unnamed: 0,Ohio,Texas,California
d,6,7,8


In [23]:
df.drop(['Ohio'], axis=1)

Unnamed: 0,Texas,California
a,1,2
c,4,5
d,7,8


Indexing is very similar to numpy. Every indexing strategy we have seen for numpy also applies in this context.

In [None]:
# TODO

## Arithmetic

Every math based operation works on indexes. It means that pandas performs operations between entries having the same index.

#### Sum

In [24]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [26]:
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


Fill the NaN corresponding to index mismatching

In [29]:
df1.add(df2, fill_value=2, axis=1)

Unnamed: 0,b,c,d,e
Colorado,8.0,9.0,10.0,
Ohio,3.0,3.0,6.0,7.0
Oregon,11.0,,12.0,13.0
Texas,9.0,6.0,12.0,10.0
Utah,2.0,,3.0,4.0


Fill the NaN corresponding to missing columns

In [30]:
tmp_columns = df2.columns.union(df1.columns)
df1.reindex(columns=tmp_columns, fill_value=2)

Unnamed: 0,b,c,d,e
Ohio,0.0,1.0,2.0,2
Texas,3.0,4.0,5.0,2
Colorado,6.0,7.0,8.0,2


### DataFrame + Series
By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame's columns

In [35]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [34]:
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [33]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


### Function application and mapping
NumPy ufuncs (element-wise array methods) work fine with pandas objects.

In [38]:
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,1.080272,-1.261898,1.909509
Ohio,-0.587037,1.531595,0.441623
Texas,0.942455,0.423597,0.422796
Oregon,0.48396,0.338313,0.702462


In [39]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.080272,1.261898,1.909509
Ohio,0.587037,1.531595,0.441623
Texas,0.942455,0.423597,0.422796
Oregon,0.48396,0.338313,0.702462


Define lambda and apply it once to row and once to columns

In [43]:
f = lambda x :  x.max() - x.min()

In [44]:
frame.apply(f, axis=0)

b    1.667310
d    2.793493
e    1.486712
dtype: float64

In [45]:
frame.apply(f, axis=1)

Utah      3.171407
Ohio      2.118632
Texas     0.519659
Oregon    0.364149
dtype: float64

### Sorting and ranking


In [50]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [49]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [51]:
obj.sort_values(ascending=False)

c    3
b    2
a    1
d    0
dtype: int64

We can sort both the index or the values of a dataframe.

In [53]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=list("dabc"))
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [54]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [55]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


#### Unique values

In [59]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
unique_values = obj.unique()
unique_values.sort()
unique_values

array(['a', 'b', 'c', 'd'], dtype=object)

In [57]:
value_count = obj.value_counts(sort=True)
value_count

c    3
a    3
b    2
d    1
Name: count, dtype: int64