In [5]:
import pandas as pd

In [6]:
import numpy as np

In [7]:
import matplotlib.pyplot as plt

In [9]:
# Following: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dsintro

# Remember: "data alignment is intrinsic.  The link between labels and data will not be broken unless done so explicitly by the user."

## Series
# A one-dimensional labeled array capable of holding any data type.
# The axis labels are collectively referred to as the "index."

# The typical way to create a series is:
# s = pd.Series(data, index=index)

# We'll see three examples, creating a Series from:
# ndarray
# dict
# scalar value

In [13]:
# ndarray example: five draws from the standard normal, labelled 'a' through 'e'.
s1 = pd.Series(
    data=np.random.randn(5),
    index=list("abcde"),
)

In [14]:
# Display s1: index labels on the left, values on the right, dtype on the last line.
s1

a   -0.083664
b   -0.453181
c    2.025443
d   -1.199350
e   -1.857526
dtype: float64

In [12]:
# The labels passed as index= are exposed through the .index attribute.
s1.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [13]:
# No index is passed here, so the labels default to the integers 0..4.
pd.Series(np.random.randn(5))

0    2.017524
1    1.487724
2   -0.356396
3   -0.516028
4   -1.911754
dtype: float64

In [15]:
# dict example:

# NOTE(review): this binds the name `dict`, shadowing the built-in dict type for
# every later cell in the kernel. Consider renaming it together with the cells
# that read it.
dict = {'a': 0., 'b': 1., 'c': 2.}

In [16]:
# The dict keys become the index labels and the dict values become the data.
pd.Series(dict)

a    0.0
b    1.0
c    2.0
dtype: float64

In [17]:
# The result follows the order of the labels given in "index". 'd' is not a key
# of the dict, so its value is NaN.
pd.Series(dict, index=['d', 'c', 'b', 'a'])

d    NaN
c    2.0
b    1.0
a    0.0
dtype: float64

In [10]:
# scalar value example:

# The scalar is repeated once for every label in the index.
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [15]:
# Features by which a Series is like a NumPy ndarray:
# Positional indexing. s1 is labelled with strings, so integer positions go
# through .iloc; a bare s1[0] relies on a deprecated positional fallback.
s1.iloc[0]

-0.083664267981352611

In [18]:
# Show s1 again for reference.
s1

a   -0.083664
b   -0.453181
c    2.025443
d   -1.199350
e   -1.857526
dtype: float64

In [19]:
# Expressions:
# A NumPy function applied to a Series works element-wise and keeps the labels.
np.exp(s1)

a    0.919740
b    0.635603
c    7.579469
d    0.301390
e    0.156058
dtype: float64

In [22]:
# Boolean-mask selection: keep only the entries strictly greater than the median.
s1[s1.gt(s1.median())]

a   -0.083664
c    2.025443
dtype: float64

In [26]:
# Features by which Series is like a dict:
# Look up a single value by its index label.
s1['a']

-0.083664267981352611

In [27]:
# `in` tests whether the label is present in the index.
'e' in s1

True

In [28]:
# 'f' is not one of the index labels, so the membership test is False.
'f' in s1

False

In [30]:
# Looking up a label that is not in the index raises KeyError. The error is
# caught and printed so that "Restart & Run All" continues past this cell
# while still showing what went wrong.
try:
    s1['f']
except KeyError as err:
    print("KeyError:", err)

KeyError: 'f'

In [31]:
# Vector operations:
# Arithmetic between Series is element-wise; here every value is doubled.
s1 + s1

a   -0.167329
b   -0.906362
c    4.050886
d   -2.398700
e   -3.715052
dtype: float64

In [32]:
# "A key difference between Series and ndarray is that 
# operations between Series automatically align the data 
# based on label. Thus, you can write computations without 
# giving consideration to whether the Series involved have the same labels."

# s1[1:] holds labels b..e and s1[:-1] holds a..d. The sum is matched on labels,
# not positions, so only b, c and d have a value on both sides; a and e are NaN.
s1[1:] + s1[:-1]

a         NaN
b   -0.906362
c    4.050886
d   -2.398700
e         NaN
dtype: float64

In [36]:
# Series can also have a 'name' attribute:
s1.name
# No output and no error here: s1 was created without a name, so the attribute
# holds None, and the notebook does not echo a None result.

In [38]:
# Assigning to .name labels the Series in place; the trailing expression echoes it.
s1.name = 'my_little_series'
s1.name

'my_little_series'

In [44]:
# A later assignment simply replaces the previous name.
s1.name = 'new_name'
s1.name

'new_name'

In [45]:
# 
# 
# New data structure:
#
### DataFrame
#
# is "like a spreadsheet, or a SQL table, or a dict of Series objects."

# DataFrames have index (row) and column labels.

# Index and column labels can be "guaranteed" by including them in the argument.

# DataFrames accept, amongst other types, dicts as an argument.

# A dict mapping column label -> Series. The two Series differ in length:
# 'col one' has no entry for 'row four'.
d = {
    'col one': pd.Series(
        data=[1.0, 2.0, 3.0],
        index=['row one', 'row two', 'row three'],
    ),
    'col two': pd.Series(
        data=[1.0, 2.0, 3.0, 4.0],
        index=['row one', 'row two', 'row three', 'row four'],
    ),
}

d

{'col one': row one      1.0
 row two      2.0
 row three    3.0
 dtype: float64, 'col two': row one      1.0
 row two      2.0
 row three    3.0
 row four     4.0
 dtype: float64}

In [47]:
# Build a DataFrame from the dict of Series. 'col one' has no 'row four'
# entry, so that cell is missing (blank in the output below).
df1 = pd.DataFrame(d)
df1

Unnamed: 0,col one,col two
row four,,4.0
row one,1.0,1.0
row three,3.0,3.0
row two,2.0,2.0
