In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
import matplotlib.pyplot as plt

In [5]:
# Following: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dsintro

# Remember: "data alignment is intrinsic.  The link between labels and data will not be broken unless done so explicitly by the user."

## Series
# A one-dimensional labeled array capable of holding any data type.
# The axis labels are collectively referred to as the "index".

# The typical way to create a series is:
# s = pd.Series(data, index=index)

# We'll see three examples, creating a Series from:
# ndarray
# dict
# scalar value

In [6]:
# Series from an ndarray: five standard-normal draws labelled 'a' through 'e'.
s1 = pd.Series(
    data=np.random.randn(5),
    index=list('abcde'),
)

In [7]:
# Show the Series: labels on the left, values on the right, dtype on the last line.
s1

a    1.933418
b   -1.358841
c   -0.482009
d    0.983371
e   -0.974146
dtype: float64

In [8]:
# The axis labels are held in an Index object.
s1.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [9]:
pd.Series(np.random.randn(5)) # no index passed, so the labels default to the integers 0..4

0    2.648681
1   -2.170239
2   -1.535222
3    0.384111
4   -0.057328
dtype: float64

In [10]:
# dict example:

label_to_value = {'a': 0., 'b': 1., 'c': 2.}  # named so the built-in `dict` is not shadowed

In [11]:
pd.Series(label_to_value) # here, the index is given the values of the dict keys

a    0.0
b    1.0
c    2.0
dtype: float64

In [12]:
pd.Series(label_to_value, index=['d', 'c', 'b', 'a']) # values are looked up by label, in the order given by "index"; 'd' is not a key, so it becomes NaN

d    NaN
c    2.0
b    1.0
a    0.0
dtype: float64

In [13]:
# scalar value example:

# The scalar is repeated for every label in the index.
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [14]:
# Features by which a Series is like a NumPy ndarray:
# Positional indexing (via .iloc; plain s1[0] on a label-indexed Series is deprecated in pandas 2.x):
s1.iloc[0]

1.933418093655435

In [15]:
# Show s1 again for reference.
s1

a    1.933418
b   -1.358841
c   -0.482009
d    0.983371
e   -0.974146
dtype: float64

In [16]:
# Expressions:
# NumPy functions apply element-wise and keep the index labels.
np.exp(s1)

a    6.913100
b    0.256959
c    0.617541
d    2.673454
e    0.377515
dtype: float64

In [17]:
# Boolean mask: keep only the entries strictly greater than the median.
s1[s1.gt(s1.median())]

a    1.933418
d    0.983371
dtype: float64

In [18]:
# Features by which Series is like a dict:
# Look up a value by its index label.
s1['a']

1.933418093655435

In [19]:
# `in` tests membership among the index labels (like dict keys), not the values.
'e' in s1

True

In [20]:
# 'f' is not one of the labels.
'f' in s1

False

In [21]:
# 'f' is not a label, so a label lookup raises KeyError; catch it so the
# notebook still runs top to bottom while showing the error.
try:
    s1['f']
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'f'

In [22]:
# Vector operations:
# The labels match on both sides, so every value is simply doubled.
s1 + s1

a    3.866836
b   -2.717681
c   -0.964019
d    1.966742
e   -1.948292
dtype: float64

In [23]:
# "A key difference between Series and ndarray is that 
# operations between Series automatically align the data 
# based on label. Thus, you can write computations without 
# giving consideration to whether the Series involved have the same labels."

s1.iloc[1:] + s1.iloc[:-1] # last four + first four: 'b' to 'd' appear in both operands and are summed; 'a' and 'e' appear in only one, so they come out NaN

a         NaN
b   -2.717681
c   -0.964019
d    1.966742
e         NaN
dtype: float64

In [24]:
# Series can also have a 'name' attribute:
s1.name
# No output and no error: s1 was created without a name, so the attribute is None,
# and the notebook does not display a None result.

In [25]:
# Assign a name by setting the attribute, then read it back.
s1.name = 'my_little_series'
s1.name

'my_little_series'

In [26]:
# The name can be reassigned at any time.
s1.name = 'new_name'
s1.name

'new_name'

In [27]:
# 
# 
# New data structure:
#
### DataFrame
#
# is "like a spreadsheet, or a SQL table, or a dict of Series objects."

# DataFrames have index (row) and column labels.

# Index and column labels can be "guaranteed" by including them in the argument.

# DataFrames accept, amongst other types, dicts as an argument.

# Two Series of different lengths: 'col one' has no entry for 'row four'.
d = {
    'col one' : pd.Series([1., 2., 3.], index=['row one', 'row two', 'row three']),
    'col two' : pd.Series([1., 2., 3., 4.], index=['row one', 'row two', 'row three', 'row four'])
}

d

{'col one': row one      1.0
 row two      2.0
 row three    3.0
 dtype: float64, 'col two': row one      1.0
 row two      2.0
 row three    3.0
 row four     4.0
 dtype: float64}

In [28]:
# Rows are the union of both Series' labels; 'col one' has no 'row four', so that cell is NaN.
df1 = pd.DataFrame(d)
df1

Unnamed: 0,col one,col two
row four,,4.0
row one,1.0,1.0
row three,3.0,3.0
row two,2.0,2.0


In [29]:
# If no index is given in arguments, then the index will be automatically numbered:

# Given its own name rather than rebinding `d` (the dict of Series used for df1),
# so re-running either cell never builds a frame from the wrong dict.
columns_as_lists = {
    'one': [1., 2., 3., 4.],
    'two': [2., 4., 6., 8.],
}

pd.DataFrame(columns_as_lists)

Unnamed: 0,one,two
0,1.0,2.0
1,2.0,4.0
2,3.0,6.0
3,4.0,8.0


In [30]:
# DataFrame from a structure or record array
# "Handled identically to a dict of arrays"

# Each (name, code) pair declares one field of the structured dtype:
#   'i4'  -> 4-byte signed integer
#   'f4'  -> 4-byte float
#   'S10' -> byte string of at most 10 bytes (the older 'a10' spelling was removed in NumPy 2.0)
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])

In [31]:
# Fill both records in place; the str values end up as bytes in field 'C' (see the b'...' values in the frames displayed from this array).
data[:] = [(1, 2., 'Hello'), (2, 3., 'World')]

In [32]:
# The field names become the columns; the rows get the default 0..n-1 index.
pd.DataFrame(data)

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [33]:
# Same frame, with explicit row labels instead of 0 and 1.
pd.DataFrame(data, index=['first', 'second'])

Unnamed: 0,A,B,C
first,1,2.0,b'Hello'
second,2,3.0,b'World'


In [34]:
# Passing columns= selects the fields and fixes their order: C, A, B.
pd.DataFrame(data, columns=['C', 'A', 'B'])

Unnamed: 0,C,A,B
0,b'Hello',1,2.0
1,b'World',2,3.0


In [35]:
# Ok, I didn't really understand that example because I didn't understand the complex np array input.

In [36]:
# Create a DataFrame from a list of dicts

data2 = [{'a' : 1, 'b': 2}, {'a' : 3, 'c' : 4, 'd' : 5}]
pd.DataFrame(data2) # 'a' is present in every dict, so it stays integer; 'b', 'c' and 'd' each have a missing entry, which is stored as NaN (a float), so those columns become float

Unnamed: 0,a,b,c,d
0,1,2.0,,
1,3,,4.0,5.0


In [37]:
# From a dict of tuples
# "Automatically create a multi-indexed DF by passing a tuples dict"---this will be useful.

# Outer keys (year, quarter) become the two column levels; inner keys
# (measure, kind) become the two row levels.
tupledict = {
    period: {
        ('Generation', 'Predicted'): predicted,
        ('Generation', 'Actual'): actual,
        ('Profit', ''): profit,
    }
    for period, predicted, actual, profit in [
        (('2017', 'Q4'), 100, 98, 22),
        (('2018', 'Q1'), 80, 78, 4),
        (('2018', 'Q2'), 100, 98, 9),
        (('2018', 'Q3'), 120, 118, 37),
        (('2018', 'Q4'), 100, 98, 22),
    ]
}

year_in_quarters = pd.DataFrame(tupledict)

year_in_quarters

Unnamed: 0_level_0,Unnamed: 1_level_0,2017,2018,2018,2018,2018
Unnamed: 0_level_1,Unnamed: 1_level_1,Q4,Q1,Q2,Q3,Q4
Generation,Actual,98,78,98,118,98
Generation,Predicted,100,80,100,120,100
Profit,,22,4,9,37,22


In [38]:
# Alternate constructors: from array or dict.  Good trivia to know.

In [39]:
### Column Selection, Addition, and Deletion
# Selecting a top-level column label returns the sub-frame under it, with that level dropped.
year_in_quarters['2018']

Unnamed: 0,Unnamed: 1,Q1,Q2,Q3,Q4
Generation,Actual,78,98,118,98
Generation,Predicted,80,100,120,100
Profit,,4,9,37,22


In [40]:
# 'Q4' exists only on the second column level, so a plain [] lookup raises KeyError;
# catch it so the notebook still runs top to bottom while showing the error.
try:
    year_in_quarters['Q4']
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Q4'

In [41]:
# A full (year, quarter) key selects exactly one column, returned as a Series.
year_in_quarters['2018', 'Q4']

Generation  Actual        98
            Predicted    100
Profit                    22
Name: (2018, Q4), dtype: int64

In [42]:
# year_in_quarters['2018'] is a four-column frame (Q1..Q4), so it cannot be assigned
# to the single key '2019' (that raises ValueError). Add the projection one quarter
# at a time under a full ('2019', quarter) column key instead.
projection_2019 = year_in_quarters['2018'] * 0.98
for quarter in projection_2019.columns:
    year_in_quarters[('2019', quarter)] = projection_2019[quarter]

In [46]:
# Here the tuple keys are on the inner dicts, so (year, half) becomes a
# two-level row index and the outer keys become ordinary columns.
another_tuple_dict = {
    column: {(2017, 1): 2, (2017, 2): 3, (2018, 1): 2, (2018, 2): 3}
    for column in ('Actual Generation (kWh)', 'Predicted Generation (kWh)')
}

year_halves = pd.DataFrame(another_tuple_dict)

year_halves

Unnamed: 0,Unnamed: 1,Actual Generation (kWh),Predicted Generation (kWh)
2017,1,2,2
2017,2,3,3
2018,1,2,2
2018,2,3,3


In [48]:
# level=0 groups on the outer index level (the year); sum() adds up the two halves of each year.
year_halves.groupby(level=0).sum()

Unnamed: 0,Actual Generation (kWh),Predicted Generation (kWh)
2017,5,5
2018,5,5
