In [29]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# 5.1 Introduction to pandas Data Structures

### Series

In [9]:
obj = pd.Series([4, 7, -5, 3])

In [10]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

And so on...

In [11]:
obj.values

array([ 4,  7, -5,  3])

In [12]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [17]:
obj2 = pd.Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])

In [18]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [19]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [20]:
obj2['a']

-5

In [24]:
obj2['d']

4

In [23]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    4
dtype: int64

#### notice for single values passed to a df, a single bracke is needed, but for multiple values, double brackets are needed

In [25]:
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

#### above exampl is filtering with boolean. only those values over zero are kept in the series

In [26]:
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

#### by 2  times a series, you double the values in the series

In [30]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

#### np.exp is e ^ x

In [31]:
'b' in obj2

True

#### checks if value is in 'key' of 'dictionary'

In [32]:
'e' in obj2

False

In [51]:
sdata = {'Ohio': 35000, 'Texas': 71000, "Oregon": 16000, "Utah": 5000} #this is a dictionary

In [52]:
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

#### in the above example, sdata is a dictionary. Dictionaries can be casted by pd.Series and assigned to a df obj

In [49]:
states = ['California', "Ohio", "Oregon", "Texas"]

In [53]:
obj4 = pd.Series(sdata, index = states)

In [54]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

#### states is a list of the states in sdata. Indexes are sorted when dicts are passed but you can order the indexes by passing a list in that order, with Series('series to pass in', 'index' = 'list of ordered index)

#### notice California was not in the dictionary and although Utah was, it's not in our passed in list

In [55]:
pd.isnull(obj4) #pandas has the isnull and notnull functions 

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [56]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [58]:
obj4.isnull() #obj4 is a series so it has the instance method isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [59]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

#### when doing arithmetic operations between 2 series, pandas automatically aligns the indexes with each other

In [60]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [61]:
obj4.name = 'population'

In [62]:
obj4.index.name = 'state'

In [63]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

#### in a series, you can assign a name to the index and a name to the series itself ('state', 'population')

In [64]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [65]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

#### indexes can be set (notice they're in the order the list is)

In [68]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 
       'year': [2000, 2001, 2002, 2001, 2002, 2003],
       'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
#create a dataframe from a dictionary and equal length value entries

In [69]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [70]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [71]:
pd.DataFrame(data, columns = ['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


#### above, a data frame was created from the data dictionary, but we specified the order of the columns with 'columns'

In [74]:
frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'], 
                    index = ['one', 'two', 'three', 'four', 'five', 'six'])

In [75]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


#### notice we have debt as a column, but because debt is not in the data dictionary, it's populated by NaN

In [77]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [78]:
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

#### series can be retrived with dict like notation from a df or attribute

In [79]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

#### probably better to use ['column'], can account for spaces and dot notation only works with python variables

#### series returned from a df have the original indexes of the dataframe

In [80]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [81]:
frame2['debt'] = 16.5

In [83]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


#### columns can be assigned a single (scalar value) to modify them or an array of values to modify

In [86]:
frame2['debt'] = np.arange(6.)

In [87]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


#### np.arange returns a list of evenly spaced values in interval, interval would be 6. in this example

In [88]:
val = pd.Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])

In [89]:
frame2['debt'] = val

In [90]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


#### PRETTY COOL!! You passed a series with the values, and then the indexes assigned to the values. You then assign frame 2's debt column with the val series. When this happens, the indexes from the val are alligned with the indexes in frame2. However where there are missing indexes (ergo missing values), the values will be assigned NaN

In [91]:
frame2['eastern'] = frame2.state == 'Ohio'

In [92]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


#### okay so frame2 creates a new column where if frame2.state == 'Ohio', then it evaluates to true and fills up the 'eastern' column, apparently you can't create new columns from frame2.eastern

In [93]:
del frame2['eastern']

In [94]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

#### pretty self explanatory, delete the column in df frame2

#### i think it's saying modifications to data frames are reflected in indexing

In [95]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

#### nested dictionaries will assume the outer dictionary is the column, the inner key will be the row indices

In [96]:
frame3 = pd.DataFrame(pop)

In [97]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [98]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


#### where T is transposing the data

In [100]:
pd.DataFrame(pop, index = [2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


#### well it says in the book the keys are combined and then sorted to form the index, but that's not the case in my above examples

In [119]:
pdata = {'Ohio': frame3['Ohio'][:-1], 
        'Nevada': frame3['Nevada'][:2]}

pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


#### pdata is a dict. the keys are 'Ohio' and 'Nevada' and they're the column names. We pass in the series from frame 3's column, 'ohio', and frame3's column 'nevada'. The indexes are like lists [:-1] is everything except last, [:2] everything up to nut not including index 2

In [118]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [123]:
frame3['Nevada'][:2]

2001    2.4
2002    2.9
Name: Nevada, dtype: float64

In [124]:
frame3.index.name = 'year'
frame3.columns.name = 'state'

#### you can assign the index name and attribute

In [125]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [126]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

#### .values returns the data as a 2 dim ndarray, in the case above, it would be a 3 x 2 array


In [127]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

#### notice above the values even in each row are a different data type, but the values array will accomodate

In [129]:
obj = pd.Series(range(3), index = ['a', 'b', 'c'])

In [130]:
index = obj.index

#### so each series and dataframe has an index object. in the above value, we assign obj's index to the index variable

In [131]:
index

Index(['a', 'b', 'c'], dtype='object')

In [132]:
index[1:]

Index(['b', 'c'], dtype='object')

#### once again lists behave like python lists cause this is pythong

#### index objects are immutable in the sense they can be reassigned but you can't change the elemnt of an index, say '2001'[0] = 'something'.

#### or in the above index, you can't reassign an index. index[1] = 'd' gives a type error 

In [133]:
index[1] = 'd'

TypeError: Index does not support mutable operations

In [135]:
labels = pd.Index(np.arange(3))

In [136]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [137]:
obj2 = pd.Series([1.5, -2.5, 0], index = labels)

#### remember you're passing in a list and the index to the series function so they must be inside the parentheses

In [138]:
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [139]:
obj2.index is labels

True

#### line 139 compares obj2's index to the labels index. Note pd.Index takes in np.arange(3) which creates a list with values of [0, 1, 2]

In [140]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [141]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

#### indexes are fix sized sets in that they can't be added to? but you can combine them with append?????

In [143]:
dup_labels = [pd.Index(['foo', 'foo', 'bar', 'bar'])]

In [144]:
dup_labels

[Index(['foo', 'foo', 'bar', 'bar'], dtype='object')]

#### indexes can have duplicates, and selections of duplicates will select all occurences of that duplicate

In [145]:
frame3.index, index = ['d', 'b', 'a', 'c']

Int64Index([2001, 2002, 2000], dtype='int64', name='year')

In [153]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])

In [156]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [154]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [155]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

#### reindexing takes an object and reindexes according to the list you pass. Reindexing also fills in missing values

In [161]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])

In [162]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [163]:
obj3.reindex(range(6), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

#### ffill takes the last valid value and fills in the empty space ahead

In [169]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)),
index = ['a', 'c', 'd'],
columns = ['Ohio', 'Texas', 'California'])

In [170]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


#### np.reshape takes a tuple (?) of the shape of a 3 x 3 and then arange(9) is used to fill in the values from 0 - 8

In [173]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [174]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


#### above because we passed only a sequence, we reindexed the rows. Notice there is nothing around the list we passed to reindex because this is an index sequence

In [175]:
states = ['Texas', 'Utah', 'California']

In [176]:
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


#### and because we assign columns to the states list, it knows to reindex columns 

In [178]:
frame.loc[['a', 'b', 'c', 'd'], states]

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['b'], dtype='object'). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

#### this book did come out in 2018... that's pretty much a century ago (if you see this kris i will be impressed)

In [180]:
obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])

In [181]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [182]:
new_obj = obj.drop('c')

In [183]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

#### notice obj.drop returns a new obj, the original is not affected

In [187]:
obj.drop(['c', 'd'])

a    0.0
b    1.0
e    4.0
dtype: float64

#### AND NOTICE (as usual in pandas) that you have to pass a list WHEN working on multiple values at once

In [188]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), 
                   index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])

In [189]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [190]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [192]:
data.drop('two', axis = 1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


#### axis default value is 0. axis = 1 is the value to drop a column

In [195]:
data.drop(['two', 'four'], axis = 1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [196]:
obj.drop('c', inplace = True)

#### and here's our old friend inplace. Buyer beware: inplace destroys and dropped data

In [197]:
obj = pd.Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])

In [198]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [199]:
obj[1]

1.0

In [200]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [202]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [203]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

#### you can use the index as the key to call the value OR you can pass in the value? how the hell does it know if the value is an index or a value?

In [204]:
obj[obj < 2] #same as obj[:2] ?

a    0.0
b    1.0
dtype: float64

In [205]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

#### slicing is inclusive. 

In [206]:
obj['b':'c'] = 5

In [207]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

#### you can also set values by slicing

In [208]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), index = ['Ohio', 'Colorado', 'Utah', 'New Yrok'],
                    columns = ['one', 'two', 'three', 'four'])

In [209]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New Yrok,12,13,14,15


In [210]:
data['two']

Ohio         1
Colorado     5
Utah         9
New Yrok    13
Name: two, dtype: int64

In [211]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New Yrok,14,12


#### indexing a df, just pass the column names. Indexes will be filled

In [212]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


#### slicing only works on indexes (duh)

In [213]:
data[data['three'] > 5] #this calls the rows where three > 5

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New Yrok,12,13,14,15


In [214]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New Yrok,False,False,False,False


#### takes the data df and creates a boolean table based on if data < 5

In [216]:
data[data < 5] = 0

In [217]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New Yrok,12,13,14,15


#### all values that are less are assigned the value of zero

In [218]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [219]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

#### loc passes the index we want first, and then the columns we want. I do believe loc uses index based on axis label

In [220]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [221]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New Yrok,12,13,14,15


#### first value passed into iloc is 2. 2 is the index of Utah. The list after the two is the columns we want based on index and what order we want them in. Hence the values are 11, 8, 9

In [222]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

#### 2 is the index number of the index we want - Utah. No columns passed in so it returns all and in the order they're displayed


In [223]:
data.iloc[[1, 2], [3, 0, 1]] # i'm gonna guess we want CO, UT, and index four, one, two in that order

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [224]:
data.loc[:'Utah', 'two'] # i'm gonna guess index values up (inc) Utah and in column 2

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [225]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New Yrok,12,13,14
