In [2]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# 5.1 Introduction to pandas Data Structures

### Series

In [3]:
obj = pd.Series([4, 7, -5, 3])

In [4]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

And so on...

In [5]:
obj.values

array([ 4,  7, -5,  3])

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj2 = pd.Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])

In [8]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [9]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [10]:
obj2['a']

-5

In [11]:
obj2['d']

4

In [12]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    4
dtype: int64

#### notice that to select a single label from a Series, a single bracket is needed, but to select multiple labels you pass a list of labels, hence the double brackets

In [13]:
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

#### the above example is filtering with a boolean mask: only the values greater than zero are kept in the series

In [14]:
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

#### by 2  times a series, you double the values in the series

In [15]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

#### np.exp is e ^ x

In [16]:
'b' in obj2

True

#### `in` checks whether a label is in the Series' index, just like checking whether a key is in a dictionary

In [17]:
'e' in obj2

False

In [18]:
sdata = {'Ohio': 35000, 'Texas': 71000, "Oregon": 16000, "Utah": 5000} #this is a dictionary

In [19]:
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

#### in the above example, sdata is a dictionary. A dictionary can be passed to pd.Series to create a Series object: the keys become the index and the values become the data

In [20]:
states = ['California', "Ohio", "Oregon", "Texas"]

In [21]:
obj4 = pd.Series(sdata, index = states)

In [22]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

#### states is a list of index labels. When only a dict is passed, the index follows the order of the dict's keys (as seen in obj3 above), but you can choose the labels and their order yourself by passing a list: pd.Series(data, index = list_of_labels_in_order)

#### notice California was not in the dictionary and although Utah was, it's not in our passed in list

In [23]:
pd.isnull(obj4) #pandas has the isnull and notnull functions 

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [24]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [25]:
obj4.isnull() #obj4 is a series so it has the instance method isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [26]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

#### when doing arithmetic operations between 2 series, pandas automatically aligns the indexes with each other

In [27]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [28]:
obj4.name = 'population'

In [29]:
obj4.index.name = 'state'

In [30]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

#### in a series, you can assign a name to the index and a name to the series itself ('state', 'population')

In [31]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [32]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

#### indexes can be set (notice they're in the order the list is)

### DataFrame

In [33]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 
       'year': [2000, 2001, 2002, 2001, 2002, 2003],
       'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
#create a dataframe from a dictionary and equal length value entries

In [34]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [35]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [36]:
pd.DataFrame(data, columns = ['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


#### above, a data frame was created from the data dictionary, but we specified the order of the columns with 'columns'

In [37]:
frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'], 
                    index = ['one', 'two', 'three', 'four', 'five', 'six'])

In [38]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


#### notice we have debt as a column, but because debt is not in the data dictionary, it's populated by NaN

In [39]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [40]:
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

#### a column can be retrieved from a df as a series, either with dict-like notation or as an attribute

In [41]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

#### probably better to use ['column']: it works for any column name (including names with spaces), whereas dot notation only works when the column name is a valid python variable name

#### series returned from a df have the original indexes of the dataframe

In [42]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [43]:
frame2['debt'] = 16.5

In [44]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


#### columns can be assigned a single (scalar value) to modify them or an array of values to modify

In [45]:
frame2['debt'] = np.arange(6.)

In [46]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


#### np.arange returns an array of evenly spaced values within an interval; np.arange(6.) gives the six floats 0.0 through 5.0 (the stop value 6. itself is excluded)

In [47]:
val = pd.Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])

In [48]:
frame2['debt'] = val

In [49]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


#### PRETTY COOL!! You create a series with the values and the index labels those values belong to. You then assign the val series to frame2's debt column. When this happens, the index of val is aligned with the index of frame2. Wherever frame2 has a label that is missing from val (ergo a missing value), the value is set to NaN

In [50]:
frame2['eastern'] = frame2.state == 'Ohio'

In [51]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


#### okay so frame2['eastern'] creates a new column that is True where frame2.state == 'Ohio' and False otherwise. Apparently you can't create a new column with the attribute syntax (frame2.eastern = ...), only with the bracket syntax

In [52]:
del frame2['eastern']

In [53]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

#### pretty self explanatory, delete the column in df frame2

#### i think it's saying modifications to data frames are reflected in indexing

In [54]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

#### nested dictionaries will assume the outer dictionary is the column, the inner key will be the row indices

In [55]:
frame3 = pd.DataFrame(pop)

In [56]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [57]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


#### where T is transposing the data

In [58]:
pd.DataFrame(pop, index = [2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


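#### well it says in the book the keys are combined and then sorted to form the index, but that's not the case in my above examples: frame3's index is 2001, 2002, 2000, i.e. the order in which the keys were first encountered, so the behavior has presumably changed since the pandas version used in the book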

In [59]:
pdata = {'Ohio': frame3['Ohio'][:-1], 
        'Nevada': frame3['Nevada'][:2]}

pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


#### pdata is a dict. The keys are 'Ohio' and 'Nevada' and they become the column names. We pass in slices of the series from frame3's columns 'Ohio' and 'Nevada'. The slices work like list slices: [:-1] is everything except the last element, [:2] is everything up to but not including position 2

In [60]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [61]:
frame3['Nevada'][:2]

2001    2.4
2002    2.9
Name: Nevada, dtype: float64

In [62]:
frame3.index.name = 'year'
frame3.columns.name = 'state'

#### you can assign a name to the index and a name to the columns through their name attributes

In [63]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [64]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

#### .values returns the data as a 2 dim ndarray, in the case above, it would be a 3 x 2 array


In [65]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

#### notice above that the values within each row have different data types, but the values array will accommodate them all by using dtype=object

### Index Objects


In [66]:
obj = pd.Series(range(3), index = ['a', 'b', 'c'])

In [67]:
index = obj.index

#### so each series and dataframe has an index object. in the above value, we assign obj's index to the index variable

In [68]:
index

Index(['a', 'b', 'c'], dtype='object')

In [69]:
index[1:]

Index(['b', 'c'], dtype='object')

#### once again, an Index can be sliced like a python list because this is python

#### index objects are immutable in the sense that an index can be replaced as a whole, but you can't change a single element of an index, say '2001'[0] = 'something'.

#### or in the above index, you can't reassign an index. index[1] = 'd' gives a type error 

In [70]:
# Index objects are immutable, so item assignment raises TypeError.
# The error is the point of this cell: catch and print it so the notebook
# still survives "Restart Kernel -> Run All".
try:
    index[1] = 'd'
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Index does not support mutable operations

In [None]:
labels = pd.Index(np.arange(3))

In [None]:
labels

In [None]:
obj2 = pd.Series([1.5, -2.5, 0], index = labels)

#### remember you're passing in a list and the index to the series function so they must be inside the parentheses

In [None]:
obj2

In [None]:
obj2.index is labels

#### the cell above uses `is` to check that obj2's index is the very same object as labels (identity, not just equal values). Note pd.Index takes in np.arange(3), which creates an array with the values [0, 1, 2]

In [None]:
frame3

In [None]:
frame3.columns

#### indexes are fix sized sets in that they can't be added to? but you can combine them with append?????

In [None]:
# An Index may contain duplicate labels. Build the Index directly: wrapping it
# in a list (as before) produced a one-element Python list, not an Index.
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])

In [None]:
dup_labels

#### indexes can have duplicates, and selecting a duplicated label will select all occurrences of that label

In [None]:
# The previous statement tried to unpack a 4-element list into two targets
# (`frame3.index` and `index`), which raises ValueError. An Index behaves like
# a fixed-size set, so demonstrate a membership test on it instead.
2003 in frame3.index

# 5.2 Essential Functionality

## Reindexing

In [None]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])

In [None]:
obj

In [None]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [None]:
obj2

#### reindexing takes an object and reindexes according to the list you pass. Reindexing also fills in missing values

In [None]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])

In [None]:
obj3

In [None]:
obj3.reindex(range(6), method = 'ffill')

#### ffill takes the last valid value and fills in the empty space ahead

In [None]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)),
index = ['a', 'c', 'd'],
columns = ['Ohio', 'Texas', 'California'])

In [None]:
frame

#### np.reshape takes a tuple (?) of the shape of a 3 x 3 and then arange(9) is used to fill in the values from 0 - 8

In [None]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [None]:
frame2

#### above because we passed only a sequence, we reindexed the rows. Notice there is nothing around the list we passed to reindex because this is an index sequence

In [None]:
states = ['Texas', 'Utah', 'California']

In [None]:
frame.reindex(columns = states)

#### and because we assign columns to the states list, it knows to reindex columns 

In [None]:
frame.loc[['a', 'b', 'c', 'd'], states]

#### this book did come out in 2018... that's pretty much a century ago (if you see this kris i will be impressed)

### Dropping Entries from an Axis

In [None]:
obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])

In [None]:
obj

In [None]:
new_obj = obj.drop('c')

In [None]:
new_obj

#### notice obj.drop returns a new obj, the original is not affected

In [None]:
obj.drop(['c', 'd'])

#### AND NOTICE (as usual in pandas) that you have to pass a list WHEN working on multiple values at once

In [None]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), 
                   index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])

In [None]:
data

In [None]:
data.drop(['Colorado', 'Ohio'])

In [None]:
data.drop('two', axis = 1)

#### axis default value is 0. axis = 1 is the value to drop a column

In [None]:
data.drop(['two', 'four'], axis = 1)

In [None]:
obj.drop('c', inplace = True)

#### and here's our old friend inplace. Buyer beware: inplace modifies obj itself and destroys any dropped data

### Indexing, Selection, and Filtering

In [None]:
obj = pd.Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])

In [None]:
obj

In [None]:
# Select by position explicitly with .iloc: obj is indexed by string labels, and
# treating a plain integer key as a position is deprecated in recent pandas.
obj.iloc[1]

In [None]:
obj[2:4]

In [None]:
obj[['b', 'a', 'd']]

In [None]:
# Select several positions explicitly with .iloc: obj is indexed by string labels,
# and treating integer keys as positions is deprecated in recent pandas.
obj.iloc[[1, 3]]

#### you can use the index as the key to call the value OR you can pass in the value? how the hell does it know if the value is an index or a value?

In [None]:
obj[obj < 2] # boolean filter on the values; it matches obj[:2] here only because the first two values happen to be the ones below 2

In [None]:
obj['b':'c']

#### slicing is inclusive. 

In [None]:
obj['b':'c'] = 5

In [None]:
obj

#### you can also set values by slicing

In [None]:
# 4 x 4 frame holding the integers 0-15, with state names as row labels and
# number words as column labels. The row label 'New York' was misspelled as
# 'New Yrok' before.
data = pd.DataFrame(np.arange(16).reshape((4,4)), index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns = ['one', 'two', 'three', 'four'])

In [None]:
data

In [None]:
data['two']

In [None]:
data[['three', 'one']]

#### indexing a df, just pass the column names. Indexes will be filled

In [None]:
data[:2]

#### slicing only works on indexes (duh)

In [None]:
data[data['three'] > 5] #this calls the rows where three > 5

In [None]:
data < 5

#### takes the data df and creates a boolean table based on if data < 5

In [None]:
data[data < 5] = 0

In [None]:
data

#### all values that are less are assigned the value of zero

### Selection with loc and iloc

In [None]:
data.loc['Colorado', ['two', 'three']]

In [None]:
data.loc['Colorado', ['two', 'three']]

#### loc passes the index we want first, and then the columns we want. I do believe loc uses index based on axis label

In [None]:
data.iloc[2, [3, 0, 1]]

In [None]:
data

#### first value passed into iloc is 2. 2 is the index of Utah. The list after the two is the columns we want based on index and what order we want them in. Hence the values are 11, 8, 9

In [None]:
data.iloc[2]

#### 2 is the index number of the index we want - Utah. No columns passed in so it returns all and in the order they're displayed


In [None]:
data.iloc[[1, 2], [3, 0, 1]] # i'm gonna guess we want CO, UT, and index four, one, two in that order

In [None]:
data.loc[:'Utah', 'two'] # i'm gonna guess index values up (inc) Utah and in column 2

In [None]:
data.iloc[:, :3][data.three > 5]

#### what i'm seeing: we want columns index 0 to 2 and the first : means we want all rows (but we don't get ohio). Then we want rows where the value in column 3 (data.three) is > 5. Because Ohio has no values greater than 5, ohio is not shown here

#### [ :, : 3] means all rows. :3 means show only the first 3 columns. [data.three > 5] of all the rows, show me which rows have values in column 3 that are > 5

### Integer Indexes

In [None]:
ser = pd.Series(np.arange(3.))
ser

#### so we have the integer indexes on the left, and if we wanted to specify a row, we could not use ser[-1], because it is ambiguous: are you referring to position -1 (the last row) or to the row labelled -1?

In [None]:
ser2 = pd.Series(np.arange(3.), index = ['a', 'b', 'c'])
# With a label index, position-based access should be explicit: plain ser2[-1]
# relies on the deprecated fallback that treats an integer key as a position.
ser2.iloc[-1]

#### however when the indexes are non integer, there is no ambiguity. Use loc for labels and iloc for integers

In [None]:
ser[:2]

### Arithmetic and Data Alignment

In [None]:
# Two Series whose labels only partly overlap, used to demonstrate index alignment.
s1 = pd.Series({'a': 7.3, 'c': -2.5, 'd': 3.4, 'e': 1.5})
s2 = pd.Series({'a': -2.1, 'c': 3.6, 'e': -1.5, 'f': 4, 'g': 3.1})

In [None]:
s1

In [None]:
s2

In [None]:
s1 + s2

#### when you add 2 objects, the index of the result is the union of the two indexes. A label that exists in only one of the objects gives NaN: d exists only in s1, while f (4.0) and g exist only in s2, so s1 + s2 is NaN for d, f and g

In [None]:
df1 = pd.DataFrame(np.arange(9.).reshape(3,3), columns = list('bcd'), index = ['Utah', 'Texas', 'Colorado'])

In [None]:
df2 = pd.DataFrame(np.arange(12.).reshape(4,3), columns = list('bde'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [None]:
df1

In [None]:
df2

In [None]:
df1 + df2

#### the addition rule from above (where there are no matching indexes > NaN) also applies to columns

In [None]:
df1 = pd.DataFrame({'A': [1, 2]})

In [None]:
df2 = pd.DataFrame({'B': [3, 4]})

In [None]:
df1

In [None]:
df2

In [None]:
df1 + df2

#### and cause neither columns align, you get a df with NaN values

### Arithmetic methods with fill values

In [71]:
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)), columns = list('abcd'))

In [72]:
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)),columns = list('abcde'))

In [73]:
df2.loc[1, 'b'] = np.nan

In [74]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [75]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [76]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [77]:
df1.add(df2, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


#### fill_value fills in missing data

In [78]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [80]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


#### division example ^: each arithmetic method has a counterpart that starts with r and flips the arguments, so df1.rdiv(1) is the same as 1/df1

In [84]:
df1.reindex(columns = df2.columns, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


#### we're reindexing df1 with df2 columns, but column e does not exist in df1. column e then has NaN values > fill value converts the NaN to 0, hence why column e above has 0

### Operations between DataFrame and Series

In [86]:
arr = np.arange(12.).reshape((3,4))

In [87]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [89]:
arr[0]

array([0., 1., 2., 3.])

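#### the cell above only shows the first row, arr[0]. Subtracting it from the whole array with arr - arr[0] returns a new array in which arr[0] has been subtracted from every row (so the first row of the result is [0, 0, 0, 0]); arr itself is not changed. This is known as broadcasting.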

In [96]:
frame = pd.DataFrame(np.arange(12.).reshape((4,3)), columns = list('bde'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [97]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [99]:
series = frame.iloc[0]

In [100]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [101]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [106]:
series2 = pd.Series(range(3), index = ['b', 'e', 'f'])
series2

b    0
e    1
f    2
dtype: int64

In [105]:
frame + series2 #frame has no d, series has no f. therefore they evaluate to NaN

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [112]:
series3 = frame['d']

In [109]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [113]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [115]:
frame.sub(series3, axis = 'index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


#### so we're subtracting series3 (column d) from every column of frame. We tell the function this by writing axis = 'index': the labels of the series are matched against the row index and the subtraction is broadcast across the columns. Hence the values in column b minus the values in column d, column d minus itself, and the values in column e minus column d

In [116]:
obj = pd.Series(range(4), index = ['d', 'a', 'b', 'c'])

In [117]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

### Function Application and Mapping

In [118]:
frame = pd.DataFrame(np.random.randn(4,3), columns = list('bde'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [119]:
frame

Unnamed: 0,b,d,e
Utah,0.808605,-1.10086,0.283038
Ohio,1.00139,-0.581789,-1.613676
Texas,-1.341156,0.208925,-0.665643
Oregon,0.634848,-0.47483,-0.001749


#### np.random.randn draws random values from a standard gaussian (normal) distribution; the values you pass into randn are the dimensions of the result, here 4 rows x 3 columns

In [120]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.808605,1.10086,0.283038
Ohio,1.00139,0.581789,1.613676
Texas,1.341156,0.208925,0.665643
Oregon,0.634848,0.47483,0.001749


In [121]:
def f(x):
    """Return the spread of ``x``: its largest value minus its smallest value.

    When passed to ``DataFrame.apply``, ``x`` is one column (or one row) at a time.
    """
    return x.max() - x.min()

In [122]:
frame.apply(f)

b    2.342546
d    1.309785
e    1.896714
dtype: float64

In [123]:
frame.apply(f, axis = 'columns')

Utah      1.909465
Ohio      2.615067
Texas     1.550081
Oregon    1.109678
dtype: float64

#### this time the function is applied once per row (across the columns) because axis = 'columns'

In [126]:
def f(x):
    """Summarize ``x`` by its smallest and largest value.

    Returns a two-element Series labelled 'min' and 'max', so applying this
    function to a DataFrame yields one 'min' row and one 'max' row.
    """
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index = ['min', 'max'])

In [127]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.341156,-1.10086,-1.613676
max,1.00139,0.208925,0.283038


#### and the above applies the min and max per column

In [128]:
# Render a number as a string with two digits after the decimal point.
# NOTE(review): this name shadows the built-in `format` for the rest of the session.
format = lambda x: '%.2f' % x

In [129]:
frame

Unnamed: 0,b,d,e
Utah,0.808605,-1.10086,0.283038
Ohio,1.00139,-0.581789,-1.613676
Texas,-1.341156,0.208925,-0.665643
Oregon,0.634848,-0.47483,-0.001749


In [131]:
frame.applymap(format) #notice you have to use applymap, not apply to format strings
#the reason why is that Series has a map method for applying an element wise function. Ergo data frame uses applymap
#series uses .map

Unnamed: 0,b,d,e
Utah,0.81,-1.1,0.28
Ohio,1.0,-0.58,-1.61
Texas,-1.34,0.21,-0.67
Oregon,0.63,-0.47,-0.0


In [132]:
frame['e'].map(format)

Utah       0.28
Ohio      -1.61
Texas     -0.67
Oregon    -0.00
Name: e, dtype: object

### Sorting and Ranking

In [135]:
obj = pd.Series(range(4), index = ['d','a', 'b', 'c'])

In [136]:
obj.sort_index() #sorting for series

a    1
b    2
c    3
d    0
dtype: int64

In [137]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)), index = ['three', 'one'], columns = ['d', 'a', 'b', 'c'])

In [138]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [139]:
frame.sort_index(axis = 1) # and here we see axis = 1 so we know we're sorting by column

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [141]:
frame.sort_index(axis = 1, ascending = False) #here you can swap to desc for indexes

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [142]:
obj = pd.Series([4, 7, -3, 2])

In [144]:
obj.sort_values() #here you sort by values, NOT index

2   -3
3    2
0    4
1    7
dtype: int64

In [145]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [146]:
obj.sort_values() #missing values are pushed to the end of the series when we sort by values

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [147]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

In [148]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [149]:
frame.sort_values(by = 'b') #we can use a specific column as the key in our sort. We do this by using 'by = column name*'

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [150]:
frame.sort_values(by = ['a', 'b']) # here you sort by multiple columns: first by 'a', then by 'b' within equal 'a' values

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [3]:
obj = pd.Series([7, -4, 7, 4, 2, 0, 4])

In [5]:
obj

0    7
1   -4
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [6]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [7]:
obj.rank(method = 'first') #doesn't take averages, doesn't give top or lower rank between ties. Just gives the rank and ties are decided by top down order

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [9]:
obj.rank(ascending = False, method = 'max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [10]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})

In [11]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [12]:
frame.rank(axis = 'columns') #here rank can be calculated based on columns

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [14]:
obj = pd.Series(range(5), index = ['a', 'a', 'b', 'b', 'c'])

In [15]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [16]:
obj.index.is_unique

False

In [17]:
obj['a'] #indexing a label with multiple entries returns a series, while single entries return scalars

a    0
a    1
dtype: int64

In [18]:
obj['c']

4

In [19]:
df = pd.DataFrame(np.random.randn(4,3), index = ['a', 'a', 'b', 'b'])

In [20]:
df

Unnamed: 0,0,1,2
a,0.25313,0.057896,-0.186234
a,-0.460466,0.667822,0.667899
b,0.198541,0.188877,0.231311
b,1.866919,1.40974,-0.072494


In [21]:
df.loc['b'] #returns both rows - very cool

Unnamed: 0,0,1,2
b,0.198541,0.188877,0.231311
b,1.866919,1.40974,-0.072494


## Summarizing and Computing Descriptive Statistics

In [24]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index = ['a', 'b', 'c', 'd'], columns = ['one', 'two'])

In [25]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [27]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [28]:
df.sum(axis = 'columns') #nan's are considered 0

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [29]:
df.mean(axis = 'columns', skipna = False) # with skipna = False, any row that contains a NaN gets NaN as its mean

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [30]:
df.idxmax()

one    b
two    d
dtype: object

In [32]:
df.cumsum() # adds up as you go down

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [33]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [36]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)

In [37]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [38]:
conda install pandas-datareader

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/sean/opt/anaconda3

  added / updated specs:
    - pandas-datareader


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    pandas-datareader-0.9.0    |             py_0          72 KB
    ------------------------------------------------------------
                                           Total:          72 KB

The following NEW packages will be INSTALLED:

  pandas-datareader  pkgs/main/noarch::pandas-datareader-0.9.0-py_0



Downloading and Extracting Packages
pandas-datareader-0. | 72 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.


In [42]:
import pandas_datareader.data as web

In [43]:
all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

RemoteDataError: Unable to read URL: https://finance.yahoo.com/quote/AAPL/history?period1=1468224000&period2=1625990399&interval=1d&frequency=1d&filter=history
Response Text:
b'<!DOCTYPE html>\n  <html lang="en-us"><head>\n  <meta http-equiv="content-type" content="text/html; charset=UTF-8">\n      <meta charset="utf-8">\n      <title>Yahoo</title>\n      <meta name="viewport" content="width=device-width,initial-scale=1,minimal-ui">\n      <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n      <style>\n  html {\n      height: 100%;\n  }\n  body {\n      background: #fafafc url(https://s.yimg.com/nn/img/sad-panda-201402200631.png) 50% 50%;\n      background-size: cover;\n      height: 100%;\n      text-align: center;\n      font: 300 18px "helvetica neue", helvetica, verdana, tahoma, arial, sans-serif;\n  }\n  table {\n      height: 100%;\n      width: 100%;\n      table-layout: fixed;\n      border-collapse: collapse;\n      border-spacing: 0;\n      border: none;\n  }\n  h1 {\n      font-size: 42px;\n      font-weight: 400;\n      color: #400090;\n  }\n  p {\n      color: #1A1A1A;\n  }\n  #message-1 {\n      font-weight: bold;\n      margin: 0;\n  }\n  #message-2 {\n      display: inline-block;\n      *display: inline;\n      zoom: 1;\n      max-width: 17em;\n      _width: 17em;\n  }\n      </style>\n  <script>\n    document.write(\'<img src="//geo.yahoo.com/b?s=1197757129&t=\'+new Date().getTime()+\'&src=aws&err_url=\'+encodeURIComponent(document.URL)+\'&err=%<pssc>&test=\'+encodeURIComponent(\'%<{Bucket}cqh[:200]>\')+\'" width="0px" height="0px"/>\');var beacon = new Image();beacon.src="//bcn.fp.yahoo.com/p?s=1197757129&t="+new Date().getTime()+"&src=aws&err_url="+encodeURIComponent(document.URL)+"&err=%<pssc>&test="+encodeURIComponent(\'%<{Bucket}cqh[:200]>\');\n  </script>\n  </head>\n  <body>\n  <!-- status code : 404 -->\n  <!-- Not Found on Server -->\n  <table>\n  <tbody><tr>\n      <td>\n      <img src="https://s.yimg.com/rz/p/yahoo_frontpage_en-US_s_f_p_205x58_frontpage.png" alt="Yahoo Logo">\n      <h1 style="margin-top:20px;">Will be right back...</h1>\n      <p id="message-1">Thank you for your 
patience.</p>\n      <p id="message-2">Our engineers are working quickly to resolve the issue.</p>\n      </td>\n  </tr>\n  </tbody></table>\n  </body></html>'

### Unique Values, Value Counts, and memberships 

In [44]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [45]:
unique = obj.unique()

In [47]:
unique # uniques are not sorted in order 

array(['c', 'a', 'd', 'b'], dtype=object)

In [48]:
obj.value_counts() # computes frequency of values in desc order

a    3
c    3
b    2
d    1
dtype: int64

In [49]:
# Use the Series method: the top-level pd.value_counts helper is deprecated in
# recent pandas. sort = False leaves the counts unsorted.
obj.value_counts(sort = False)

b    2
a    3
d    1
c    3
dtype: int64

In [50]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [51]:
# isin performs membership check based on values in series or column in a dataframe, returns a boolean table

In [54]:
mask = obj.isin(['b', 'c'])

In [55]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [56]:
obj[mask] # keeps only the entries of the series where the mask is True

0    c
5    b
6    b
7    c
8    c
dtype: object

In [57]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4], 
                    'Qu2': [2, 3, 1, 2, 3],
                    'Qu3': [1, 5, 2, 4, 4]})

In [58]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [59]:
# Count how often each value occurs in every column; a value that never occurs
# in a column comes back as NaN and is replaced by 0. Series.value_counts is
# used because the top-level pd.value_counts helper is deprecated in recent pandas.
result = data.apply(pd.Series.value_counts).fillna(0)

In [60]:
result #how many 1's are in each column, how many 2s, etc. is what happens above 

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
