## Using Pandas DataFrame Structure

In [26]:
import numpy as np
import pandas as pd
# NOTE(review): this binds the top-level `matplotlib` package to `plt`, whereas the
# conventional `plt` alias refers to `matplotlib.pyplot`. `plt` is not used in the
# cells visible here - confirm what later cells expect before changing it.
import matplotlib as plt

In [27]:
# Column-oriented input: every key becomes a column label and every list
# supplies that column's values, one entry per row.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

# Wrapping the dict in a DataFrame gives a labelled table that renders nicely.
# Each column carries a single dtype, inferred from the values supplied.
df = pd.DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [28]:
df.year  # attribute-style access: returns the 'year' column as a Series

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [29]:
df.head(n=3)  # first 3 rows; called with no argument, head() returns the first 5

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6


In [30]:
df.tail(n=2)  # last 2 rows; called with no argument, tail() returns the last 5

Unnamed: 0,state,year,pop
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [31]:
df.count()  # number of non-missing entries in each column

state    6
year     6
pop      6
dtype: int64

In [32]:
df.describe()  # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,year,pop
count,6.0,6.0
mean,2001.5,2.55
std,1.048809,0.836062
min,2000.0,1.5
25%,2001.0,1.875
50%,2001.5,2.65
75%,2002.0,3.125
max,2003.0,3.6


In [33]:
# `columns=` selects and orders the columns: keys of `data` may be rearranged or
# left out, and a name that is not a key of `data` ('debt') becomes a column of
# missing values.
# `index=` replaces the default 0, 1, 2, ... row labels with custom ones.
df2 = pd.DataFrame(
    data,
    columns=['year', 'pop', 'state', 'debt'],
    index=['oldest', 'old', 'middle', 'recent', 'penultimate', 'latest'],
)
df2

Unnamed: 0,year,pop,state,debt
oldest,2000,1.5,Ohio,
old,2001,1.7,Ohio,
middle,2002,3.6,Ohio,
recent,2001,2.4,Nevada,
penultimate,2002,2.9,Nevada,
latest,2003,3.2,Nevada,


### Accessing members of a dataframe

In [34]:
df.year  # 'year' column of df, labelled with the default integer index 0..5

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [35]:
df2.year # same 'year' values as df.year, but labelled with df2's custom index instead of 0..5

oldest         2000
old            2001
middle         2002
recent         2001
penultimate    2002
latest         2003
Name: year, dtype: int64

In [36]:
df2.loc['middle'] # .loc selects by index label; a single label returns that row as a Series

year     2002
pop       3.6
state    Ohio
debt      NaN
Name: middle, dtype: object

In [37]:
# Populate an existing column by broadcasting one scalar to every row.
# Bracket assignment is used rather than `df2.debt = ...`: attribute-style
# assignment only updates a column that already exists, and otherwise just
# sets a plain Python attribute on the frame without creating a column.
df2['debt'] = 26.5
df2

Unnamed: 0,year,pop,state,debt
oldest,2000,1.5,Ohio,26.5
old,2001,1.7,Ohio,26.5
middle,2002,3.6,Ohio,26.5
recent,2001,2.4,Nevada,26.5
penultimate,2002,2.9,Nevada,26.5
latest,2003,3.2,Nevada,26.5


In [38]:
# Populate a column from another column: the arithmetic is applied
# element-wise, so each row's debt becomes 1000 times that row's 'pop' value.
# ['debt'] assignment (instead of `df2.debt = ...`) always targets the column
# itself, never a same-named attribute on the frame.
df2['debt'] = df2['pop'] * 1000
df2

Unnamed: 0,year,pop,state,debt
oldest,2000,1.5,Ohio,1500.0
old,2001,1.7,Ohio,1700.0
middle,2002,3.6,Ohio,3600.0
recent,2001,2.4,Nevada,2400.0
penultimate,2002,2.9,Nevada,2900.0
latest,2003,3.2,Nevada,3200.0


In [39]:
# Populate a column from a NumPy range holding one value per row.
# Sizing the range with len(df2) keeps it in step with the frame instead of
# hard-coding the row count, and dtype=float yields 0.0, 1.0, 2.0, ... exactly
# as the float literal in np.arange(6.) would.
# ['debt'] assignment is used because `df2.debt = ...` only works for a column
# that already exists.
df2['debt'] = np.arange(len(df2), dtype=float)
df2

Unnamed: 0,year,pop,state,debt
oldest,2000,1.5,Ohio,0.0
old,2001,1.7,Ohio,1.0
middle,2002,3.6,Ohio,2.0
recent,2001,2.4,Nevada,3.0
penultimate,2002,2.9,Nevada,4.0
latest,2003,3.2,Nevada,5.0


In [40]:
# Populate a column selectively from a Series: values are matched to rows by
# index label, not by position.
vals = pd.Series([-1.2, -1.5, -1.7], index=['oldest', 'latest', 'recent'])
# Careful: the whole column is replaced, so every row whose label is missing
# from `vals` ends up as NaN rather than keeping its previous value.
# ['debt'] assignment is used because `df2.debt = ...` only works for a column
# that already exists.
df2['debt'] = vals
df2

Unnamed: 0,year,pop,state,debt
oldest,2000,1.5,Ohio,-1.2
old,2001,1.7,Ohio,
middle,2002,3.6,Ohio,
recent,2001,2.4,Nevada,-1.7
penultimate,2002,2.9,Nevada,
latest,2003,3.2,Nevada,-1.5


In [41]:
# insert an additional data column, populated from a boolean comparison
# a new column must be created with ['name'] notation; .name notation only works for
# existing columns whose name does not clash with a DataFrame method or attribute
df2['Eastern'] = df2.state == 'Ohio'
df2

Unnamed: 0,year,pop,state,debt,Eastern
oldest,2000,1.5,Ohio,-1.2,True
old,2001,1.7,Ohio,,True
middle,2002,3.6,Ohio,,True
recent,2001,2.4,Nevada,-1.7,False
penultimate,2002,2.9,Nevada,,False
latest,2003,3.2,Nevada,-1.5,False


## Mini Challenge
#### Compute the debt per unit of population and store it in a new column 'ratio'

In [42]:
# Element-wise debt / pop, stored in a new 'ratio' column.
# 'pop' has to be read with brackets: df2.pop is the DataFrame.pop method,
# not the column.
df2['ratio'] = df2['debt'].div(df2['pop'])
df2

Unnamed: 0,year,pop,state,debt,Eastern,ratio
oldest,2000,1.5,Ohio,-1.2,True,-0.8
old,2001,1.7,Ohio,,True,
middle,2002,3.6,Ohio,,True,
recent,2001,2.4,Nevada,-1.7,False,-0.708333
penultimate,2002,2.9,Nevada,,False,
latest,2003,3.2,Nevada,-1.5,False,-0.46875


In [43]:
# Replace every missing value (NaN) in the frame with 0. fillna() hands back a
# new DataFrame, so the result is bound to df2 again.
df2 = df2.fillna(value=0)

In [44]:
df2  # display the frame now that missing values have been replaced with 0

Unnamed: 0,year,pop,state,debt,Eastern,ratio
oldest,2000,1.5,Ohio,-1.2,True,-0.8
old,2001,1.7,Ohio,0.0,True,0.0
middle,2002,3.6,Ohio,0.0,True,0.0
recent,2001,2.4,Nevada,-1.7,False,-0.708333
penultimate,2002,2.9,Nevada,0.0,False,0.0
latest,2003,3.2,Nevada,-1.5,False,-0.46875
