# Pandas Basics

In [3]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

## 1. Series

A Series is a one-dimensional array-like object containing a sequence of values and an associated array of data labels, called its *index*.

In [36]:
# Build a Series from a plain list; no index is given, so the printed output shows the default labels 0..3.
a = [4,7,-5,3]
obj = pd.Series(a)
print("obj value:", obj)

# `*` on a list repeats the whole list, whereas `*` on a Series multiplies each element.
print("\nList a times 4:", a*4)
print("\nSeries obj times 4", obj*4)
# (the `index` and `values` attributes are shown in a later section)

obj value: 0    4
1    7
2   -5
3    3
dtype: int64

List a times 4: [4, 7, -5, 3, 4, 7, -5, 3, 4, 7, -5, 3, 4, 7, -5, 3]

Series obj times 4 0    16
1    28
2   -20
3    12
dtype: int64


Difference between a list and a Series:
- Multiplication (`*`):
 * for a list, the whole list is repeated n times;
 * for a Series (as with a NumPy array), the operation is applied element by element: a number is multiplied by n, while a non-numeric value such as a string is itself repeated n times.

In [28]:
# A list mixing integers and a string; the resulting Series has dtype object (see the output).
# Note that `obj` is rebound here, replacing the integer Series from the previous cell.
b = [4,7,-5,"string"]
obj = pd.Series(b)
print("obj value:", obj)

# NOTE(review): the label below says "List a", but the list being multiplied is `b`.
print("\nList a times 4:", b*4)    # for a list, the whole list is repeated 4 times
print("\nSeries obj times 4", obj*4) # for a Series, each number is multiplied by 4 and each string is repeated 4 times

obj value: 0         4
1         7
2        -5
3    string
dtype: object

List a times 4: [4, 7, -5, 'string', 4, 7, -5, 'string', 4, 7, -5, 'string', 4, 7, -5, 'string']

Series obj times 4 0                          16
1                          28
2                         -20
3    stringstringstringstring
dtype: object


### Values and Index in the Series 
A Series exposes its data through the `values` attribute and its labels through the `index` attribute.

In [29]:
# `values` exposes the underlying data as an array
print("Values of the series:", obj.values)
# `index` exposes the labels; with no explicit index this is a RangeIndex
print("Indexes of the series:", obj.index)  # like range(4)

Values of the series: [4 7 -5 'string']
Indexes of the series: RangeIndex(start=0, stop=4, step=1)


In [30]:
# Get the value of the series through the index

# With the default 0..3 labels, the label 1 refers to the second element.
print("the second value of the series", obj[1])

the second value of the series 7


### Changing the default index of a Series

In [37]:
# Passing `index=` replaces the default integer labels with custom ones.
objs = pd.Series(a, index = ['a','b','c','d'])
print(objs)
print('the index of the series:', objs.index)

a    4
b    7
c   -5
d    3
dtype: int64
the index of the series: Index(['a', 'b', 'c', 'd'], dtype='object')


In [38]:
# Look up a single value by its label.
print('The second item in the series:', objs['b'])

The second item in the series: 7


### Slicing 

Selecting with a Boolean mask, or with a list of labels:

In [39]:
# Boolean mask: keep only the entries whose value is greater than 0.
objs[objs > 0]

a    4
b    7
d    3
dtype: int64

In [58]:
# A list of labels selects several entries at once.
objs[['a','d']]

a    4
d    3
dtype: int64

Series is **Mutable**.

In [61]:
# Assigning to a label that does not exist yet adds a new entry to the Series in place.
objs['e'] = 5
objs

a    4
b    7
c   -5
d    3
e    5
dtype: int64

### Consider the Series as a fixed-length ordered dictionary.
If you pass a dictionary to the pandas Series constructor, the *keys* of the dictionary automatically become the *index* of the Series.

In [55]:
# The dict keys become the Series labels and the dict values become its data.
state = dict(aa=1, bb=2, cc=3, dd=4)
state_series = pd.Series(data=state)
state_series

aa    1
bb    2
cc    3
dd    4
dtype: int64

In [56]:
# change the index
# Labels requested here but absent from the dict ('ccc', 'ddd') get NaN,
# and dict keys that are not requested ('cc', 'dd') are left out of the result.
state_index = ['aa', 'bb', 'ccc', 'ddd']
state_series2 = pd.Series(state, index = state_index)
state_series2

aa     1.0
bb     2.0
ccc    NaN
ddd    NaN
dtype: float64

In [57]:
# Arithmetic aligns on labels; a label that is missing (or NaN) in either operand yields NaN.
state_series + state_series2

aa     2.0
bb     4.0
cc     NaN
ccc    NaN
dd     NaN
ddd    NaN
dtype: float64

In [54]:
# Assigning to `.index` relabels the Series in place; the values themselves are unchanged.
state_series.index = state_index
state_series

aa     1
bb     2
ccc    3
ddd    4
dtype: int64

## 2. DataFrame

Intuitively, a DataFrame is just like a spreadsheet: an ordered collection of columns, each of which can hold a different value type. <br>
The DataFrame can be thought of as a dictionary of Series that all share the same index.
* A DataFrame has both a **ROW** index and a **COLUMN** index.

In [90]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a column.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)  # no index is given, so the rows get the default labels 0..5
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


### DataFrame has the attributes of **columns** and **index**

We then can change the value of the columns and index.

In [91]:
# `columns` holds the column labels as an Index object.
frame_columns = frame.columns
frame_columns

Index(['state', 'year', 'pop'], dtype='object')

In [92]:
# `index` holds the row labels; for this frame it is the default RangeIndex.
frame_index = frame.index
frame_index

RangeIndex(start=0, stop=6, step=1)

### Index option changes the index

In [94]:
# `index=` supplies custom row labels in place of the default 0..5.
frame = pd.DataFrame(data, index = ['obs1', 'obs2', 'obs3', 'obs4', 'obs5', 'obs6'])  # change the default index
frame

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2


### Columns option changes the order of the columns
If you specify a sequence of columns, then the dataframe's columns will be arranged in that order.

In [96]:
# `columns=` lists the columns in the order they should appear in the frame.
frame = pd.DataFrame(data, columns = ['year', 'state', 'pop'], index = ['obs1', 'obs2', 'obs3', 'obs4', 'obs5', 'obs6'])  # the order of the column
frame

Unnamed: 0,year,state,pop
obs1,2000,Ohio,1.5
obs2,2001,Ohio,1.7
obs3,2002,Ohio,3.6
obs4,2001,Nevada,2.4
obs5,2002,Nevada,2.9
obs6,2003,Nevada,3.2


In [97]:
# When you pass a column name that does not exist in `data` ('gdp_pc' here),
# the column is still created and is filled with missing values.
frame = pd.DataFrame(
    data,
    columns = ['year', 'state', 'pop', 'gdp_pc'],
    index = ['obs1', 'obs2', 'obs3', 'obs4', 'obs5', 'obs6'],
)
frame

Unnamed: 0,year,state,pop,gdp_pc
obs1,2000,Ohio,1.5,
obs2,2001,Ohio,1.7,
obs3,2002,Ohio,3.6,
obs4,2001,Nevada,2.4,
obs5,2002,Nevada,2.9,
obs6,2003,Nevada,3.2,


### Choose multiple columns

In [98]:
# A list of column names returns a DataFrame containing just those columns.
frame[['state', 'pop']]

Unnamed: 0,state,pop
obs1,Ohio,1.5
obs2,Ohio,1.7
obs3,Ohio,3.6
obs4,Nevada,2.4
obs5,Nevada,2.9
obs6,Nevada,3.2


In [99]:
# A single column name returns that column as a Series.
frame['state']

obs1      Ohio
obs2      Ohio
obs3      Ohio
obs4    Nevada
obs5    Nevada
obs6    Nevada
Name: state, dtype: object

In [100]:
# Attribute-style access returns the same column as frame['state'] (compare the two outputs).
frame.state

obs1      Ohio
obs2      Ohio
obs3      Ohio
obs4    Nevada
obs5    Nevada
obs6    Nevada
Name: state, dtype: object

### Choose Rows with `iloc` (by position) and `loc` (by label)

In [101]:
# iloc selects by integer position: position 2 is the third row (labelled obs3).
frame.iloc[2]

year      2002
state     Ohio
pop        3.6
gdp_pc     NaN
Name: obs3, dtype: object

In [103]:
# Exercise: try frame.loc[2]
# `loc` looks rows up by label, and 2 is not one of the labels obs1..obs6, so expect a KeyError.

In [105]:
# loc selects by label: 'obs3' is the same row that iloc[2] returned above.
frame.loc['obs3']

year      2002
state     Ohio
pop        3.6
gdp_pc     NaN
Name: obs3, dtype: object

In [147]:
# Exercise: try frame.iloc['obs3']
#   `iloc` is position-based, so a string label is rejected with an error.
# Also try frame.loc['state']
#   'state' is a column name, not a row label, so expect a KeyError.

Select one or more specific cells by giving both row labels and column labels:

In [150]:
# loc[row_labels, column_labels]: both are given as lists, so the result is a DataFrame.
frame.loc[['obs1', 'obs3'], ['state']]

Unnamed: 0,state
obs1,Ohio
obs3,Ohio


### Modify the columns or cells

In [120]:
# Assign a scalar to a column; the value is broadcast to every row.
# Bracket assignment is used instead of `frame.gdp_pc = ...` because attribute
# assignment only updates a column that already exists; for a new name it would
# set a plain Python attribute on the object instead of creating a column.
frame['gdp_pc'] = "NR"
frame

Unnamed: 0,year,state,pop,gdp_pc
obs1,2000,Ohio,1.5,NR
obs2,2001,Ohio,1.7,NR
obs3,2002,Ohio,3.6,NR
obs4,2001,Nevada,2.4,NR
obs5,2002,Nevada,2.9,NR
obs6,2003,Nevada,3.2,NR


In [125]:
# Assign an array to a column; its length (6) must match the number of rows.
# Bracket assignment is used instead of `frame.gdp_pc = ...` because attribute
# assignment cannot create a column and is easy to confuse with setting an attribute.
frame['gdp_pc'] = np.arange(60000, 120000, 10000)
frame

Unnamed: 0,year,state,pop,gdp_pc
obs1,2000,Ohio,1.5,60000
obs2,2001,Ohio,1.7,70000
obs3,2002,Ohio,3.6,80000
obs4,2001,Nevada,2.4,90000
obs5,2002,Nevada,2.9,100000
obs6,2003,Nevada,3.2,110000


In [134]:
# If you assign a Series, it is realigned to the frame's index by label;
# rows with no matching label (obs4..obs6 here) receive missing values.
debt = pd.Series([1.0, 2.0, 3.0], index=['obs1', 'obs2', 'obs3'])
frame['debt'] = debt
frame

Unnamed: 0,year,state,pop,gdp_pc,debt
obs1,2000,Ohio,1.5,60000,1.0
obs2,2001,Ohio,1.7,70000,2.0
obs3,2002,Ohio,3.6,80000,3.0
obs4,2001,Nevada,2.4,90000,
obs5,2002,Nevada,2.9,100000,
obs6,2003,Nevada,3.2,110000,


In [145]:
# Add boolean columns computed from a comparison: True where `state` is 'Ohio'.
# NOTE(review): 'estern' looks like a misspelling of 'eastern'; it is removed again by the `del` in the next cell.
frame['estern'] = (frame['state'] == 'Ohio')
frame['eastern'] = (frame['state'] == 'Ohio')
frame

Unnamed: 0,year,state,pop,gdp_pc,debt,eastern,estern
obs1,2000,Ohio,1.5,60000,1.0,True,True
obs2,2001,Ohio,1.7,70000,2.0,True,True
obs3,2002,Ohio,3.6,80000,3.0,True,True
obs4,2001,Nevada,2.4,90000,,False,False
obs5,2002,Nevada,2.9,100000,,False,False
obs6,2003,Nevada,3.2,110000,,False,False


In [146]:
# `del` removes a column from the frame in place.
del frame['estern']
frame

Unnamed: 0,year,state,pop,gdp_pc,debt,eastern
obs1,2000,Ohio,1.5,60000,1.0,True
obs2,2001,Ohio,1.7,70000,2.0,True
obs3,2002,Ohio,3.6,80000,3.0,True
obs4,2001,Nevada,2.4,90000,,False
obs5,2002,Nevada,2.9,100000,,False
obs6,2003,Nevada,3.2,110000,,False


### Nested Dict in the construction
If a nested dict is passed to the DataFrame constructor, pandas interprets the outer dict keys as the columns and the inner keys as the row indices. <br>
This is consistent with the DataFrame being organized by column: it behaves like a dictionary of Series, one per column.

In [153]:
# Nested dict: the outer keys become columns and the inner keys become row labels.
# Years missing from one state's inner dict show up as NaN in that column.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
pop_data = pd.DataFrame(data=pop)
pop_data

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [154]:
# Transpose: `.T` swaps rows and columns, so the states become the row labels.
pop_data.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


## Index Object: The index object of a Series or DataFrame is immutable.

In [157]:
# Build an Index object directly; from a range, pandas returns a RangeIndex (see the output).
labels = pd.Index(range(5))
labels

RangeIndex(start=0, stop=5, step=1)

In [165]:
# Use the Index object built above as the index of a new Series.
obs = pd.Series(range(5), index = labels)
obs

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [166]:
# Same data, but the index is given as a plain range instead of a prebuilt Index object.
obs2 = pd.Series(range(5), index = range(5))
obs2

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [168]:
# Identity check: the Series holds the very same Index object that was passed in, not a copy.
obs.index is labels

True

### Series/DataFrame Essential Functionality

#### Reindexing method
Create a **new object** with the data conformed to a new index. It functions like **re-ordering** the data according to the new index. <br>
It does not change the original data.

In [5]:
# Float data with labels that are not in alphabetical order; used by the reindex example in the next cell.
# Note that `data` and `obj` are rebound here (earlier cells used them for other objects).
data = [4.5, 7.2,-5.3, 3.6]
index = ['d','b','a','c']
obj = pd.Series(data, index = index)
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [14]:
# The reindex method does not change the original data: the result of this call is discarded.
obj.reindex(['a','b','c','d','e'])
print('This is the obj:\n',obj)
print("The reindex() method does not change the original data.")
# It creates a new object; 'e' is not a label of obj, so it appears with NaN.
obj2 = obj.reindex(['a','b','c','d','e'])
print("This is the obj2:\n", obj2)
print("The method returns a new data object.")

This is the obj:
 d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
The reindex() method does not change the original data.
This is the obj2:
 a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
The method returns a new data object.


--

When reindexing, use the **ffill** (forward fill) or **bfill** (backward fill) method to fill in values for the newly introduced labels.

In [20]:
# Three colours labelled with the even integers 0, 2 and 4, leaving gaps to fill later.
obj3 = pd.Series(data=['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [22]:
# Without `method`, the labels missing from obj3 (1, 3, 5) become NaN.
obj4 = obj3.reindex(range(6))
print("Obj4 without ffill method: \n", obj4)

# ffill: each missing label takes the value of the closest preceding label.
obj4 = obj3.reindex(range(6), method ='ffill')
print("\nObj4 with ffill method: \n", obj4)

# bfill: each missing label takes the value of the closest following label;
# label 5 has nothing after it, so it stays NaN.
obj4 = obj3.reindex(range(6), method ='bfill')
print("\nObj4 with bfill method: \n", obj4)

Obj4 without ffill method: 
 0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object

Obj4 with ffill method: 
 0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

Obj4 with bfill method: 
 0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object


--

### Reindex Example on Dataframe

In [28]:
# 3x3 grid of the integers 0..8; the row labels skip 'b' on purpose of the reindex example that follows.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    columns=['Ohio', 'Texas', 'California'],
    index=['a', 'c', 'd'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [29]:
# reindex, the default is the row index
frame = frame.reindex(['a','b','c','d'])
frame

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [32]:
# reindex, index the column
frame = frame.reindex(columns = ['California', 'Texas', 'Ohio', 'Pennsyvinia'])
frame

Unnamed: 0,California,Texas,Ohio,Pennsyvinia
a,2.0,1.0,0.0,
b,,,,
c,5.0,4.0,3.0,
d,8.0,7.0,6.0,


### Dropping Entries from an Axis: the **drop()** Method
The drop() method returns a new data object; the original is left unchanged unless you assign the result back.

In [34]:
# Five floats 0.0..4.0 labelled 'a' through 'e'.
series = pd.Series(data=np.arange(5.0), index=list('abcde'))
series

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [35]:
# remove the obs indicated iwth index "c"
series.drop('c')
series

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [36]:
# Remove the observation with index "c" by rebinding `series` to the result of drop().
# NOTE(review): re-running this cell would try to drop "c" from a Series that no longer contains it.
series = series.drop('c')
series

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

-- <br>
Example on DataFrame

In [56]:
# 4x4 grid of the integers 0..15, with state names as row labels and number words as column labels.
frame = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)
frame

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [58]:
# axis=0 drops rows by label.
# `frame` is rebound to the result, so the following cells only see the Utah and New York rows.
frame = frame.drop(['Colorado', 'Ohio'], axis = 0)
frame

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [47]:
# axis=1 drops columns by label.
# NOTE(review): the saved output below still lists a Colorado row, although the previous cell
# drops Colorado; the execution counts (In [47] here vs In [58] above) suggest the output comes
# from an out-of-order run. Restart the kernel and run all cells to refresh it.
frame = frame.drop(['one', 'two'], axis = 1)
frame

Unnamed: 0,three,four
Colorado,6,7
Utah,10,11
New York,14,15


### Indexing, Selection & Filtering

In [59]:
# Rebuild the full 4x4 frame, because the drop() cells above rebound `frame` to a reduced version.
frame = pd.DataFrame(np.arange(16).reshape((4,4)), 
                    index = ['Ohio', 'Colorado', 'Utah', 'New York'], 
                    columns = ['one', 'two', 'three', 'four'])
frame

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [67]:
# Select columns with a list of names.
frame[['one', 'two']]

Unnamed: 0,one,two
Ohio,0,1
Colorado,4,5
Utah,8,9
New York,12,13


In [68]:
# The same selection with loc: `:` keeps every row, the list picks the columns.
frame.loc[:, ['one', 'two']]

Unnamed: 0,one,two
Ohio,0,1
Colorado,4,5
Utah,8,9
New York,12,13


--

In [76]:
# List selectors on both axes keep the result two-dimensional: a 1x1 DataFrame.
print(frame.loc[['Ohio'],['two']])
type(frame.loc[['Ohio'],['two']])

      two
Ohio    1


pandas.core.frame.DataFrame

In [77]:
# Scalar labels on both axes return the single cell value (a NumPy scalar) rather than a DataFrame.
print(frame.loc['Ohio','two'])
type(frame.loc['Ohio','two'])

1


numpy.int32

In [82]:
# Boolean mask built from one column: keep the rows where column 'two' is greater than 1.
frame[frame['two'] > 1]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [83]:
# Comparing the whole frame returns a frame of booleans with the same shape and labels.
frame > 5

Unnamed: 0,one,two,three,four
Ohio,False,False,False,False
Colorado,False,False,True,True
Utah,True,True,True,True
New York,True,True,True,True


In [85]:
# Assign through a boolean frame: every cell greater than 5 is overwritten with 0, in place.
frame[frame >5] = 0
frame

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,0,0
Utah,0,0,0,0
New York,0,0,0,0


In [92]:
# Positional slice with an exclusive end: only the row at position 1 (Colorado) is returned.
frame.iloc[1:2]

Unnamed: 0,one,two,three,four
Colorado,4,5,0,0


In [96]:
# A list of row labels passed to loc selects those rows with all of their columns.
frame.loc[['Colorado', 'Utah']]

Unnamed: 0,one,two,three,four
Colorado,4,5,0,0
Utah,0,0,0,0
