In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [2]:
# Series - one-dimensional array-like object containing an array of data and associated array of data labels, called its index
obj = Series([4,5,-7,3])
obj
# default index consisting of the intefers 0 through N-1, where N is the length of the data

0    4
1    5
2   -7
3    3
dtype: int64

In [3]:
# can get the array representation and index object the series via its values and index attributes
obj.values

array([ 4,  5, -7,  3])

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# can also create our own index values and their related values
obj2 = Series([4,7,-5,3], index = ['d','v','r','s'])
obj2

d    4
v    7
r   -5
s    3
dtype: int64

In [6]:
# and it's index
obj2.index

Index(['d', 'v', 'r', 's'], dtype='object')

In [7]:
# can use values in the index when selecting single values
obj2['s']

3

In [8]:
# also can change particular index value
obj2['v'] = 23
obj2

d     4
v    23
r    -5
s     3
dtype: int64

In [9]:
# can find particular selected index
obj2[['d','v']]

d     4
v    23
dtype: int64

In [10]:
# filtering with boolean array, scalar multiplication or applying math functions
obj2[obj2>0]

d     4
v    23
s     3
dtype: int64

In [11]:
obj2 * 2

d     8
v    46
r   -10
s     6
dtype: int64

In [12]:
np.exp(obj2) # exponential of each value of obj2

d    5.459815e+01
v    9.744803e+09
r    6.737947e-03
s    2.008554e+01
dtype: float64

In [13]:
# to find out whether this is in the series or not
'b' in obj2

False

In [14]:
'v' in obj2 

True

In [15]:
# Also can create a series from it by passing the dict
sdata = {'Ohio': 25000, 'Michigan': 45000, 'Utah':10000, 'Texas':6000}
obj3 = Series(sdata)
obj3

Michigan    45000
Ohio        25000
Texas        6000
Utah        10000
dtype: int64

In [16]:
# when only passing a dict, the index in the resulting Series will have the dict's keys in sorted order.
states = ['california','Michigan','Texas','Ohio']
obj4 = Series(sdata, index=states)
obj4

california        NaN
Michigan      45000.0
Texas          6000.0
Ohio          25000.0
dtype: float64

In [17]:
# 'isnull' or 'notnull' functions in pandas should be used to detect missing data
pd.isnull(obj4) # same as obj4.isnull()

california     True
Michigan      False
Texas         False
Ohio          False
dtype: bool

In [18]:
pd.notnull(obj4) 

california    False
Michigan       True
Texas          True
Ohio           True
dtype: bool

In [19]:
# Series will automatically aligns differently indexed data in arithmatic operations
obj3

Michigan    45000
Ohio        25000
Texas        6000
Utah        10000
dtype: int64

In [20]:
obj4

california        NaN
Michigan      45000.0
Texas          6000.0
Ohio          25000.0
dtype: float64

In [21]:
obj3 + obj4

Michigan      90000.0
Ohio          50000.0
Texas         12000.0
Utah              NaN
california        NaN
dtype: float64

In [22]:
# In Series itself and index have a name attribure
obj4.name = 'population'  # value column name
obj4.index.name = 'state' # index column name
obj4

state
california        NaN
Michigan      45000.0
Texas          6000.0
Ohio          25000.0
Name: population, dtype: float64

In [23]:
# Data Frames
data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
        'year' : [ 2000,2001,2002,2001,2002],
        'pop': [1.5,1.7,3.6,2.4,2.9]}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [24]:
# If we specify a sequence of columns, DataFrame's columns will be exactly what we pass
DataFrame(data, columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [25]:
# If we pass a column that isn't contained in data, it will appear with NA values.
frame2 = DataFrame(data, columns = ['year','state','pop','debt'],
                  index = ['one','two','three','four','five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [26]:
# to know only which index are used for columns
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [27]:
# DataFrame can be retrived as a Series
frame2['state'] # dictionary like

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [28]:
# or by attribute
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [29]:
# Rows can also be retrived by position or name
# here we are using 'ix' indexing
frame2.ix[3] #or frame2.ix['three']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


year       2001
state    Nevada
pop         2.4
debt        NaN
Name: four, dtype: object

In [30]:
# columns can be modified by assignment
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [31]:
# or
frame2['debt'] = np.arange(5.) # In arange, if you put only 5, it will show integer 
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


In [32]:
# value's length must match the length of the DataFrame. If not then it will automatically inserting  missing values in any holes
val = Series([-1.2,-1.5,-1.7], index=['two','three','five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,-1.5
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,-1.7


In [33]:
# Assigning a column that doesn't exist will create a new column
frame2['eastern']= frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,-1.5,True
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,-1.7,False


In [34]:
# 'del' keyworld will delete columns as with dict
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [35]:
# Another form of data is a nested dict of dicts format
pop = {'Nevada':{2001:2.4,2002:2.9},
       'Ohio': {2000:1.5,2001:1.7,2002:3.6}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [36]:
# To transpose the result
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [37]:
DataFrame(frame3, index = [2001,2002,2003]) # also can use 'pop' instead of frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [38]:
# Dicts of Series are treated much in the same way
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [39]:
# If a DataFrame's index and columns have their name attributes set, these will also be displayed.
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3  # we can also join these both statements by using ';'

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [40]:
# Like in series the 'values' attrubute returns the data contained in the data frame as 2D ndarray
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [41]:
# different dtype will also accomodate all of the columns
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, -1.5],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

In [42]:
# Index Objects
obj = Series(range(3), index = ['a','b','c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [43]:
obj

a    0
b    1
c    2
dtype: int64

#### Index objects are immutable and thus can't be modified by the user: (Won't execute)
index[1] = 'd'

In [44]:
# Just describe no of values are same as no of index
ind = pd.Index(np.arange(3))
obj2 = Series([1.5,-2.5,0],index = ind)
obj2.index is ind

True

In [45]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [46]:
#
'Ohio' in frame3.columns

True

In [47]:
2003 in frame3.index

False

In [48]:
# Essential Functionality
# Reindexing - create a new object with the data conformed to a new index.
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
obj = Series([4.5,7.2,-5.3,3.6], index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [49]:
# for reindex, introducing missing values if any index values were not already present
obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [50]:
# to fill value(e.g 0) rather than NaN
obj.reindex(['a','b','c','d','e'], fill_value = 0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [51]:
# METHOD - with using FFILL which fills tha values
# reindex method ( interpolation) options
# ffill or pad = fill(or carry) values forward
# bfill or backfill = fill(or carry) values backward
obj3 = Series(['blue', 'purple', 'yellow'], index=[0,2,5])
obj3.reindex(range(7), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    purple
5    yellow
6    yellow
dtype: object

In [52]:
# with DataFrame, "REINDEX" can alter either the (row) index, columns, or both.
frame = DataFrame(np.arange(9).reshape((3,3)), index=['a','c','d'], columns= ['Ohio','Texas','California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [53]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [54]:
# columns can be reindexed using the "columns" keyword
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [55]:
# reindexing can be done more succinctly by label-indexing with "ix"
frame.ix[['a','b','c','d'], states]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [56]:
# Dropping entries from an axis
obj = Series(np.arange(5.), index=['a','b','c','d','e'])
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [57]:
obj.drop(['d','c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [58]:
# with DataFrame, index values can be deleted from either axis
data = DataFrame(np.arange(16).reshape((4,4)), 
                 index = ['Ohio','Colorado','Utah','New York'], 
                 columns = ['one', 'two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [59]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [60]:
data.drop('two', axis = 1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [61]:
data.drop(['two', 'four'], axis = 1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [62]:
# Indexing, selection and filtering
obj = Series(np.arange(4.), index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [63]:
obj['b']

1.0

In [64]:
obj[1]

1.0

In [65]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [66]:
obj[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [67]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [68]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [69]:
# slicing with labels behaves differently than normal python slicing in that end data point is inclusive
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [70]:
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [71]:
data = DataFrame(np.arange(16).reshape((4,4)), 
                 index = ['Ohio','Colorado','Utah','New York'], 
                 columns = ['one', 'two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [72]:
# Now retriving one or more columns
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [73]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [74]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [75]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [76]:
# Can use in boolean dataframe
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [77]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [78]:
# "ix" = it enables to select a subset of the rows and columns from a DataFrame with NumPy 
data.ix['Colorado', ['two', 'three']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


two      5
three    6
Name: Colorado, dtype: int64

In [79]:
data.ix[['Colorado','Utah'], [3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [80]:
data.ix[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [81]:
data.ix[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [82]:
data.ix[data.three > 5, :3]  # data.three/four = rows which is > 5, :3 = upto three columns ( first rows and after comma columns)

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [83]:
# Arithmatic and data alignments
# when indexes adding together, if any index pairs are not the same result would be null and same pairs will do function
s1 = Series([7.3,-2.5,3.4,1.5], index=['a','c','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1], index=['a','c','e','f','g'])

In [84]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [85]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [86]:
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [87]:
# for DataFrame, alignment is performed on both  the rows and the columns
df1 = DataFrame(np.arange(9.).reshape((3,3)), columns=list('bcd'),
               index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4,3)), columns=list('bde'),
               index= ['Utah','Ohio','Texas','Oragon'])

In [88]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [89]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oragon,9.0,10.0,11.0


In [90]:
df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oragon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [91]:
# Arithmatic method with fill values
df1 = DataFrame(np.arange(12.).reshape((3,4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4,5)), columns=list('abcde'))

In [92]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [93]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [94]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [95]:
df2.add(df1,fill_value=0)   # add df1 to df2 and in df1 put value = 0 (either way both are same)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [96]:
df1.reindex(columns=df2.columns, fill_value=0) # In basic df1 DF add column of df2 with fillin value = 0

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [97]:
# Operations between DataFrame and Series
arr = np.arange(12.).reshape((3,4))
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [98]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [99]:
arr-arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [100]:
frame = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),
                 index = ['Utah','Ohio','Texas','Oregon'])
series = frame.ix[0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


In [101]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [102]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [103]:
frame-series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [104]:
from pandas import Series,DataFrame
series2 = Series(range(3), index=['b','e','f'])
series2

b    0
e    1
f    2
dtype: int64

In [105]:
frame+series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [106]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [107]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [108]:
frame.sub(series3,axis=0)

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [109]:
# Function application and mapping
frame = DataFrame(np.random.randn(4,3), columns=list('bde'),
                 index=['Utah','Ohio','Texas','Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-0.201902,-1.800828,-0.083156
Ohio,-0.582689,2.374714,-3.215339
Texas,0.546881,-0.638724,1.029821
Oregon,0.318273,-0.353171,0.035839


In [110]:
# For absolute values
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.201902,1.800828,0.083156
Ohio,0.582689,2.374714,3.215339
Texas,0.546881,0.638724,1.029821
Oregon,0.318273,0.353171,0.035839


In [111]:
# APPLY function
f = lambda x: x.max() - x.min()

In [112]:
frame.apply(f)  # it will take each columns max and min value to apply 

b    1.129569
d    4.175543
e    4.245161
dtype: float64

In [113]:
frame.apply(f, axis=1) # axis = 1 means row, if axis = 0 it will take column

Utah      1.717672
Ohio      5.590054
Texas     1.668545
Oregon    0.671444
dtype: float64

In [114]:
# function passed to "apply" need not return a scalar value, it can also return as scalar
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.582689,-1.800828,-3.215339
max,0.546881,2.374714,1.029821


In [115]:
# Wanted to compute a formatted string from each floating point value in "frame". use APPLYMAP
format = lambda x: '%.2f' % x
frame.applymap(format)                  # applymap for whole frame

Unnamed: 0,b,d,e
Utah,-0.2,-1.8,-0.08
Ohio,-0.58,2.37,-3.22
Texas,0.55,-0.64,1.03
Oregon,0.32,-0.35,0.04


In [116]:
# reason for the name "applymap" is that Series has 'map' method for applying an element-wise function
frame['e'].map(format)                # map for only mentioned Series

Utah      -0.08
Ohio      -3.22
Texas      1.03
Oregon     0.04
Name: e, dtype: object

In [117]:
# Sorting and ranking
# SORT INDEX returns a new, sorted object
obj = Series(range(4), index = ['d','b','a','c'])
obj.sort_index()

a    2
b    1
c    3
d    0
dtype: int64

In [118]:
# Sorting of DF
frame = DataFrame(np.arange(8).reshape((2,4)), index= ['three','one'],
                 columns=['d','b','a','c'])
frame

Unnamed: 0,d,b,a,c
three,0,1,2,3
one,4,5,6,7


In [119]:
frame.sort_index()     # this will do for row (default, need not to use axis=0)

Unnamed: 0,d,b,a,c
one,4,5,6,7
three,0,1,2,3


In [120]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,2,1,3,0
one,6,5,7,4


In [121]:
# Above data are sorted in Ascending order can change it
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,1,2
one,4,7,5,6


In [122]:
# To sort a Series by its value, use its ORDER 
obj = Series([4,7,-2,1])
obj

0    4
1    7
2   -2
3    1
dtype: int64

In [126]:
# Any missing values are sored to the end of the Series 
obj = Series([4,np.nan,7,np.nan,-3,2])
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [127]:
# In DF, may want to sort by the values in one or more columns. use "BY"
frame = DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [128]:
frame.sort_index(by='b')

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [129]:
# To sort by multiple columns, pass a LIST of names
frame.sort_index(by=['a','b'])

  


Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [130]:
# rank breaks ties by assigning each group the mean rank
obj = Series([7,-5,7,4,2,0,4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [131]:
# ranks can also be assigned according to the order they're observed in the data
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [132]:
# Naturally, can rank in descending order 
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [133]:
# DataFrame can compute ranks over the rows ot the columns
frame = DataFrame({'b': [4.3,7,-3,2], 'a':[0,1,0,1],
                  'c':[-2,5,8,-2.5]})
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [134]:
frame.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


In [135]:
# Axis indexes with duplicate values
obj = Series(range(5), index = ['a','a','b','c','c'])
obj

a    0
a    1
b    2
c    3
c    4
dtype: int64

In [136]:
# "is_unique" property can tell you whether its values are unique or not
obj.index.is_unique

False

In [137]:
obj['a']

a    0
a    1
dtype: int64

In [138]:
obj['b']

2

In [139]:
# Same logic in DataFrame
df = DataFrame(np.random.randn(4,3), index=['a','a','b','b'])
df

Unnamed: 0,0,1,2
a,0.130151,0.952332,-1.263113
a,-0.604965,-0.696657,-0.150491
b,-0.074325,1.434696,-0.413063
b,0.889225,-0.766861,0.342667


In [140]:
df[0]

a    0.130151
a   -0.604965
b   -0.074325
b    0.889225
Name: 0, dtype: float64

In [141]:
df.ix['b']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2
b,-0.074325,1.434696,-0.413063
b,0.889225,-0.766861,0.342667


In [142]:
# Summerizing and Computing Descriptive Statistics
# Reduction or summary statistics
df = DataFrame([[1.4,np.nan],[7.1,-4.5],
               [np.nan,np.nan],[0.75,-1.3]],
               index=['a','b','c','d'],
               columns=['one','two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [143]:
df.sum()           # by default column operation( it won't include NaN values)

one    9.25
two   -5.80
dtype: float64

In [144]:
df.sum(axis=1)   # for row operations(it won't include NaN values)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [145]:
# can consider NaN values by putting SKIPNA function
df.mean(axis=1, skipna=False)           

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [146]:
# "idxmax" and "idxmin" return indirect statistics like where maximum and minimum values are attain
df.idxmax()

one    b
two    d
dtype: object

In [147]:
# CUMSUM = accumulation (ascending summation)
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [148]:
# DESCRIBE = produce multiple summary statistics 
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [149]:
# For non-numerical data, "describe" prodeces alternate summary statistics
obj = Series(['a','a','b','c'] * 4)

In [150]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [151]:
# Uniques Values, Value counts and Membership
obj = Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [152]:
# for sorting unique values
uniques.sort()
uniques

array(['a', 'b', 'c', 'd'], dtype=object)

In [153]:
# VALUE_COUNTS computes a Series containing values 
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [154]:
# can also available as a top-level pandas method that can be used with any array or sequence
pd.value_counts(obj.values, sort=False)

b    2
a    3
d    1
c    3
dtype: int64

In [155]:
# "isin" = useful for filtering the data set down to a subset of values in a Series and DataFrame
mask = obj.isin(['b','c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [156]:
# for it's location
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [157]:
data = DataFrame({'Qu1': [1,3,4,3,4],
                  'Qu2': [2,3,1,2,3],
                  'Qu3': [1,5,2,4,4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [158]:
# if we may want to compute a histogram on multiple related columns in a dataframe for above DataFrame
result = data.apply(pd.value_counts).fillna(0)
result                                           

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [159]:
# Handling missing data
string_data = Series(['aardvark','artichoke',np.nan,'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [160]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [161]:
# Built-in function "None" is also treated as NA in object arrays
string_data[0] = None          # first value will be None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [162]:
# Filtering out Missing data
from numpy import nan as NA
data = Series([1,NA,3.5,NA,7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [163]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [164]:
# Could computed this by boolean indexing
data[data.notnull()]        # other way to drop NA

0    1.0
2    3.5
4    7.0
dtype: float64

In [165]:
# for DataFrame, 'dropna' will drop rows or columns which are all NA or just those containing any NAs
# in DF, "dropna" by default drops any row containig missing value
data = DataFrame([[1,6.5,3],[1,NA,NA],
                 [NA,NA,NA],[NA,6.5,3]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [166]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [167]:
# Passing how='all' will only drop rows that are all NA
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [168]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [169]:
data.dropna(axis=1, how='all')    # showing example that column will remove by passing axis=1

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [170]:
# suppose we want to keep only rows containing a certain number of observations.
df = DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,0.681143,1.707077,-1.500115
1,-0.779458,1.082851,-1.835123
2,0.945243,-0.421524,1.479021
3,1.668314,0.287062,-0.678329
4,-0.706704,-0.215925,-1.724845
5,0.674911,-0.270072,0.328236
6,-0.222429,-0.811017,-0.225761


In [171]:
df.ix[:4,1] = NA; df.ix[:2,2] = NA
df

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2
0,0.681143,,
1,-0.779458,,
2,0.945243,,
3,1.668314,,-0.678329
4,-0.706704,,-1.724845
5,0.674911,-0.270072,0.328236
6,-0.222429,-0.811017,-0.225761


In [172]:
# suppose we want to keep only certain number of observations. use "thresh" argument
df.dropna(thresh=3)

Unnamed: 0,0,1,2
5,0.674911,-0.270072,0.328236
6,-0.222429,-0.811017,-0.225761


In [173]:
# filling in missing data
df.fillna(0)

Unnamed: 0,0,1,2
0,0.681143,0.0,0.0
1,-0.779458,0.0,0.0
2,0.945243,0.0,0.0
3,1.668314,0.0,-0.678329
4,-0.706704,0.0,-1.724845
5,0.674911,-0.270072,0.328236
6,-0.222429,-0.811017,-0.225761


In [174]:
# can aso use different values
df.fillna({1: 0.5, 2: -1})

Unnamed: 0,0,1,2
0,0.681143,0.5,-1.0
1,-0.779458,0.5,-1.0
2,0.945243,0.5,-1.0
3,1.668314,0.5,-0.678329
4,-0.706704,0.5,-1.724845
5,0.674911,-0.270072,0.328236
6,-0.222429,-0.811017,-0.225761


In [175]:
# "fillna" returns a new object, but you can modify the existing object in place
# always returns a reference to the filled object
df.fillna(0, inplace = True)
df

Unnamed: 0,0,1,2
0,0.681143,0.0,0.0
1,-0.779458,0.0,0.0
2,0.945243,0.0,0.0
3,1.668314,0.0,-0.678329
4,-0.706704,0.0,-1.724845
5,0.674911,-0.270072,0.328236
6,-0.222429,-0.811017,-0.225761


In [176]:
# same interpolation methods available for reindexing 
df = DataFrame(np.random.randn(6,3))
df.ix[2:, 1] = NA; df.ix[4:,2] = NA
df

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,1,2
0,2.165952,2.107144,0.810513
1,-0.195389,-0.231342,0.154177
2,-0.518758,,-0.384526
3,0.835257,,-2.136183
4,-0.006309,,
5,0.496148,,


In [177]:
# use ffill method to fil same number
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,2.165952,2.107144,0.810513
1,-0.195389,-0.231342,0.154177
2,-0.518758,-0.231342,-0.384526
3,0.835257,-0.231342,-2.136183
4,-0.006309,-0.231342,-2.136183
5,0.496148,-0.231342,-2.136183


In [178]:
# to put some limit on ffill
df.fillna(method = 'ffill', limit=2)

Unnamed: 0,0,1,2
0,2.165952,2.107144,0.810513
1,-0.195389,-0.231342,0.154177
2,-0.518758,-0.231342,-0.384526
3,0.835257,-0.231342,-2.136183
4,-0.006309,,-2.136183
5,0.496148,,-2.136183


In [179]:
# with "fillna" can also pass the mean or median values of a  Series
data = Series([1,NA, 3.5,NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [180]:
# Hierarchical Indexing - enabling you to have multiple(two or more) index levels on an axis
data = Series(np.random.randn(10),
             index=[['a','a','a','b','b','b','c','c','d','d'],
                    [1,2,3,1,2,3,4,5,1,2]])
data

a  1   -1.281443
   2    0.739233
   3   -1.085855
b  1    2.482651
   2   -0.707192
   3    0.266310
c  4    0.109815
   5   -1.397819
d  1   -0.280349
   2    0.911621
dtype: float64

In [181]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3, 4, 5]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 3, 4, 0, 1]])

In [182]:
# For partially indexing is possible, enabling you to concisely select subsets of the data
data['b']

1    2.482651
2   -0.707192
3    0.266310
dtype: float64

In [183]:
data['b':'c'] #here is for 'b' to 'c'

b  1    2.482651
   2   -0.707192
   3    0.266310
c  4    0.109815
   5   -1.397819
dtype: float64

In [184]:
data.ix[['b', 'd']]     #here is for 'b' and 'd' that's why we took "ix" and double prenthesis

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


b  1    2.482651
   2   -0.707192
   3    0.266310
d  1   -0.280349
   2    0.911621
dtype: float64

In [185]:
# selection is possible from inner level
data[:,2]

a    0.739233
b   -0.707192
d    0.911621
dtype: float64

In [186]:
# This data can be rearranged into a DataFrame using "unstack"  (this option is temporary)
data.unstack()

Unnamed: 0,1,2,3,4,5
a,-1.281443,0.739233,-1.085855,,
b,2.482651,-0.707192,0.26631,,
c,,,,0.109815,-1.397819
d,-0.280349,0.911621,,,


In [187]:
# inverse operation of unstack is stack
data.unstack().stack()

a  1   -1.281443
   2    0.739233
   3   -1.085855
b  1    2.482651
   2   -0.707192
   3    0.266310
c  4    0.109815
   5   -1.397819
d  1   -0.280349
   2    0.911621
dtype: float64

In [188]:
# Also for DataFrame
frame = DataFrame(np.arange(12).reshape((4,3)), index = [['a','a','b','b'],[1,2,1,2]],
                  columns = [['Ohio', 'Ohio','Colorado'],['Green','Red','Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [189]:
# This hierarchical levels can have names
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state','color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [190]:
# can also do partial column indexing
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [191]:
frame.index

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['key1', 'key2'])

In [192]:
# Reordering and sorting levels
# "swaplevel" takes two level numbers or names and returns a new object with the levels interchanged
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [193]:
# "sortlevel" - sort the data only the values in a single level
frame.sortlevel(1)       # '0' for key 1

  


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [194]:
frame.swaplevel(0, 1).sortlevel(0)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [195]:
# Summay statistics by level
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [196]:
frame.sum(level='color', axis=1)   # summation of color(all green etc)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [197]:
# Using a DataFrame's Columns
frame = DataFrame({'a': range(7), 'b': range(7,0,-1),
                   'c': ['one','one','one','two','two','two','two'],
                   'd': [0,1,2,0,1,2,3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [198]:
# "set_index" function will create a new DataFrame using one or more of its columns as the index
frame2 = frame.set_index(['c','d'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [199]:
# we can also leave them in DataFrame
frame.set_index(['c','d'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [200]:
# "reset_index" is opposite of 'set_index'
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
