### Pandas is designed for working with tabular or heterogeneous data

In [2]:
import pandas as pd
import numpy as np

In [3]:
obj = pd.Series([4, 7, -5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Build a Series with explicit string labels instead of the default RangeIndex.
obj2 = pd.Series(
    data=[4, 7, -5, 3],
    index=list("dbac"),
)
# Show the Series and then its index, separated by one blank line.
print(obj2, obj2.index, sep="\n\n")

d    4
b    7
a   -5
c    3
dtype: int64

Index(['d', 'b', 'a', 'c'], dtype='object')


In [7]:
print(obj2['b'])
print(obj2['a'])

7
-5


In [8]:
obj2['d'] = 6  # Update an element in array
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [9]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [10]:
print(obj2[obj2 > 0])
print()
print(obj2 * 2)
print()
print(np.exp(obj2))

d    6
b    7
c    3
dtype: int64

d    12
b    14
a   -10
c     6
dtype: int64

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [11]:
# A dict maps straight onto a Series: keys become the index, values the data.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = pd.Series(sdata)
# Print the raw dict and the resulting Series with a blank line in between.
print(sdata, obj3, sep="\n\n")

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64


In [12]:
states = ['California' , 'Ohio' , 'Oregon' , 'Texas' ]
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [13]:
# detect null value
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [14]:
# detect non null value
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [15]:
print(obj3)
print()
print(obj4)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [16]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [17]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4


state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [18]:
# A Series's index can be altered in place by assignment
print(obj)
print()
obj.index = ['Bob', 'Steve', 'Mac', 'Ryan']
print(obj)

0    4
1    7
2   -5
3    3
dtype: int64

Bob      4
Steve    7
Mac     -5
Ryan     3
dtype: int64


In [19]:
# A DataFrame is a rectangular table: an ordered collection of columns, each of
# which may hold a different value type (numeric, string, boolean, ...).

data = {
    "state": ["Ohio"] * 3 + ["Nevada"] * 3,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
# A dict of equal-length lists becomes one column per key.
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [20]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [21]:
# u can specify column sequence
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [22]:
frame2 = pd. DataFrame(data, columns=['year' , 'state' , 'pop' , 'debt' ],
                       index=['one' , 'two' , 'three' , 'four' , 'five' , 'six' ])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [23]:
# fetch column names in list
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [24]:
# fetch index in list
frame2.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

In [25]:
# Select a single column by name (dict-style access); the result is a Series
frame2['state']    

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [26]:
# Select a single column by attribute access; the result is a Series
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [27]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [28]:
# Assign a scalar to the 'debt' column: the value is broadcast to every row.
# 'debt' already exists here (it was requested when frame2 was built), so this
# overwrites the column rather than adding a new one.
frame2['debt'] = 16.5  # attribute form 'frame2.debt = 16.5' also works here -- presumably only because the column already exists; verify
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [29]:
# Assigning a list or array to a column requires its length to match the frame's.
# A Series behaves differently: it is aligned on index labels, so the three
# values below land on rows 'two', 'four' and 'five' and every other row is NaN.

val = pd. Series([- 1.2, - 1.5, - 1.7], index=['two' , 'four' , 'five' ])
print(val)
frame2['debt'] = val  # replaces the existing 'debt' column
frame2

two    -1.2
four   -1.5
five   -1.7
dtype: float64


Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [30]:
# Adding boolean values to the frame

frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [31]:
# delete the column
del frame2['eastern']  # delete the 'eastern' column from frame2
print(frame2.columns)
print()
print(frame2)

Index(['year', 'state', 'pop', 'debt'], dtype='object')

       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7
six    2003  Nevada  3.2   NaN


In [32]:
# Nested dict of dicts: outer keys become the columns, inner keys the row index.
pop = {
    "Nevada": {2001: 2.4, 2002: 2.9},
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
print(pop)
# Years missing for a state (2000 for Nevada) show up as NaN.
frame3 = pd.DataFrame(data=pop)
frame3

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}


Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [33]:
frame3.T    # frame 3 transpose

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [34]:
pdata = {'Ohio' : frame3['Ohio'][: - 1],
         'Nevada' : frame3['Nevada'][: 2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [35]:
# Name both axes so the printed frame says what its rows and columns represent.
frame3.columns.name = "state"
frame3.index.name = "year"

In [36]:
print(frame3)
print()
print(frame3.values)

state  Nevada  Ohio
year               
2001      2.4   1.7
2002      2.9   3.6
2000      NaN   1.5

[[2.4 1.7]
 [2.9 3.6]
 [nan 1.5]]


In [37]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

In [38]:
# Index Objects
obj = pd. Series(range(5), index=['a' , 'b' , 'c' , 'd' , 'e' ])
obj

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [39]:
index = obj.index
index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [40]:
print(index[0])
print(index[1])
print(index[4])

a
b
e


In [41]:
print(index[0:])
print(index[2:])

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
Index(['c', 'd', 'e'], dtype='object')


In [42]:
print(index[:1])
print(index[:4])

Index(['a'], dtype='object')
Index(['a', 'b', 'c', 'd'], dtype='object')


In [43]:
### Essential Functionality
# Reindexing: build a new object whose data conforms to a new index.

obj = pd.Series(data=[4.5, 7.2, -5.3, 3.6], index=list("dbac"))
# Labels that did not exist before ('e') come back as NaN.
obj2 = obj.reindex(list("abcde"))
obj3 = pd.Series(data=["blue", "purple", "yellow"], index=[0, 2, 4])
print(obj, obj2, obj3, sep="\n")

# Forward-fill carries the previous value into each newly introduced label.
obj3.reindex(range(6), method="ffill")

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
0      blue
2    purple
4    yellow
dtype: object


0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [44]:
# 3x3 frame holding 0..8; row label 'b' is deliberately left out.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list("acd"),
    columns=["Ohio", "Texas", "California"],
)
print(frame)

# Reindexing the rows inserts 'b' as an all-NaN row.
frame2 = frame.reindex(index=list("abcd"))
frame2

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8


Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [45]:
states = ['Texas' , 'Utah' , 'California' ]
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [46]:
# Dropping entries from axis

obj = pd.Series(np.arange(5.), index=['a' , 'b' , 'c' , 'd' , 'e' ])
print(obj)

new_obj = obj.drop('c')
print(new_obj)

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64


In [47]:
obj.drop(['d' , 'c' ])

a    0.0
b    1.0
e    4.0
dtype: float64

In [48]:
# On a DataFrame, labels can be dropped from either axis (rows by default).
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)

print(data)
# A list of labels drops those rows; the result is a new frame.
data.drop(["Colorado", "Ohio"])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [49]:
print(data.drop('two' , axis=1))  # or use axis ='columns' | 1 for column, 0 for rows
print()
print(data.drop(['two' , 'four' ], axis='columns' ))

          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15

          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


In [50]:
# drop(..., inplace=True) removes the label from `obj` itself instead of
# returning a new object, so the dropped value is gone from this Series.
# NOTE(review): this mutates state that earlier cells displayed, and the cell is
# presumably not re-runnable (label 'c' no longer exists on a second run).
# The non-mutating form obj.drop('c'), used in the cells above, avoids both.
obj.drop('c' , inplace=True)
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [51]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [52]:
# Indexing, Selection & Filtering
obj = pd. Series(np. arange(4. ), index=['a' , 'b' , 'c' , 'd' ])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [53]:
# `obj` is labelled 'a'..'d', so integers can only mean positions here.
# Positional access goes through .iloc: plain obj[1] / obj[[1, 3]] relied on the
# deprecated integer-as-position fallback of Series.__getitem__.
print(obj['b'])       # by label
print(obj.iloc[1])    # by position (same element as 'b')
print()
print(obj.iloc[2:4])  # positional slice, end exclusive
print()
print(obj[['b', 'a', 'd']])  # list of labels, returned in the requested order
print()
print(obj.iloc[[1, 3]])      # list of positions
print()
print(obj[obj < 2])          # boolean mask

1.0
1.0

c    2.0
d    3.0
dtype: float64

b    1.0
a    0.0
d    3.0
dtype: float64

b    1.0
d    3.0
dtype: float64

a    0.0
b    1.0
dtype: float64


In [54]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [55]:
print(obj['b':'c'])
print()
obj['b':'c'] = 5  # Assign new values 
print(obj)

b    1.0
c    2.0
dtype: float64

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64


In [56]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                     index=['Ohio' , 'Colorado' , 'Utah' , 'New York' ],
                     columns=['one' , 'two' , 'three' , 'four' ])

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [57]:
print(data['three'])
print()
print(data[['three', 'one']])

Ohio         2
Colorado     6
Utah        10
New York    14
Name: three, dtype: int32

          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12


In [58]:
data[data['three' ] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [59]:
print(data)
print()
print(data < 5)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15

            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False


In [60]:
# Selection with iloc & loc
data.loc['Colorado' , ['two' , 'three' ]]


two      5
three    6
Name: Colorado, dtype: int32

In [61]:
print(data)
print()
print(data.iloc[2, [3, 0, 1]])
print()
print(data.iloc[2])
print()
print(data.iloc[[1, 2], [3, 0, 1]])
print()
print(data.loc[: 'Utah' , 'two' ])
print()
print(data.iloc[:, : 3][data.three > 5])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15

four    11
one      8
two      9
Name: Utah, dtype: int32

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

          four  one  two
Colorado     7    4    5
Utah        11    8    9

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

          one  two  three
Colorado    4    5      6
Utah        8    9     10
New York   12   13     14


In [62]:
# Integer Indexes
ser = pd.Series(np.arange(3.))
print(ser)
print()
ser2 = pd.Series(np.arange(3.), index=['a' , 'b' , 'c' ])
ser2

0    0.0
1    1.0
2    2.0
dtype: float64



a    0.0
b    1.0
c    2.0
dtype: float64

In [63]:
# ser2 has string labels, so -1 can only mean "last position": spell that out
# with .iloc instead of relying on the deprecated integer fallback of ser2[-1].
print(ser2.iloc[-1])
print()
# On the integer-indexed `ser`, the three slice spellings are not equivalent:
print(ser[:1])       # plain [] slice: positional, end exclusive
print()
print(ser.loc[:1])   # .loc: label-based, end inclusive
print()
print(ser.iloc[:1])  # .iloc: positional, end exclusive

2.0

0    0.0
dtype: float64

0    0.0
1    1.0
dtype: float64

0    0.0
dtype: float64


In [64]:
### Arithmetic & Data Alignment

# Two Series whose indexes overlap only partly ('a', 'c' and 'e' are shared).
s1 = pd.Series(data=[7.3, -2.5, 3.4, 1.5], index=list("acde"))
s2 = pd.Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=list("acefg"))
(s1, s2)

(a    7.3
 c   -2.5
 d    3.4
 e    1.5
 dtype: float64,
 a   -2.1
 c    3.6
 e   -1.5
 f    4.0
 g    3.1
 dtype: float64)

In [65]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [66]:
# Two DataFrames that overlap only partly on both rows and columns.
df1 = pd.DataFrame(
    data=np.arange(9.0).reshape(3, 3),
    index=["Ohio", "Texas", "Colorado"],
    columns=["b", "c", "d"],
)
df2 = pd.DataFrame(
    data=np.arange(12.0).reshape(4, 3),
    index=["Utah", "Ohio", "Texas", "Oregon"],
    columns=["b", "d", "e"],
)

(df1, df2)

(            b    c    d
 Ohio      0.0  1.0  2.0
 Texas     3.0  4.0  5.0
 Colorado  6.0  7.0  8.0,
           b     d     e
 Utah    0.0   1.0   2.0
 Ohio    3.0   4.0   5.0
 Texas   6.0   7.0   8.0
 Oregon  9.0  10.0  11.0)

In [67]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [68]:
# df1 is 3x4 (columns a-d); df2 is 4x5 (columns a-e): one extra row and column.
df1 = pd.DataFrame(
    data=np.arange(12.0).reshape(3, 4),
    columns=["a", "b", "c", "d"],
)
df2 = pd.DataFrame(
    data=np.arange(20.0).reshape(4, 5),
    columns=["a", "b", "c", "d", "e"],
)

(df1, df2)
    


(     a    b     c     d
 0  0.0  1.0   2.0   3.0
 1  4.0  5.0   6.0   7.0
 2  8.0  9.0  10.0  11.0,
       a     b     c     d     e
 0   0.0   1.0   2.0   3.0   4.0
 1   5.0   6.0   7.0   8.0   9.0
 2  10.0  11.0  12.0  13.0  14.0
 3  15.0  16.0  17.0  18.0  19.0)

In [69]:
df2.loc[1, 'b' ] = np.nan
print(df2)
print()
print(df1 + df2)

      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0

      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN


In [70]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [71]:
1/df1, df1.rdiv(1)   # both are same

(       a         b         c         d
 0    inf  1.000000  0.500000  0.333333
 1  0.250  0.200000  0.166667  0.142857
 2  0.125  0.111111  0.100000  0.090909,
        a         b         c         d
 0    inf  1.000000  0.500000  0.333333
 1  0.250  0.200000  0.166667  0.142857
 2  0.125  0.111111  0.100000  0.090909)

In [72]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [73]:
# Operation between Dataframe & Series

arr = np.arange(12).reshape((3,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [74]:
arr[0]

array([0, 1, 2, 3])

In [75]:
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [76]:
# 4x3 float frame holding 0.0..11.0, one row per state.
frame = pd.DataFrame(
    data=np.arange(12.0).reshape(4, 3),
    index=["Utah", "Ohio", "Texas", "Oregon"],
    columns=["b", "d", "e"],
)
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [77]:
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [78]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [79]:
series2 = pd.Series(range(3), index=['b' , 'e' , 'f' ])
print(series2)

frame + series2

b    0
e    1
f    2
dtype: int64


Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [80]:
print(frame)
series3 = frame['d']
series3

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [81]:
frame.sub(series3, axis='index' )

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [82]:
# Function application & mapping

# Draw the demo values from a seeded generator so that Restart & Run All
# reproduces the same frame, and therefore the same outputs in the cells that
# use it. The unseeded np.random.randn gave different numbers on every run.
# RandomState is a private generator: the global NumPy random state is untouched.
frame = pd.DataFrame(
    np.random.RandomState(0).randn(4, 3),
    columns=list("bde"),
    index=["Utah", "Ohio", "Texas", "Oregon"],
)
frame

Unnamed: 0,b,d,e
Utah,-0.203481,2.78944,1.59306
Ohio,0.630506,1.22279,-0.14159
Texas,0.371237,-2.011284,0.281664
Oregon,0.05918,1.198876,2.578235


In [83]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.203481,2.78944,1.59306
Ohio,0.630506,1.22279,0.14159
Texas,0.371237,2.011284,0.281664
Oregon,0.05918,1.198876,2.578235


In [84]:
def f(x):
    """Return the spread of a Series: its maximum minus its minimum."""
    return x.max() - x.min()
print(frame)

frame.apply(f)

               b         d         e
Utah   -0.203481  2.789440  1.593060
Ohio    0.630506  1.222790 -0.141590
Texas   0.371237 -2.011284  0.281664
Oregon  0.059180  1.198876  2.578235


b    0.833987
d    4.800724
e    2.719825
dtype: float64

In [85]:
frame.apply(f, axis='columns' )

Utah      2.992921
Ohio      1.364381
Texas     2.382521
Oregon    2.519055
dtype: float64

In [86]:
# Rebinds `f` (previously the max-minus-min helper) to a two-value summary.
def f(x):
    """Summarise a Series as a two-entry Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return pd.Series(data=[lowest, highest], index=["min", "max"])
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.203481,-2.011284,-0.14159
max,0.630506,2.78944,2.578235


In [87]:
# Render a number as text with two decimals (the pattern starts with a space).
# NOTE(review): the name `format` shadows the Python builtin for the rest of the
# session, and a later cell reuses it -- rename both together if at all.
format = lambda x: ' %.2f' % x
# Apply the formatter to every element of the frame.
# NOTE(review): recent pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map -- confirm against the pandas version this notebook targets.
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.2,2.79,1.59
Ohio,0.63,1.22,-0.14
Texas,0.37,-2.01,0.28
Oregon,0.06,1.2,2.58


In [88]:
frame['e'].map(format)

Utah        1.59
Ohio       -0.14
Texas       0.28
Oregon      2.58
Name: e, dtype: object

In [89]:
# Sorting & Ranking

obj = pd.Series(range(4), index=['d' , 'a' , 'b' , 'c' ])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [90]:
# Row labels and column labels are both deliberately out of order.
frame = pd.DataFrame(
    data=np.arange(8).reshape(2, 4),
    index=["three", "one"],
    columns=list("dabc"),
)
# sort_index() orders the row labels ascending by default and returns a new frame.
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [91]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [92]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [93]:
obj = pd.Series([4, 7, - 3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [94]:
obj = pd.Series([4, np. nan, 7, np. nan, - 3, 2])
obj.sort_values()  # missing values sorted to the end of the series

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [95]:
frame = pd.DataFrame({'b' : [4, 7, - 3, 2], 'a' : [0, 1, 0, 1]})
print(frame)
frame.sort_values(by='b')  # Sorting dataframe using column

   b  a
0  4  0
1  7  1
2 -3  0
3  2  1


Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [96]:
print(frame)
frame.sort_values(by=['a' , 'b'])

   b  a
0  4  0
1  7  1
2 -3  0
3  2  1


Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [97]:
# Values with ties (7 and 4 each appear twice) to show how ranks are shared.
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])
# Default ranking is ascending; tied values receive the mean of their ranks.
print(obj, obj.rank(), sep="\n")

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64


In [None]:
obj. rank(method='first' )

In [None]:
obj.rank(ascending=False, method='max' )  # rank in descending order

In [None]:
# Small numeric frame with three columns for ranking across each row.
frame = pd.DataFrame(
    {
        "b": [4.3, 7, -3, 2],
        "a": [0, 1, 0, 1],
        "c": [-2, 5, 8, -2.5],
    }
)
# Show the frame, the within-row ranks, and the per-column maxima,
# each separated by a blank line.
print(frame, frame.rank(axis="columns"), frame.max(), sep="\n\n")


In [None]:
# Axis indexes with duplicate labels

obj = pd.Series(range(5), index=['a' , 'a' , 'b' , 'b' , 'c' ])
obj

In [None]:
obj.index.is_unique

In [None]:
obj['b'] 

In [None]:
obj['c']

In [None]:
# A seeded, private generator keeps this demo reproducible across
# Restart & Run All; the unseeded np.random.randn changed values on every run.
df = pd.DataFrame(
    np.random.RandomState(0).randn(4, 3),
    index=["a", "a", "b", "b"],
)
print(df)
# With duplicate row labels, selecting 'b' yields every row carrying that label.
df.loc["b"]

In [None]:
# Computing descriptive statistics

df = pd.DataFrame([[1.4, np.nan], [7.1, - 4.5],
                   [np.nan, np.nan], [0.75, - 1.3]],
                   index=['a' , 'b' , 'c' , 'd' ],
                   columns=['one' , 'two' ])
df

In [None]:
df.sum()

In [None]:
df.sum(axis='columns')

In [None]:
df.mean(axis='columns' , skipna=False)

In [None]:
print(df.idxmax())
print()
print(df.cumsum()) 
print()
print(df.describe())
print()
print(df.count())
print()
print(df.mean())

In [None]:
# Correlation & Covariance

# NOTE(review): third-party import placed mid-notebook rather than in the top
# import cell, and the next statement presumably needs network access on every
# run -- consider moving the import up and caching the downloaded data.
import pandas_datareader.data as web
# One downloaded frame per ticker symbol, keyed by ticker.
all_data = {ticker: web. get_data_yahoo(ticker)
            for ticker in ['AAPL' , 'IBM' , 'MSFT' , 'GOOG' ]}

# One column per ticker, taken from each frame's 'Adj Close' column.
# The comprehension variable `data` is local to the comprehension, so the
# `data` object defined in earlier cells is not overwritten.
price = pd.DataFrame({ticker: data['Adj Close']
                       for ticker, data in all_data. items()})
# Same layout, built from each frame's 'Volume' column.
volume = pd.DataFrame({ticker: data['Volume']
                        for ticker, data in all_data. items()})
