# Pandas Essential Functionalities

## How To Reindex Pandas Objects

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.Series.reindex?

In [3]:
ob = pd.Series([1, 2, 3, 6], index=['d', 'b', 'a', 'c'])
ob

d    1
b    2
a    3
c    6
dtype: int64

In [4]:
ob2 = ob.reindex(index=['a', 'b', 'c', 'd'])
ob2

a    3
b    2
c    6
d    1
dtype: int64

In [5]:
ob3 = ob.reindex(index=['a', 'b', 'c', 'd', 'e'])
ob3

a    3.0
b    2.0
c    6.0
d    1.0
e    NaN
dtype: float64

In [6]:
ob4 = pd.Series([1, 2, 3], index = [0, 1, 2])
ob4

0    1
1    2
2    3
dtype: int64

In [7]:
ob4.reindex(index=np.arange(6))

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    NaN
dtype: float64

In [8]:
ob4.reindex(index=np.arange(6), method = 'ffill')

0    1
1    2
2    3
3    3
4    3
5    3
dtype: int64

In [9]:
ob5 = pd.DataFrame(np.arange(9).reshape((3, 3)), 
                  index=['a', 'c', 'd'], columns=['Andhra', 'Tamilnadu', 'Kerala'])
ob5

Unnamed: 0,Andhra,Tamilnadu,Kerala
a,0,1,2
c,3,4,5
d,6,7,8


In [10]:
pd.DataFrame.reindex?

In [11]:
ob6 = ob5.reindex(index=['a', 'b', 'c', 'd'])
ob6

Unnamed: 0,Andhra,Tamilnadu,Kerala
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [12]:
capitals = ['Andhra', 'Telangana', 'Tamilnadu', 'Kerala']
print(ob5)
ob5.reindex(columns=capitals)

   Andhra  Tamilnadu  Kerala
a       0          1       2
c       3          4       5
d       6          7       8


Unnamed: 0,Andhra,Telangana,Tamilnadu,Kerala
a,0,,1,2
c,3,,4,5
d,6,,7,8


In [13]:
ob5.loc[['a', 'b', 'c', 'd']]
ob5

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,Andhra,Tamilnadu,Kerala
a,0,1,2
c,3,4,5
d,6,7,8


## Droping Entries From an Axis

In [14]:
import pandas as pd
import numpy as np

In [15]:
pd.Series.drop?

In [16]:
pd.DataFrame.drop?

In [17]:
data = pd.Series(np.arange(6), index=['a', 'b', 'c', 'd', 'e', 'f'])
data

a    0
b    1
c    2
d    3
e    4
f    5
dtype: int32

In [18]:
data.drop('a')

b    1
c    2
d    3
e    4
f    5
dtype: int32

In [19]:
n_data = data.drop('a')

In [20]:
data.drop(['a', 'd'])

b    1
c    2
e    4
f    5
dtype: int32

In [21]:
dataframe = pd.DataFrame(np.arange(16).reshape((4, 4)), 
                  index=['a', 'b', 'd', 'e'], columns=['Karnataka', 'Andhra', 'Tamilnadu', 'Kerala'])
dataframe

Unnamed: 0,Karnataka,Andhra,Tamilnadu,Kerala
a,0,1,2,3
b,4,5,6,7
d,8,9,10,11
e,12,13,14,15


In [22]:
dataframe.drop(['a', 'e'])

Unnamed: 0,Karnataka,Andhra,Tamilnadu,Kerala
b,4,5,6,7
d,8,9,10,11


In [23]:
dataframe.drop('Kerala', axis=1)

Unnamed: 0,Karnataka,Andhra,Tamilnadu
a,0,1,2
b,4,5,6
d,8,9,10
e,12,13,14


In [24]:
dataframe.drop(['Kerala', 'Andhra'], axis=1)

Unnamed: 0,Karnataka,Tamilnadu
a,0,2
b,4,6
d,8,10
e,12,14


In [25]:
dataframe.drop(['Kerala', 'Tamilnadu'], axis='columns')

Unnamed: 0,Karnataka,Andhra
a,0,1
b,4,5
d,8,9
e,12,13


In [26]:
dataframe

Unnamed: 0,Karnataka,Andhra,Tamilnadu,Kerala
a,0,1,2,3
b,4,5,6,7
d,8,9,10,11
e,12,13,14,15


In [27]:
dataframe.drop(['Kerala', 'Andhra'], axis=1, inplace=True)

In [28]:
dataframe

Unnamed: 0,Karnataka,Tamilnadu
a,0,2
b,4,6
d,8,10
e,12,14


In [29]:
dataframe

Unnamed: 0,Karnataka,Tamilnadu
a,0,2
b,4,6
d,8,10
e,12,14


## Arithmetic and Data Alignment

In [30]:
import pandas as pd 
import numpy as np

In [31]:
ser1 = pd.Series([7, 5, 4, 1], index=['a', 'c', 'd', 'e'])
ser1

a    7
c    5
d    4
e    1
dtype: int64

In [32]:
ser2 = pd.Series([7, 5, 4, 1, 3], index=['a', 'c', 'e', 'f', 'g'])
ser2

a    7
c    5
e    4
f    1
g    3
dtype: int64

In [33]:
ser1 + ser2

a    14.0
c    10.0
d     NaN
e     5.0
f     NaN
g     NaN
dtype: float64

In [34]:
df1 = pd.DataFrame(np.arange(9).reshape((3, 3)), 
                  columns=['a', 'c', 'd'], index=['Andhra', 'Tamilnadu', 'Kerala'])
df1

Unnamed: 0,a,c,d
Andhra,0,1,2
Tamilnadu,3,4,5
Kerala,6,7,8


In [35]:
df2 = pd.DataFrame(np.arange(16).reshape((4, 4)), 
                  columns=['a', 'b', 'd', 'e'], index=['Karnataka', 'Andhra', 'Tamilnadu', 'Kerala'])
df2

Unnamed: 0,a,b,d,e
Karnataka,0,1,2,3
Andhra,4,5,6,7
Tamilnadu,8,9,10,11
Kerala,12,13,14,15


In [36]:
print('df1:'); print(df1)
print('df2:'); print(df2)
df1 + df2

df1:
           a  c  d
Andhra     0  1  2
Tamilnadu  3  4  5
Kerala     6  7  8
df2:
            a   b   d   e
Karnataka   0   1   2   3
Andhra      4   5   6   7
Tamilnadu   8   9  10  11
Kerala     12  13  14  15


Unnamed: 0,a,b,c,d,e
Andhra,4.0,,,8.0,
Karnataka,,,,,
Kerala,18.0,,,22.0,
Tamilnadu,11.0,,,15.0,


In [37]:
df3 = pd.DataFrame({'A': [1, 2]})
df3

Unnamed: 0,A
0,1
1,2


In [38]:
df4 = pd.DataFrame({'B': [3, 4]})
df4

Unnamed: 0,B
0,3
1,4


In [39]:
df3 + df4

Unnamed: 0,A,B
0,,
1,,


In [40]:
df3 - df4

Unnamed: 0,A,B
0,,
1,,


## Arithmetic methods with fill values

In [43]:
import pandas as pd
import numpy as np

In [44]:
df5 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=list('abcd'))
df5

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [45]:
df6 = pd.DataFrame(np.arange(20).reshape((4, 5)), columns=list('abcde'))
df6

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [46]:
df6.loc[1, 'c'] = np.nan
df6

Unnamed: 0,a,b,c,d,e
0,0,1,2.0,3,4
1,5,6,,8,9
2,10,11,12.0,13,14
3,15,16,17.0,18,19


In [47]:
print('df5:'); print(df5);
print('df6:'); print(df6)
df5 + df6

df5:
   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
df6:
    a   b     c   d   e
0   0   1   2.0   3   4
1   5   6   NaN   8   9
2  10  11  12.0  13  14
3  15  16  17.0  18  19


Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [48]:
pd.DataFrame.add?

In [49]:
print('df5:'); print(df5)
print('df6:'); print(df6)
df5.add(df6, fill_value=0)

df5:
   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
df6:
    a   b     c   d   e
0   0   1   2.0   3   4
1   5   6   NaN   8   9
2  10  11  12.0  13  14
3  15  16  17.0  18  19


Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,6.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [50]:
print('Before addition')

print(type(df5.loc[1, 'd'])); print(type(df6.loc[1, 'd'])); print()

print("After addition")
print(type(df5.add(df6, fill_value=0).loc[1, 'd']))

Before addition
<class 'numpy.int32'>
<class 'numpy.int32'>

After addition
<class 'numpy.float64'>


In [51]:
print('df5:'); print(df5)
print('df6:'); print(df6)
df5.add(df6, fill_value=10)

df5:
   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
df6:
    a   b     c   d   e
0   0   1   2.0   3   4
1   5   6   NaN   8   9
2  10  11  12.0  13  14
3  15  16  17.0  18  19


Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,14.0
1,9.0,11.0,16.0,15.0,19.0
2,18.0,20.0,22.0,24.0,24.0
3,25.0,26.0,27.0,28.0,29.0


In [52]:
print(df5)
# scalar division
1/df5

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [53]:
print(df5)
df5 * 2

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


Unnamed: 0,a,b,c,d
0,0,2,4,6
1,8,10,12,14
2,16,18,20,22


In [54]:
print(df5)
df5 - 3

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


Unnamed: 0,a,b,c,d
0,-3,-2,-1,0
1,1,2,3,4
2,5,6,7,8


In [55]:
print(df5)
df5.rdiv(1)

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [56]:
print(df5)
df5.rmul(2)

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


Unnamed: 0,a,b,c,d
0,0,2,4,6
1,8,10,12,14
2,16,18,20,22


In [57]:
print(df5)
df5.rpow(2)

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


Unnamed: 0,a,b,c,d
0,1,2,4,8
1,16,32,64,128
2,256,512,1024,2048


In [58]:
print(df5)
df5.radd(10)

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


Unnamed: 0,a,b,c,d
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21


## Operations Between DataFrame and Series

In [59]:
import pandas as pd
import numpy as np

In [60]:
df7 = pd.DataFrame(np.arange(12.).reshape((4, 3)),  
                  columns=list('bde'), index=['One', 'Two', 'Three', 'Four'])
df7

Unnamed: 0,b,d,e
One,0.0,1.0,2.0
Two,3.0,4.0,5.0
Three,6.0,7.0,8.0
Four,9.0,10.0,11.0


In [61]:
df7_ser = df7.iloc[0]
print(df7_ser); 
print(type(df7_ser))

b    0.0
d    1.0
e    2.0
Name: One, dtype: float64
<class 'pandas.core.series.Series'>


In [62]:
print(df7); print(df7_ser)
df7 - df7_ser

         b     d     e
One    0.0   1.0   2.0
Two    3.0   4.0   5.0
Three  6.0   7.0   8.0
Four   9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: One, dtype: float64


Unnamed: 0,b,d,e
One,0.0,0.0,0.0
Two,3.0,3.0,3.0
Three,6.0,6.0,6.0
Four,9.0,9.0,9.0


In [63]:
df7

Unnamed: 0,b,d,e
One,0.0,1.0,2.0
Two,3.0,4.0,5.0
Three,6.0,7.0,8.0
Four,9.0,10.0,11.0


In [64]:
ser2 = pd.Series(range(3), index=['b', 'e', 'f'])
ser2

b    0
e    1
f    2
dtype: int64

In [65]:
df7 + ser2

Unnamed: 0,b,d,e,f
One,0.0,,3.0,
Two,3.0,,6.0,
Three,6.0,,9.0,
Four,9.0,,12.0,


In [66]:
df7

Unnamed: 0,b,d,e
One,0.0,1.0,2.0
Two,3.0,4.0,5.0
Three,6.0,7.0,8.0
Four,9.0,10.0,11.0


In [67]:
df7_col = df7['b']
df7_col

One      0.0
Two      3.0
Three    6.0
Four     9.0
Name: b, dtype: float64

In [68]:
df7.sub(df7_col, axis='index')

Unnamed: 0,b,d,e
One,0.0,1.0,2.0
Two,0.0,1.0,2.0
Three,0.0,1.0,2.0
Four,0.0,1.0,2.0


## Function Application and Mapping

In [70]:
import pandas as pd
import numpy as np

In [71]:
pd.DataFrame.apply?

In [72]:
pd.DataFrame.applymap?

In [73]:
df8 = pd.DataFrame(np.random.randn(4, 3),  
                  columns=list('bde'), index=['One', 'Two', 'Three', 'Four'])
df8

Unnamed: 0,b,d,e
One,1.018806,0.831351,-1.660667
Two,0.692427,-0.960694,-0.374481
Three,-0.081594,-2.17142,0.104694
Four,-0.538068,1.739137,-1.769085


In [74]:
abs(df8)

Unnamed: 0,b,d,e
One,1.018806,0.831351,1.660667
Two,0.692427,0.960694,0.374481
Three,0.081594,2.17142,0.104694
Four,0.538068,1.739137,1.769085


In [75]:
f = lambda x: x.max()
print(df8)
df8.apply(f)

              b         d         e
One    1.018806  0.831351 -1.660667
Two    0.692427 -0.960694 -0.374481
Three -0.081594 -2.171420  0.104694
Four  -0.538068  1.739137 -1.769085


b    1.018806
d    1.739137
e    0.104694
dtype: float64

In [76]:
f = lambda x: x.min()
df8.apply(f)

b   -0.538068
d   -2.171420
e   -1.769085
dtype: float64

In [77]:
f = lambda x: x.max() - x.min()
df8.apply(f)

b    1.556875
d    3.910558
e    1.873780
dtype: float64

In [78]:
f = lambda x: x.max()
print(df8)
df8.apply(f, axis='columns')

              b         d         e
One    1.018806  0.831351 -1.660667
Two    0.692427 -0.960694 -0.374481
Three -0.081594 -2.171420  0.104694
Four  -0.538068  1.739137 -1.769085


One      1.018806
Two      0.692427
Three    0.104694
Four     1.739137
dtype: float64

In [79]:
f = lambda x: x.max() - x.min()
df8.apply(f, axis='columns')

One      2.679473
Two      1.653121
Three    2.276115
Four     3.508223
dtype: float64

In [80]:
def f(x): 
    return pd.Series([x.max(), x.min(), x.mean()], index=['max', 'min', 'mean'])
print(df8)
df8.apply(f)

              b         d         e
One    1.018806  0.831351 -1.660667
Two    0.692427 -0.960694 -0.374481
Three -0.081594 -2.171420  0.104694
Four  -0.538068  1.739137 -1.769085


Unnamed: 0,b,d,e
max,1.018806,1.739137,0.104694
min,-0.538068,-2.17142,-1.769085
mean,0.272893,-0.140407,-0.924885


In [81]:
f = lambda x: '%.3f' %x
df8.applymap(f)

Unnamed: 0,b,d,e
One,1.019,0.831,-1.661
Two,0.692,-0.961,-0.374
Three,-0.082,-2.171,0.105
Four,-0.538,1.739,-1.769


## Sorting and Ranking

In [82]:
import pandas as pd
import numpy as np

In [83]:
pd.DataFrame.sort_index?

In [84]:
pd.DataFrame.rank?

In [85]:
series = pd.Series(range(6), index=['d', 'a', 'b', 'c', 'f', 'g'])
series

d    0
a    1
b    2
c    3
f    4
g    5
dtype: int64

In [86]:
series.sort_index(axis=0, level=None, ascending=True)

a    1
b    2
c    3
d    0
f    4
g    5
dtype: int64

In [87]:
df9 = pd.DataFrame(np.random.randn(4, 5),  
                   columns=list('bdeac'), index=['1', '3', '2', '4'])
df9

Unnamed: 0,b,d,e,a,c
1,0.654621,-0.527915,0.000468,-1.312729,-0.52446
3,-1.435226,0.148216,-0.730586,-1.008752,0.611672
2,-0.064169,0.819534,0.315891,-0.343058,-2.40192
4,0.427173,-0.187428,-0.355605,-1.803791,0.846477


In [88]:
df9.sort_index(axis=1, level=None, ascending=True)

Unnamed: 0,a,b,c,d,e
1,-1.312729,0.654621,-0.52446,-0.527915,0.000468
3,-1.008752,-1.435226,0.611672,0.148216,-0.730586
2,-0.343058,-0.064169,-2.40192,0.819534,0.315891
4,-1.803791,0.427173,0.846477,-0.187428,-0.355605


In [89]:
df9.sort_index(axis=1, level=None, ascending=False)

Unnamed: 0,e,d,c,b,a
1,0.000468,-0.527915,-0.52446,0.654621,-1.312729
3,-0.730586,0.148216,0.611672,-1.435226,-1.008752
2,0.315891,0.819534,-2.40192,-0.064169,-0.343058
4,-0.355605,-0.187428,0.846477,0.427173,-1.803791


In [90]:
df9.sort_index(axis=0, level=None, ascending=True)

Unnamed: 0,b,d,e,a,c
1,0.654621,-0.527915,0.000468,-1.312729,-0.52446
2,-0.064169,0.819534,0.315891,-0.343058,-2.40192
3,-1.435226,0.148216,-0.730586,-1.008752,0.611672
4,0.427173,-0.187428,-0.355605,-1.803791,0.846477


In [91]:
df10 =pd.DataFrame(np.arange(12).reshape((3, 4)), 
                   index=['1', '3', '2'], 
                   columns=['d', 'a', 'b', 'c'])
df10

Unnamed: 0,d,a,b,c
1,0,1,2,3
3,4,5,6,7
2,8,9,10,11


In [92]:
df10.sort_index(axis='index', level=None, ascending=True, by=None)

Unnamed: 0,d,a,b,c
1,0,1,2,3
2,8,9,10,11
3,4,5,6,7


In [93]:
df10.sort_index(axis='index', level=None, ascending=True, by=['d'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,d,a,b,c
1,0,1,2,3
3,4,5,6,7
2,8,9,10,11


In [94]:
pd.DataFrame.sort_values?

In [95]:
print(df9)
df9.sort_values(by=['b'])

          b         d         e         a         c
1  0.654621 -0.527915  0.000468 -1.312729 -0.524460
3 -1.435226  0.148216 -0.730586 -1.008752  0.611672
2 -0.064169  0.819534  0.315891 -0.343058 -2.401920
4  0.427173 -0.187428 -0.355605 -1.803791  0.846477


Unnamed: 0,b,d,e,a,c
3,-1.435226,0.148216,-0.730586,-1.008752,0.611672
2,-0.064169,0.819534,0.315891,-0.343058,-2.40192
4,0.427173,-0.187428,-0.355605,-1.803791,0.846477
1,0.654621,-0.527915,0.000468,-1.312729,-0.52446


In [96]:
print(df9)
df9.sort_values(by=['d'])

          b         d         e         a         c
1  0.654621 -0.527915  0.000468 -1.312729 -0.524460
3 -1.435226  0.148216 -0.730586 -1.008752  0.611672
2 -0.064169  0.819534  0.315891 -0.343058 -2.401920
4  0.427173 -0.187428 -0.355605 -1.803791  0.846477


Unnamed: 0,b,d,e,a,c
1,0.654621,-0.527915,0.000468,-1.312729,-0.52446
4,0.427173,-0.187428,-0.355605,-1.803791,0.846477
3,-1.435226,0.148216,-0.730586,-1.008752,0.611672
2,-0.064169,0.819534,0.315891,-0.343058,-2.40192


In [97]:
df8

Unnamed: 0,b,d,e
One,1.018806,0.831351,-1.660667
Two,0.692427,-0.960694,-0.374481
Three,-0.081594,-2.17142,0.104694
Four,-0.538068,1.739137,-1.769085


In [98]:
print(df8)
df8.rank()

              b         d         e
One    1.018806  0.831351 -1.660667
Two    0.692427 -0.960694 -0.374481
Three -0.081594 -2.171420  0.104694
Four  -0.538068  1.739137 -1.769085


Unnamed: 0,b,d,e
One,4.0,3.0,2.0
Two,3.0,2.0,3.0
Three,2.0,1.0,4.0
Four,1.0,4.0,1.0


In [99]:
print(df8)
df8.rank(axis='columns')

              b         d         e
One    1.018806  0.831351 -1.660667
Two    0.692427 -0.960694 -0.374481
Three -0.081594 -2.171420  0.104694
Four  -0.538068  1.739137 -1.769085


Unnamed: 0,b,d,e
One,3.0,2.0,1.0
Two,3.0,1.0,2.0
Three,2.0,1.0,3.0
Four,2.0,3.0,1.0


In [100]:
s = df8.loc[:, 'b']
s

One      1.018806
Two      0.692427
Three   -0.081594
Four    -0.538068
Name: b, dtype: float64

In [101]:
s.rank()

One      4.0
Two      3.0
Three    2.0
Four     1.0
Name: b, dtype: float64

## Axis Indexes with Duplicate Labels

In [102]:
import pandas as pd
import numpy as np

In [103]:
di_s = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
di_s

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [104]:
pd.Index.is_unique?

In [105]:
di_s.index.is_unique

False

In [106]:
di_s['a']

a    0
a    1
dtype: int64

In [107]:
di_s['b']

b    2
b    3
dtype: int64

In [108]:
df11 = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df11

Unnamed: 0,0,1,2
a,0.563142,-0.6402,-0.148298
a,0.172281,-0.164422,-1.059759
b,-0.741279,-0.039291,-0.397615
b,0.807076,0.01397,-3.113098


In [109]:
df11.index.is_unique

False

In [110]:
df11.loc['a']

Unnamed: 0,0,1,2
a,0.563142,-0.6402,-0.148298
a,0.172281,-0.164422,-1.059759


## How to Summarise and compute Descriptive Statistics?

In [111]:
import pandas as pd
import numpy as np

In [112]:
df12 = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], 
                         index=['a', 'b', 'c', 'd'], 
                         columns=['one', 'two'])
df12

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [113]:
df12.sum()

one    9.25
two   -5.80
dtype: float64

In [114]:
df12.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [115]:
df12.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [116]:
df12.idxmax()

one    b
two    d
dtype: object

In [117]:
df12.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [None]:
pd.Series.describe?

In [118]:
df12.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [119]:
ser = pd.Series(['a', 'a', 'b', 'c'] * 4)
print(ser)
print(ser.describe())

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object
count     16
unique     3
top        a
freq       8
dtype: object


## Unique Values, Value Counts, and Membership

In [122]:
import pandas as pd

In [123]:
ser_u = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
ser_u

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [124]:
uniques = ser_u.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [125]:
ser_u.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [126]:
membership = ser_u.isin(['d', 'c'])
print(ser_u)
membership

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object


0     True
1    False
2     True
3    False
4    False
5    False
6    False
7     True
8     True
dtype: bool

In [127]:
ser_u[membership]

0    c
2    d
7    c
8    c
dtype: object

In [128]:
non_dist = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
non_dist

0    c
1    a
2    b
3    b
4    c
5    a
dtype: object

In [129]:
dist = pd.Series(['c', 'b', 'a'])
dist

0    c
1    b
2    a
dtype: object

In [130]:
pd.Index(dist).get_indexer(non_dist)

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [131]:
df13 = pd.DataFrame({'Qu1': [1, 3, 4, 3], 
                     'Qu2': [2, 3, 1, 2], 
                     'Qu3': [1, 5, 2, 4]})
df13

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4


In [132]:
histogram = df13.apply(pd.value_counts)
histogram

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,1.0,
4,1.0,,1.0
5,,,1.0


In [133]:
histogram = df13.apply(pd.value_counts).fillna(0)
histogram

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,1.0,0.0
4,1.0,0.0,1.0
5,0.0,0.0,1.0
