In [1]:
import pandas as pd
import numpy as np


In [2]:
# Build a five-element Series of uniform random floats; the letter labels are
# deliberately not in alphabetical order.
some_series = pd.Series(data=np.random.rand(5), index=list('acdeb'))
some_series

a    0.343420
c    0.777896
d    0.258201
e    0.668668
b    0.269924
dtype: float64

In [3]:
# boolean mask: keep only the entries whose value is greater than .5
some_series[some_series>.5]

c    0.777896
e    0.668668
dtype: float64

In [4]:
# we can update values in place through a boolean mask: every value above .8 is set to 1
# (in the run shown below no value exceeds .8, so the series is displayed unchanged)
some_series[some_series>.8] = 1
some_series

a    0.343420
c    0.777896
d    0.258201
e    0.668668
b    0.269924
dtype: float64

In [5]:
# select several entries at once by passing a list of index labels
some_series[['a','d']]

a    0.343420
d    0.258201
dtype: float64

In [6]:
# select by integer position (first and fourth entries); .iloc makes the positional
# intent explicit, because plain [] with integers on a label-indexed Series is deprecated
some_series.iloc[[0,3]]

a    0.343420
e    0.668668
dtype: float64

In [7]:
# we can sort the series by its index labels; the sorted result is assigned back
# to some_series, so later cells see the labels in alphabetical order
some_series = some_series.sort_index()
some_series

a    0.343420
b    0.269924
c    0.777896
d    0.258201
e    0.668668
dtype: float64

In [8]:
# we can also sort the series by its values
# (the result is only displayed, not assigned, so some_series keeps its index order)
some_series.sort_values()

d    0.258201
b    0.269924
a    0.343420
e    0.668668
c    0.777896
dtype: float64

In [9]:
# rank() gives each value its position in ascending value order (1.0 = smallest),
# i.e. the order in which sort_values() would list the entries
some_series.rank()

a    3.0
b    2.0
c    5.0
d    1.0
e    4.0
dtype: float64

In [10]:
# ascending=True is the default, so this gives the same result as the plain sort_values() call;
# pass ascending=False to sort from the largest value to the smallest
some_series.sort_values(ascending=True)

d    0.258201
b    0.269924
a    0.343420
e    0.668668
c    0.777896
dtype: float64

In [11]:
# drop removes the entry with the given label and returns a new series;
# some_series itself is left untouched
nss = some_series.drop('c')
nss

a    0.343420
b    0.269924
d    0.258201
e    0.668668
dtype: float64

In [12]:
# A second random Series, this time with six labels 'a' through 'f'.
some_series_1 = pd.Series(data=np.random.rand(6), index=list('abcdef'))
some_series_1

a    0.227276
b    0.621150
c    0.766684
d    0.989394
e    0.265236
f    0.065219
dtype: float64

In [13]:
# arithmetic with a scalar is applied element-wise: here every value is doubled
some_series_2 = some_series_1*2
some_series_2

a    0.454552
b    1.242300
c    1.533369
d    1.978789
e    0.530472
f    0.130438
dtype: float64

In [14]:
# subtracting a scalar is element-wise too; the shifted result replaces some_series_2
some_series_2 = some_series_2 - 1
some_series_2

a   -0.545448
b    0.242300
c    0.533369
d    0.978789
e   -0.469528
f   -0.869562
dtype: float64

In [15]:
# two series can be added; values are combined label by label
some_series_2 + some_series_1

a   -0.318172
b    0.863450
c    1.300053
d    1.968183
e   -0.204292
f   -0.804344
dtype: float64

In [16]:
# addition aligns on index labels: a label present in only one of the two series
# ('f' here) has nothing to be added to, so its result is NaN
some_series + some_series_2

a   -0.202028
b    0.512223
c    1.311265
d    1.236990
e    0.199140
f         NaN
dtype: float64

In [17]:
# describe() returns basic summary statistics of the series
# (count, mean, std, min, quartiles and max)
some_series.describe()

count    5.000000
mean     0.463622
std      0.242372
min      0.258201
25%      0.269924
50%      0.343420
75%      0.668668
max      0.777896
dtype: float64

In [18]:
# pct_change gives the fractional change of each value relative to the previous one;
# the first entry has no predecessor, so it is NaN
new_s = some_series.pct_change()
new_s

a         NaN
b   -0.214012
c    1.881911
d   -0.668078
e    1.589720
dtype: float64

In [19]:
# isnull flags the NaN entries with True
new_s.isnull()

a     True
b    False
c    False
d    False
e    False
dtype: bool

In [20]:
# we can drop the NaN entries directly with dropna; the cleaned series replaces new_s
new_s = new_s.dropna()

In [21]:
# smallest of the remaining percentage changes
new_s.min()

-0.6680777015690009

In [22]:
# largest of the remaining percentage changes
new_s.max()

1.881910664910905

In [23]:
# standard deviation of the remaining percentage changes
new_s.std()

1.275995917196022

In [24]:
# mean of the remaining percentage changes
new_s.mean()

0.6473851944277041

In [64]:
# Before turning to DataFrames, a look at hierarchical (multi-level) indexes on a Series.
# The index is the product of an outer level (1, 2, 3) and an inner level ('a', 'b', 'c'),
# i.e. three groups that each hold the entries a, b and c.
another_series = pd.Series(
    data=np.random.rand(9),
    index=pd.MultiIndex.from_product([[1, 2, 3], ['a', 'b', 'c']]),
)
another_series

1  a    0.068926
   b    0.030109
   c    0.850493
2  a    0.823103
   b    0.220916
   c    0.450701
3  a    0.967128
   b    0.087010
   c    0.308175
dtype: float64

In [65]:
# displaying the index shows that it is a MultiIndex and lists its levels
another_series.index

MultiIndex(levels=[[1, 2, 3], ['a', 'b', 'c']],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]])

In [66]:
# selecting on the outer level returns the whole first group (outer label 1)
another_series[1]

a    0.068926
b    0.030109
c    0.850493
dtype: float64

In [69]:
# we can also select on the second (inner) level across all groups:
# this gives every entry whose inner label is 'a'
another_series[:, 'a']

1    0.068926
2    0.823103
3    0.967128
dtype: float64

In [71]:
# unstack turns the inner index level into columns, which converts the
# multi-level series into a DataFrame
d = another_series.unstack()
type(d)

pandas.core.frame.DataFrame

In [72]:
d

Unnamed: 0,a,b,c
1,0.068926,0.030109,0.850493
2,0.823103,0.220916,0.450701
3,0.967128,0.08701,0.308175


Now let's look at the DataFrame features.

In [25]:
# A 4x5 frame of uniform random floats: rows labelled 'a'-'d', columns 'c1'-'c5'.
some_df = pd.DataFrame(
    data=np.random.rand(4, 5),
    index=list('abcd'),
    columns=['c%d' % k for k in range(1, 6)],
)
some_df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.023124,0.361262,0.979241,0.834571,0.595699
b,0.869244,0.60115,0.002771,0.880024,0.976524
c,0.532271,0.669598,0.088552,0.291433,0.63543
d,0.770113,0.81807,0.726456,0.864664,0.719879


In [26]:
# by default drop removes rows: here the row labelled 'a'
# (a new frame is returned; some_df is not modified)
some_df.drop('a')

Unnamed: 0,c1,c2,c3,c4,c5
b,0.869244,0.60115,0.002771,0.880024,0.976524
c,0.532271,0.669598,0.088552,0.291433,0.63543
d,0.770113,0.81807,0.726456,0.864664,0.719879


In [27]:
# axis=0 refers to rows, which is the default behaviour
some_df.drop('a', axis=0)

Unnamed: 0,c1,c2,c3,c4,c5
b,0.869244,0.60115,0.002771,0.880024,0.976524
c,0.532271,0.669598,0.088552,0.291433,0.63543
d,0.770113,0.81807,0.726456,0.864664,0.719879


In [28]:
# axis=1 refers to columns: here the column 'c3' is removed
some_df.drop('c3', axis=1)

Unnamed: 0,c1,c2,c4,c5
a,0.023124,0.361262,0.834571,0.595699
b,0.869244,0.60115,0.880024,0.976524
c,0.532271,0.669598,0.291433,0.63543
d,0.770113,0.81807,0.864664,0.719879


In [29]:
# we can select a single column by its label; the result is a Series named after the column
some_df['c1']

a    0.023124
b    0.869244
c    0.532271
d    0.770113
Name: c1, dtype: float64

In [30]:
# a boolean condition on one column selects the rows where it holds (here c3 > .5)
some_df[some_df['c3']>.5]

Unnamed: 0,c1,c2,c3,c4,c5
a,0.023124,0.361262,0.979241,0.834571,0.595699
d,0.770113,0.81807,0.726456,0.864664,0.719879


In [31]:
# a comparison can also be applied to the entire frame; the result is a frame of booleans
some_df > .2


Unnamed: 0,c1,c2,c3,c4,c5
a,False,True,True,True,True
b,True,True,False,True,True
c,True,True,False,True,True
d,True,True,True,True,True


In [32]:
# scalar arithmetic on a DataFrame is applied to every element
some_df_1 = some_df + 1
some_df_1

Unnamed: 0,c1,c2,c3,c4,c5
a,1.023124,1.361262,1.979241,1.834571,1.595699
b,1.869244,1.60115,1.002771,1.880024,1.976524
c,1.532271,1.669598,1.088552,1.291433,1.63543
d,1.770113,1.81807,1.726456,1.864664,1.719879


In [33]:
# two DataFrames can be added; values are matched by row and column label
some_df_2 = some_df_1 + some_df
some_df_2

Unnamed: 0,c1,c2,c3,c4,c5
a,1.046247,1.722523,2.958482,2.669142,2.191398
b,2.738489,2.2023,1.005542,2.760049,2.953047
c,2.064542,2.339196,1.177103,1.582866,2.270861
d,2.540227,2.63614,2.452913,2.729328,2.439758


In [34]:
# if the row/column labels of the two frames do not match, the unmatched cells become NaN:
# the filtered frame only has rows 'a' and 'd', so rows 'b' and 'c' are all NaN
some_df_2 + some_df[some_df['c3']>.5]

Unnamed: 0,c1,c2,c3,c4,c5
a,1.069371,2.083785,3.937723,3.503713,2.787097
b,,,,,
c,,,,,
d,3.31034,3.45421,3.179369,3.593993,3.159637


In [35]:
# add() works like + but accepts fill_value, which stands in for a cell that is missing
# from one of the two frames instead of producing NaN; it must be a number, so use 0
some_df_2.add(some_df[some_df['c3']>.5], fill_value=0)

Unnamed: 0,c1,c2,c3,c4,c5
a,1.069371,2.083785,3.937723,3.503713,2.787097
b,2.738489,2.2023,1.005542,2.760049,2.953047
c,2.064542,2.339196,1.177103,1.582866,2.270861
d,3.31034,3.45421,3.179369,3.593993,3.159637


In [36]:
# loc selects by label: here the row labelled 'a'
x = some_df_2.loc['a']

In [37]:
# a single row selected this way is a Series
type(x)

pandas.core.series.Series

In [38]:
x

c1    1.046247
c2    1.722523
c3    2.958482
c4    2.669142
c5    2.191398
Name: a, dtype: float64

In [39]:
# arithmetic such as addition or subtraction also works between a DataFrame and a Series:
# the Series labels are matched to the columns and the Series is subtracted from every row
some_df_2 - x

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,0.0,0.0,0.0,0.0
b,1.692242,0.479777,-1.95294,0.090906,0.76165
c,1.018295,0.616672,-1.781379,-1.086276,0.079463
d,1.493979,0.913617,-0.505569,0.060186,0.24836


In [40]:
# A 6x5 frame of uniform random floats: rows labelled 'a'-'f', columns 'c1'-'c5'.
new_df = pd.DataFrame(
    data=np.random.rand(6, 5),
    index=list('abcdef'),
    columns=['c%d' % k for k in range(1, 6)],
)
new_df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.604141,0.895063,0.967133,0.740172,0.332916
b,0.790845,0.919555,0.238412,0.880845,0.734552
c,0.685157,0.997906,0.265529,0.453763,0.509011
d,0.824808,0.294703,0.145859,0.293337,0.395556
e,0.684641,0.758851,0.278872,0.696433,0.720534
f,0.906215,0.103272,0.532829,0.962328,0.097048


In [41]:
# pandas supports a sum operation
# by default the values of each column are added up, giving one total per column
new_df.sum()

c1    4.495808
c2    3.969349
c3    2.428634
c4    4.026878
c5    2.789616
dtype: float64

In [42]:
# with axis=1 the values are summed across the columns, giving one total per row
new_df.sum(axis=1)

a    3.539425
b    3.564209
c    2.911365
d    1.954263
e    3.139331
f    2.601692
dtype: float64

In [43]:
# min() accepts an axis as well (axis=1 would give the minimum of each row);
# here we use the default, which gives the minimum of each column
new_df.min()

c1    0.604141
c2    0.103272
c3    0.145859
c4    0.293337
c5    0.097048
dtype: float64

In [44]:
# max() works the same way; with axis=1 we get the maximum of each row
new_df.max(axis=1)

a    0.967133
b    0.919555
c    0.997906
d    0.824808
e    0.758851
f    0.962328
dtype: float64

In [45]:
# idxmin returns, for each column, the row label at which the minimum occurs
new_df.idxmin()

c1    a
c2    f
c3    d
c4    d
c5    f
dtype: object

In [46]:
# idxmax with axis=1 returns, for each row, the column label at which the maximum occurs
new_df.idxmax(axis=1)

a    c3
b    c2
c    c2
d    c1
e    c2
f    c4
dtype: object

In [47]:
new_df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.604141,0.895063,0.967133,0.740172,0.332916
b,0.790845,0.919555,0.238412,0.880845,0.734552
c,0.685157,0.997906,0.265529,0.453763,0.509011
d,0.824808,0.294703,0.145859,0.293337,0.395556
e,0.684641,0.758851,0.278872,0.696433,0.720534
f,0.906215,0.103272,0.532829,0.962328,0.097048


In [48]:
# cumsum gives the cumulative (running) sum down each column
new_df.cumsum()

Unnamed: 0,c1,c2,c3,c4,c5
a,0.604141,0.895063,0.967133,0.740172,0.332916
b,1.394986,1.814618,1.205545,1.621018,1.067468
c,2.080144,2.812524,1.471074,2.07478,1.576478
d,2.904951,3.107227,1.616933,2.368117,1.972035
e,3.589593,3.866077,1.895805,3.06455,2.692568
f,4.495808,3.969349,2.428634,4.026878,2.789616


In [49]:
# standard deviation of each column
new_df.std()

c1    0.110830
c2    0.371470
c3    0.304171
c4    0.254621
c5    0.243947
dtype: float64

In [50]:
# DataFrames also have a describe function, which shows basic statistics for each column
new_df.describe()

Unnamed: 0,c1,c2,c3,c4,c5
count,6.0,6.0,6.0,6.0,6.0
mean,0.749301,0.661558,0.404772,0.671146,0.464936
std,0.11083,0.37147,0.304171,0.254621,0.243947
min,0.604141,0.103272,0.145859,0.293337,0.097048
25%,0.68477,0.41074,0.245191,0.51443,0.348576
50%,0.738001,0.826957,0.272201,0.718303,0.452284
75%,0.816317,0.913432,0.46934,0.845677,0.667653
max,0.906215,0.997906,0.967133,0.962328,0.734552


In [51]:
# describe does not take an axis argument, so it always summarises column by column
new_df.describe()

Unnamed: 0,c1,c2,c3,c4,c5
count,6.0,6.0,6.0,6.0,6.0
mean,0.749301,0.661558,0.404772,0.671146,0.464936
std,0.11083,0.37147,0.304171,0.254621,0.243947
min,0.604141,0.103272,0.145859,0.293337,0.097048
25%,0.68477,0.41074,0.245191,0.51443,0.348576
50%,0.738001,0.826957,0.272201,0.718303,0.452284
75%,0.816317,0.913432,0.46934,0.845677,0.667653
max,0.906215,0.997906,0.967133,0.962328,0.734552


In [52]:
# mean of each column (default axis)
new_df.mean()

c1    0.749301
c2    0.661558
c3    0.404772
c4    0.671146
c5    0.464936
dtype: float64

In [53]:
# mean of each row
new_df.mean(axis=1)

a    0.707885
b    0.712842
c    0.582273
d    0.390853
e    0.627866
f    0.520338
dtype: float64

In [54]:
new_df


Unnamed: 0,c1,c2,c3,c4,c5
a,0.604141,0.895063,0.967133,0.740172,0.332916
b,0.790845,0.919555,0.238412,0.880845,0.734552
c,0.685157,0.997906,0.265529,0.453763,0.509011
d,0.824808,0.294703,0.145859,0.293337,0.395556
e,0.684641,0.758851,0.278872,0.696433,0.720534
f,0.906215,0.103272,0.532829,0.962328,0.097048


In [55]:
# pct_change works on DataFrames too and accepts an axis:
# with axis=1 the change is computed along each row, from one column to the next;
# the first column has nothing before it to compare against, so it becomes NaN
cnew_df = new_df.pct_change(axis=1)
cnew_df

Unnamed: 0,c1,c2,c3,c4,c5
a,,0.481546,0.08052,-0.234674,-0.550218
b,,0.16275,-0.740732,2.694642,-0.166083
c,,0.456462,-0.733914,0.708901,0.121755
d,,-0.642701,-0.505064,1.011095,0.348473
e,,0.108392,-0.632507,1.497317,0.034606
f,,-0.88604,4.159477,0.806074,-0.899153


In [56]:
# dropna with axis=1 removes every column that contains a NaN
cnew_df.dropna(axis=1)

Unnamed: 0,c2,c3,c4,c5
a,0.481546,0.08052,-0.234674,-0.550218
b,0.16275,-0.740732,2.694642,-0.166083
c,0.456462,-0.733914,0.708901,0.121755
d,-0.642701,-0.505064,1.011095,0.348473
e,0.108392,-0.632507,1.497317,0.034606
f,-0.88604,4.159477,0.806074,-0.899153


In [57]:
# with the default axis the change is computed down each column, from one row to the next,
# so the first row is all NaN
s_df = new_df.pct_change()
s_df

Unnamed: 0,c1,c2,c3,c4,c5
a,,,,,
b,0.309041,0.027364,-0.753486,0.190055,1.206419
c,-0.133639,0.085205,0.113742,-0.484856,-0.307046
d,0.203822,-0.704679,-0.450684,-0.353546,-0.222892
e,-0.169938,1.57497,0.911929,1.374177,0.82157
f,0.323635,-0.86391,0.910654,0.381796,-0.865311


In [58]:

# dropna with the default axis removes the rows that contain NaN;
# the result is a new frame and s_df keeps its NaN row
ns_df = s_df.dropna()
ns_df

Unnamed: 0,c1,c2,c3,c4,c5
b,0.309041,0.027364,-0.753486,0.190055,1.206419
c,-0.133639,0.085205,0.113742,-0.484856,-0.307046
d,0.203822,-0.704679,-0.450684,-0.353546,-0.222892
e,-0.169938,1.57497,0.911929,1.374177,0.82157
f,0.323635,-0.86391,0.910654,0.381796,-0.865311


In [59]:
# instead of dropping them, we can replace the NaN values with a chosen value
s_df.fillna(1)

Unnamed: 0,c1,c2,c3,c4,c5
a,1.0,1.0,1.0,1.0,1.0
b,0.309041,0.027364,-0.753486,0.190055,1.206419
c,-0.133639,0.085205,0.113742,-0.484856,-0.307046
d,0.203822,-0.704679,-0.450684,-0.353546,-0.222892
e,-0.169938,1.57497,0.911929,1.374177,0.82157
f,0.323635,-0.86391,0.910654,0.381796,-0.865311


In [60]:
# s_df itself is not changed by the fillna call above, because the result was not assigned
s_df

Unnamed: 0,c1,c2,c3,c4,c5
a,,,,,
b,0.309041,0.027364,-0.753486,0.190055,1.206419
c,-0.133639,0.085205,0.113742,-0.484856,-0.307046
d,0.203822,-0.704679,-0.450684,-0.353546,-0.222892
e,-0.169938,1.57497,0.911929,1.374177,0.82157
f,0.323635,-0.86391,0.910654,0.381796,-0.865311


In [61]:
# with inplace=True the operation modifies s_df directly and no new DataFrame is returned
# (inplace is supported by many pandas operations)
s_df.fillna(0,inplace=True)
s_df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,0.0,0.0,0.0,0.0
b,0.309041,0.027364,-0.753486,0.190055,1.206419
c,-0.133639,0.085205,0.113742,-0.484856,-0.307046
d,0.203822,-0.704679,-0.450684,-0.353546,-0.222892
e,-0.169938,1.57497,0.911929,1.374177,0.82157
f,0.323635,-0.86391,0.910654,0.381796,-0.865311


In [127]:
# Multi-level indexing on a DataFrame: both the rows and the columns carry two levels.
# Rows are ('a', 'b') x (1, 2, 3); columns are ('car', 'bus', 'metro') x ('fast', 'slow').
another_df = pd.DataFrame(
    data=10 * np.random.rand(6, 6),
    index=pd.MultiIndex.from_product([['a', 'b'], [1, 2, 3]]),
    columns=pd.MultiIndex.from_product([['car', 'bus', 'metro'], ['fast', 'slow']]),
)
another_df

Unnamed: 0_level_0,Unnamed: 1_level_0,car,car,bus,bus,metro,metro
Unnamed: 0_level_1,Unnamed: 1_level_1,fast,slow,fast,slow,fast,slow
a,1,6.018007,5.237671,9.919134,8.839063,2.114987,6.843967
a,2,5.772469,6.989484,4.445156,1.845333,9.816755,2.743572
a,3,3.410426,0.041655,1.347723,6.680796,3.786686,5.345911
b,1,6.062643,7.995729,2.404485,4.833443,8.451274,7.57336
b,2,4.806986,4.562165,1.469422,8.499959,4.226213,0.646013
b,3,7.366148,6.021188,1.878591,0.620946,2.591152,9.066613


In [128]:
# displaying the index shows the multi-level indexing information
another_df.index

MultiIndex(levels=[['a', 'b'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [129]:
# give the row and column levels names, so they can be referred to by name later
another_df.index.names = ['type', 'class']
another_df.columns.names = ['vehicle', 'speed']
another_df

Unnamed: 0_level_0,vehicle,car,car,bus,bus,metro,metro
Unnamed: 0_level_1,speed,fast,slow,fast,slow,fast,slow
type,class,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,6.018007,5.237671,9.919134,8.839063,2.114987,6.843967
a,2,5.772469,6.989484,4.445156,1.845333,9.816755,2.743572
a,3,3.410426,0.041655,1.347723,6.680796,3.786686,5.345911
b,1,6.062643,7.995729,2.404485,4.833443,8.451274,7.57336
b,2,4.806986,4.562165,1.469422,8.499959,4.226213,0.646013
b,3,7.366148,6.021188,1.878591,0.620946,2.591152,9.066613


In [134]:
# swaplevel exchanges the two column levels ('vehicle' and 'speed')
# NOTE(review): the result is assigned back to another_df, so running this cell a second
# time swaps the levels back; the output shown below still has 'vehicle' as the outer
# level - confirm how many times the cell was executed
another_df = another_df.swaplevel('vehicle','speed',axis=1)
another_df

Unnamed: 0_level_0,vehicle,car,car,bus,bus,metro,metro
Unnamed: 0_level_1,speed,fast,slow,fast,slow,fast,slow
type,class,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,6.018007,5.237671,9.919134,8.839063,2.114987,6.843967
a,2,5.772469,6.989484,4.445156,1.845333,9.816755,2.743572
a,3,3.410426,0.041655,1.347723,6.680796,3.786686,5.345911
b,1,6.062643,7.995729,2.404485,4.833443,8.451274,7.57336
b,2,4.806986,4.562165,1.469422,8.499959,4.226213,0.646013
b,3,7.366148,6.021188,1.878591,0.620946,2.591152,9.066613


In [135]:
# sort the columns by their labels; axis is passed by keyword because sort_index
# no longer accepts it positionally in current pandas
another_df.sort_index(axis=1)

Unnamed: 0_level_0,vehicle,bus,bus,car,car,metro,metro
Unnamed: 0_level_1,speed,fast,slow,fast,slow,fast,slow
type,class,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,9.919134,8.839063,6.018007,5.237671,2.114987,6.843967
a,2,4.445156,1.845333,5.772469,6.989484,9.816755,2.743572
a,3,1.347723,6.680796,3.410426,0.041655,3.786686,5.345911
b,1,2.404485,4.833443,6.062643,7.995729,8.451274,7.57336
b,2,1.469422,8.499959,4.806986,4.562165,4.226213,0.646013
b,3,1.878591,0.620946,7.366148,6.021188,2.591152,9.066613


In [136]:
# sum down the rows: one total for each (vehicle, speed) column
another_df.sum()

vehicle  speed
car      fast     33.436679
         slow     30.847892
bus      fast     21.464510
         slow     31.319540
metro    fast     30.987067
         slow     32.219436
dtype: float64

In [139]:
# total per 'speed' for every row, adding up the vehicles: the level= argument of sum()
# is no longer available in current pandas, so group the columns by the 'speed' level
# instead (transpose, group the rows by level, sum, transpose back)
another_df.T.groupby(level='speed').sum().T

Unnamed: 0_level_0,speed,fast,slow
type,class,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,18.052128,20.920701
a,2,20.03438,11.578389
a,3,8.544835,12.068362
b,1,16.918402,20.402531
b,2,10.502622,13.708137
b,3,11.83589,15.708747
