Source: [pandas documentation — Essential basic functionality](https://pandas.pydata.org/pandas-docs/stable/getting_started/basics.html)

# Essential basic functionality

In [4]:
import pandas as pd
import numpy as np

In [2]:
# Eight consecutive calendar days beginning 1 January 2019 (daily frequency).
index = pd.date_range(start='1/1/2019', periods=8, freq='D')

In [11]:
index

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08'],
              dtype='datetime64[ns]', freq='D')

In [22]:
# Five standard-normal draws labelled 'a' through 'e'.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))
s

a    1.331132
b    0.718406
c    0.080788
d    0.462967
e    0.926048
dtype: float64

In [24]:
# 8 x 3 frame of standard-normal draws: one row per date in `index`,
# columns 'A', 'B' and 'C'.
df = pd.DataFrame(
    data=np.random.randn(8, 3),
    index=index,
    columns=list('ABC'),
)

In [25]:
df

Unnamed: 0,A,B,C
2019-01-01,0.146498,-0.800365,-0.596474
2019-01-02,-1.291924,-0.809541,-0.827662
2019-01-03,0.674604,1.493965,0.790137
2019-01-04,0.602992,0.762855,0.273925
2019-01-05,2.033115,-0.487244,0.542919
2019-01-06,0.265107,-1.724537,1.076319
2019-01-07,0.546746,-1.711256,-0.081782
2019-01-08,-0.285164,-0.364753,-0.475821


In [30]:
df.loc["2019"]

Unnamed: 0,A,B,C
2019-01-01,0.146498,-0.800365,-0.596474
2019-01-02,-1.291924,-0.809541,-0.827662
2019-01-03,0.674604,1.493965,0.790137
2019-01-04,0.602992,0.762855,0.273925
2019-01-05,2.033115,-0.487244,0.542919
2019-01-06,0.265107,-1.724537,1.076319
2019-01-07,0.546746,-1.711256,-0.081782
2019-01-08,-0.285164,-0.364753,-0.475821


## Attributes and underlying data

#### pandas objects have a number of attributes enabling you to access the metadata

In [31]:
df.shape

(8, 3)

In [32]:
# First two rows of the frame.
df.head(2)

Unnamed: 0,A,B,C
2019-01-01,0.146498,-0.800365,-0.596474
2019-01-02,-1.291924,-0.809541,-0.827662


In [33]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [34]:
# Lower-case every column label using the vectorised string accessor.
df.columns = df.columns.str.lower()

In [35]:
df

Unnamed: 0,a,b,c
2019-01-01,0.146498,-0.800365,-0.596474
2019-01-02,-1.291924,-0.809541,-0.827662
2019-01-03,0.674604,1.493965,0.790137
2019-01-04,0.602992,0.762855,0.273925
2019-01-05,2.033115,-0.487244,0.542919
2019-01-06,0.265107,-1.724537,1.076319
2019-01-07,0.546746,-1.711256,-0.081782
2019-01-08,-0.285164,-0.364753,-0.475821


In [37]:
s.array

<PandasArray>
[ 1.3311323195096614,  0.7184056970505551,  0.0807876957235343,
 0.46296650639656134,  0.9260479463854028]
Length: 5, dtype: float64

In [39]:
df.index.array

<DatetimeArray>
['2019-01-01 00:00:00', '2019-01-02 00:00:00', '2019-01-03 00:00:00',
 '2019-01-04 00:00:00', '2019-01-05 00:00:00', '2019-01-06 00:00:00',
 '2019-01-07 00:00:00', '2019-01-08 00:00:00']
Length: 8, dtype: datetime64[ns]

**If you know you need a NumPy array, use to_numpy() or numpy.asarray().**

In [40]:
s.to_numpy()

array([1.33113232, 0.7184057 , 0.0807877 , 0.46296651, 0.92604795])

In [41]:
df.to_numpy()

array([[ 0.14649807, -0.80036452, -0.5964737 ],
       [-1.29192364, -0.8095407 , -0.82766185],
       [ 0.6746038 ,  1.49396464,  0.79013721],
       [ 0.60299206,  0.76285481,  0.27392523],
       [ 2.03311525, -0.48724413,  0.54291933],
       [ 0.26510704, -1.72453655,  1.07631929],
       [ 0.54674591, -1.71125559, -0.08178213],
       [-0.28516375, -0.36475327, -0.47582054]])

In [71]:
# Build a frame from two Series whose indexes differ: 'one' has no label 4,
# so the frame is indexed by the union of both indexes and that cell is NaN.
# NOTE(review): the saved output under this cell is a matplotlib Axes repr,
# which this code cannot produce -- the output looks stale; re-run to refresh.
pd.DataFrame({
    'one':pd.Series(np.random.randn(3),index=[1,2,3]),
    'two':pd.Series(np.random.randn(4),index=[1,2,3,4])
             })

<matplotlib.axes._subplots.AxesSubplot at 0x1181d7eb8>

###### NumPy random sampling reference: https://docs.scipy.org/doc/numpy-1.15.0/reference/routines.random.html

In [49]:
# 5 x 2 array of samples drawn uniformly from [0, 1).
np.random.rand(5,2)

array([[0.91939029, 0.28784204],
       [0.20631817, 0.52174077],
       [0.39217836, 0.57359845],
       [0.41608609, 0.55160643],
       [0.8499213 , 0.34572229]])

In [63]:
# 5 x 2 array of samples drawn from the standard normal distribution.
np.random.randn(5,2)

array([[ 0.64001246, -0.74672476],
       [ 0.72585591, -2.71129693],
       [ 1.93968801, -1.23861383],
       [ 0.82780421,  0.587266  ],
       [ 0.84064902,  0.02692479]])

In [66]:
# Return 5 random floats in the half-open interval [0.0, 1.0).
np.random.random(5)

array([0.65607109, 0.65419561, 0.42021657, 0.99253164, 0.43252533])

In [70]:
# Return a 5 x 2 array of random integers from low=2 (inclusive) to
# high=5 (exclusive), i.e. each value is 2, 3 or 4.
np.random.randint(2,5,size=(5,2))

array([[4, 3],
       [3, 2],
       [4, 4],
       [4, 2],
       [4, 4]])

In [81]:
#pd.read_clipboard(na_values=[None], parse_dates=['d'])

## Flexible binary operations

###  Matching / broadcasting behavior

In [83]:
# Three Series with partially overlapping labels. The frame is indexed by the
# union of the labels ('a'..'d'); any label a Series lacks shows up as NaN.
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=list('abc')),
    'two': pd.Series(np.random.randn(4), index=list('abcd')),
    'three': pd.Series(np.random.randn(3), index=list('bcd')),
})
df

Unnamed: 0,one,two,three
a,0.968812,-1.142344,
b,0.324877,0.288067,0.374192
c,-1.6909,1.121951,-1.298059
d,,-1.759968,-1.705607


In [84]:
# Second row of df (label 'b') as a Series indexed by the column names.
row = df.iloc[1]
row

one      0.324877
two      0.288067
three    0.374192
Name: b, dtype: float64

In [85]:
# Column 'two' as a Series indexed by the row labels.
column = df['two']
column

a   -1.142344
b    0.288067
c    1.121951
d   -1.759968
Name: two, dtype: float64

In [89]:
# Subtract `row` from every row of df: the labels of `row` are matched against
# df's columns, so the 'b' row (where `row` came from) becomes all zeros.
df.sub(row,axis='columns')

Unnamed: 0,one,two,three
a,0.643934,-1.43041,
b,0.0,0.0,0.0
c,-2.015778,0.833884,-1.67225
d,,-2.048035,-2.079799


In [90]:
# Subtract `column` from every column of df: the labels of `column` are matched
# against df's index, so column 'two' (where `column` came from) becomes all zeros.
df.sub(column, axis='index')

Unnamed: 0,one,two,three
a,2.111155,0.0,
b,0.036811,0.0,0.086125
c,-2.812851,0.0,-2.420009
d,,0.0,0.054361


DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08'],
              dtype='datetime64[ns]', freq='D')

In [96]:
s = pd.Series(np.arange(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [93]:
# The built-in divmod() works element-wise on a Series and returns a pair:
# the floor-division quotients and the remainders.
div, rem = divmod(s, 3)

In [94]:
div

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int64

In [95]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int64

### Missing data / operations with fill values

In [97]:
# Take an independent copy: a plain `df2 = df` would only create a second name
# for the same object, so any later change made through one name would
# silently show up under the other.
df2 = df.copy()

In [99]:
df+df2

Unnamed: 0,one,two,three
a,1.937623,-2.284687,
b,0.649755,0.576133,0.748383
c,-3.3818,2.243901,-2.596117
d,,-3.519937,-3.411215


In [108]:
# fill_value=0 stands in for a NaN only when the other operand has a value;
# cells missing in both frames stay NaN, hence the trailing fillna(0).
df.add(df2, fill_value=0).fillna(0)

Unnamed: 0,one,two,three
a,1.937623,-2.284687,0.0
b,0.649755,0.576133,0.748383
c,-3.3818,2.243901,-2.596117
d,0.0,-3.519937,-3.411215


### Boolean reductions

You can apply the reductions: empty, any(), all(), and bool() to provide a way to summarize a boolean result.

In [106]:
(df >0).all()

one      False
two      False
three    False
dtype: bool

In [107]:
(df>0).any()

one      True
two      True
three    True
dtype: bool

In [109]:
#You can test if a pandas object is empty, via the empty property.
df.empty

False

### Comparing if objects are equivalent

In [110]:
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [112]:
# Notice that the boolean DataFrame `df + df == df * 2` contains some False values.
# This is because NaN never compares equal, not even to itself:
np.nan == np.nan

False

In [113]:
(df+df).equals(df*2)

True

### Comparing array-like objects

In [114]:
pd.Series(['foo','bar','baz']) =='foo'

0     True
1    False
2    False
dtype: bool

In [115]:
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

## Descriptive statistics

In [117]:
df.mean(0)

one     -0.132404
two     -0.373074
three   -0.876491
dtype: float64

In [118]:
df.mean(1)

a   -0.086766
b    0.329045
c   -0.622336
d   -1.732788
dtype: float64

In [121]:
df.mean(axis=0,skipna=True)

one     -0.132404
two     -0.373074
three   -0.876491
dtype: float64

In [122]:
# Same layout as the earlier one/two/three frame, but with deterministic
# integer ranges so the statistics below are easy to verify by hand.
df = pd.DataFrame({
    'one': pd.Series(np.arange(3), index=list('abc')),
    'two': pd.Series(np.arange(4), index=list('abcd')),
    'three': pd.Series(np.arange(3), index=list('bcd')),
})
df

Unnamed: 0,one,two,three
a,0.0,0,
b,1.0,1,0.0
c,2.0,2,1.0
d,,3,2.0


In [124]:
df.std()

one      1.000000
two      1.290994
three    1.000000
dtype: float64

In [129]:
# Standardise each column (subtract its mean, divide by its standard
# deviation); every standardised column then has a standard deviation of 1.
df.sub(df.mean()).div(df.std()).std()

one      1.0
two      1.0
three    1.0
dtype: float64

Note that methods like cumsum() and cumprod() preserve the location of NaN values. This is somewhat different from expanding() and rolling(). For more details, please see this [note](https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#stats-moments-expanding-note).

In [130]:
df.cumsum()

Unnamed: 0,one,two,three
a,0.0,0.0,
b,1.0,1.0,0.0
c,3.0,3.0,1.0
d,,6.0,3.0


In [131]:
# Note that some NumPy functions, like mean, std, and sum, exclude NaN by default
# when given a Series: the NaN in column 'one' is skipped here, so the result is
# the mean of 0, 1 and 2.
np.mean(df['one'])

1.0

In [132]:
np.mean(df['one'].to_numpy())

nan

### Summarizing data: describe

In [133]:
# 1000 standard-normal draws with every other element (positions 0, 2, 4, ...)
# replaced by NaN, leaving 500 valid observations.
series = pd.Series(np.random.randn(1000))
# Use the explicit positional indexer, matching the `frame.iloc[::2]` cell,
# rather than relying on how bare [] interprets an integer slice.
series.iloc[::2] = np.nan

In [134]:
series.describe()

count    500.000000
mean       0.021694
std        0.980243
min       -3.467144
25%       -0.596366
50%        0.011667
75%        0.648470
max        3.122625
dtype: float64

In [135]:
# 1000 x 5 frame of standard-normal draws with columns 'a' through 'e'.
frame = pd.DataFrame(data=np.random.randn(1000, 5), columns=list('abcde'))

In [136]:
frame.iloc[::2] = np.nan

In [137]:
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.052498,-0.028001,-0.041705,0.005442,-0.002915
std,1.012348,1.010122,0.964477,1.013019,0.993868
min,-2.989665,-2.851253,-3.448649,-3.543366,-3.239093
25%,-0.636087,-0.695931,-0.681534,-0.721773,-0.642027
50%,0.067491,-0.080417,-0.072155,-0.012134,0.015024
75%,0.683131,0.698961,0.616913,0.738867,0.626166
max,3.352948,3.42559,2.479393,3.021109,3.047629


In [139]:
frame.describe(percentiles=[0.25,0.95])

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.052498,-0.028001,-0.041705,0.005442,-0.002915
std,1.012348,1.010122,0.964477,1.013019,0.993868
min,-2.989665,-2.851253,-3.448649,-3.543366,-3.239093
25%,-0.636087,-0.695931,-0.681534,-0.721773,-0.642027
50%,0.067491,-0.080417,-0.072155,-0.012134,0.015024
95%,1.739601,1.604897,1.510598,1.621696,1.65749
max,3.352948,3.42559,2.479393,3.021109,3.047629


In [143]:
frame.describe(include='all')

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.052498,-0.028001,-0.041705,0.005442,-0.002915
std,1.012348,1.010122,0.964477,1.013019,0.993868
min,-2.989665,-2.851253,-3.448649,-3.543366,-3.239093
25%,-0.636087,-0.695931,-0.681534,-0.721773,-0.642027
50%,0.067491,-0.080417,-0.072155,-0.012134,0.015024
75%,0.683131,0.698961,0.616913,0.738867,0.626166
max,3.352948,3.42559,2.479393,3.021109,3.047629


In [144]:
frame.describe(include=['number'])

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.052498,-0.028001,-0.041705,0.005442,-0.002915
std,1.012348,1.010122,0.964477,1.013019,0.993868
min,-2.989665,-2.851253,-3.448649,-3.543366,-3.239093
25%,-0.636087,-0.695931,-0.681534,-0.721773,-0.642027
50%,0.067491,-0.080417,-0.072155,-0.012134,0.015024
75%,0.683131,0.698961,0.616913,0.738867,0.626166
max,3.352948,3.42559,2.479393,3.021109,3.047629


### Index of min/max values

In [145]:
# 5 x 3 frame of standard-normal draws used for the idxmax examples.
df1 = pd.DataFrame(data=np.random.randn(5, 3), columns=list('ABC'))
df1

Unnamed: 0,A,B,C
0,0.350509,-1.298687,-1.53947
1,-0.604503,1.967911,-0.465778
2,0.125669,1.98673,1.020245
3,-0.028709,-1.555085,-0.236776
4,-0.841389,0.519869,-1.317559


In [147]:
df1.idxmax(axis=0)

A    0
B    2
C    2
dtype: int64

In [148]:
df1.idxmax(axis=1)

0    A
1    B
2    B
3    A
4    B
dtype: object

### Value counts (histogramming) / mode

In [149]:
# 50 random integers from 0 (inclusive) to 9 (exclusive), i.e. values 0 through 8.
data = np.random.randint(0,9,size=50)

In [150]:
# NOTE(review): this rebinds `df1`, which held a DataFrame in the
# "Index of min/max values" section, to a Series of the random integers.
df1 = pd.Series(data)

In [151]:
df1.value_counts()

2    10
3     9
6     8
5     7
7     5
8     4
4     4
1     2
0     1
dtype: int64

In [152]:
df1.mode()

0    2
dtype: int64

### Discretization and quantiling

Continuous values can be discretized using the cut() (bins based on values) and qcut() (bins based on sample quantiles) functions:

In [155]:
# Split the values 0..4 into 4 equal-width bins covering their range.
arr = np.arange(5)
factor = pd.cut(x=arr, bins=4)
factor

[(-0.004, 1.0], (-0.004, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
Categories (4, interval[float64]): [(-0.004, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]]

In [160]:
# qcut() bins by sample quantiles rather than by value.
# Here the integers 0..9 are split at the 0/25/50/75/100% quantiles (quartiles);
# with only 10 values the four bins hold 3, 2, 2 and 3 elements respectively.
arr = np.arange(10)
factor = pd.qcut(arr,[0,0.25,0.5,0.75,1])
factor

[(-0.001, 2.25], (-0.001, 2.25], (-0.001, 2.25], (2.25, 4.5], (2.25, 4.5], (4.5, 6.75], (4.5, 6.75], (6.75, 9.0], (6.75, 9.0], (6.75, 9.0]]
Categories (4, interval[float64]): [(-0.001, 2.25] < (2.25, 4.5] < (4.5, 6.75] < (6.75, 9.0]]

## Function application