# Essential functionality 
* **Head & Tail**: df.head(), df.tail()
* **Attributes**: df.shape, df.columns, df.values, df.dtypes
* **Descriptive**: df.mean(), df.sum(), df.cumsum(), df.std(), df.min(), df.count()...
* **Index of Min/Max values**: df.idxmin(), df.idxmax()
* **Value Distribution**: df.value_counts()
* **Discretization and Quantiling**: pd.cut(), pd.qcut()
* **Function Application**: df.apply(), df.agg(), df.transform()
* **Reindexing**: df.reindex(), df.reindex_like()
* **Altering labels**: df.drop(), df.rename()
* **Iteration**: df.iterrows(), df.itertuples(), df.iteritems()
* **Sorting**: df.sort_index(), df.sort_values(), df.nsmallest(), df.nlargest()
* **Dtypes**: pd.to_numeric(), pd.to_datetime(), df.astype(), df.select_dtypes(include=, exclude=)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a small 5-row demo frame with mixed column types:
#   A - random integers drawn from [0, 3)
#   B - strings
#   C - integers supplied as a Series
#   D - a mix of strings, integers and NaN, so it is stored as object dtype
df = pd.DataFrame({'A': np.random.randint(3, size=5),
                   'B': ['a', 'b', 'c', 'd', 'e'],
                   'C': pd.Series([1, 4, 5, 6, 2]), 
                   'D': ('one', 1, 'two', 2, np.nan)})
df

Unnamed: 0,A,B,C,D
0,0,a,1,one
1,0,b,4,1
2,0,c,5,two
3,0,d,6,2
4,1,e,2,


## Head and Tail

In [3]:
df.head(3)

Unnamed: 0,A,B,C,D
0,0,a,1,one
1,0,b,4,1
2,0,c,5,two


In [4]:
df.tail(2)

Unnamed: 0,A,B,C,D
3,0,d,6,2.0
4,1,e,2,


## Attributes, Raw ndarrays

In [5]:
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [6]:
df.values

array([[0, 'a', 1L, 'one'],
       [0, 'b', 4L, 1],
       [0, 'c', 5L, 'two'],
       [0, 'd', 6L, 2],
       [1, 'e', 2L, nan]], dtype=object)

In [7]:
df.shape

(5, 4)

In [8]:
df.dtypes

A     int32
B    object
C     int64
D    object
dtype: object

## Descriptive Statistics

In [9]:
df

Unnamed: 0,A,B,C,D
0,0,a,1,one
1,0,b,4,1
2,0,c,5,two
3,0,d,6,2
4,1,e,2,


In [10]:
# Column-wise mean. numeric_only=True restricts the calculation to the numeric
# columns (A and C); without it, recent pandas versions raise TypeError on the
# object columns B and D instead of silently skipping them.
df.mean(numeric_only=True)  # likewise sum(), std(), ...

A    0.2
C    3.6
dtype: float64

In [11]:
# Row-wise mean (axis=1). numeric_only=True keeps only the numeric columns
# (A and C) so the object columns B and D do not raise TypeError on recent pandas.
df.mean(axis=1, numeric_only=True)

0    0.5
1    2.0
2    2.5
3    3.0
4    1.5
dtype: float64

In [12]:
# Overwrite the first cell (row 0, column A) with NaN so the following
# examples can show how missing values are handled; column A is displayed
# as floats afterwards.
df.iloc[0, 0] = np.nan
df

Unnamed: 0,A,B,C,D
0,,a,1,one
1,0.0,b,4,1
2,0.0,c,5,two
3,0.0,d,6,2
4,1.0,e,2,


In [13]:
# The first positional argument is the axis: 0 sums down each column.
# skipna=True ignores NaN values while summing.
df.sum(0, skipna=True)

A        1
B    abcde
C       18
dtype: object

In [14]:
df['C'].cumsum()

0     1
1     5
2    10
3    16
4    18
Name: C, dtype: int64

In [15]:
df['C'].count()

5

In [16]:
df['C'].min() # max()

1

In [17]:
# return the number of unique non-NA values
df.nunique()

A    2
B    5
C    5
D    4
dtype: int64

In [18]:
df.describe(include='all') # include = ...

Unnamed: 0,A,B,C,D
count,4.0,5,5.0,4
unique,,5,,4
top,,d,,one
freq,,1,,1
mean,0.25,,3.6,
std,0.5,,2.073644,
min,0.0,,1.0,
25%,0.0,,2.0,
50%,0.0,,4.0,
75%,0.25,,5.0,


## Index of Min/Max Values

In [19]:
# return the index labels with the minimum values
df.C.idxmin()

0

In [20]:
# return the index labels with the maximum values
df.C.idxmax()

3

## Value counts
This is a very helpful function that can help describe the distribution of data.

In [21]:
df.C.value_counts()

6    1
5    1
4    1
2    1
1    1
Name: C, dtype: int64

## Discretization and Quantiling

In [22]:
arr = np.random.randint(5,size=20)
arr

array([1, 3, 2, 1, 1, 1, 4, 0, 4, 3, 0, 0, 2, 4, 3, 2, 3, 3, 0, 1])

In [23]:
# Split the value range of arr into 3 equal-width bins and label each element with its bin
pd.cut(arr,3)

[(-0.004, 1.333], (2.667, 4.0], (1.333, 2.667], (-0.004, 1.333], (-0.004, 1.333], ..., (1.333, 2.667], (2.667, 4.0], (2.667, 4.0], (-0.004, 1.333], (-0.004, 1.333]]
Length: 20
Categories (3, interval[float64]): [(-0.004, 1.333] < (1.333, 2.667] < (2.667, 4.0]]

In [24]:
# 20 random integers drawn from [0, 10)
arr1 = np.random.randint(10, size=20)
# Bin by quantiles: the list gives the quantile edges, here producing quartiles
pd.qcut(arr1, [0, 0.25, 0.5, 0.75, 1])

[(0.999, 3.0], (0.999, 3.0], (3.0, 5.0], (8.0, 9.0], (0.999, 3.0], ..., (5.0, 8.0], (0.999, 3.0], (3.0, 5.0], (5.0, 8.0], (5.0, 8.0]]
Length: 20
Categories (4, interval[float64]): [(0.999, 3.0] < (3.0, 5.0] < (5.0, 8.0] < (8.0, 9.0]]

## Apply

In [25]:
df = pd.DataFrame({'A': np.random.randint(5, size=5),
                   'B': np.random.randint(3, size=5)})
df

Unnamed: 0,A,B
0,2,2
1,3,1
2,0,1
3,1,1
4,3,1


In [26]:
df.apply(np.mean, axis=1)

0    2.0
1    2.0
2    0.5
3    1.0
4    2.0
dtype: float64

In [27]:
df.apply(lambda x:x.max() - x.min(), axis=0)

A    3
B    1
dtype: int64

## Aggregating 

In [28]:
# Apply several aggregations at once; the result has one row per aggregation.
# The aggregations are given by name: passing NumPy callables such as np.sum
# to agg() is deprecated in recent pandas, and the string names give the same result.
df.agg(['sum', 'mean'])

Unnamed: 0,A,B
sum,9.0,6.0
mean,1.8,1.2


## Transform

In [29]:
# add one for all values
df.transform(lambda x: x+1)

Unnamed: 0,A,B
0,3,3
1,4,2
2,1,2
3,2,2
4,4,2


## Reindex

In [30]:
df

Unnamed: 0,A,B
0,2,2
1,3,1
2,0,1
3,1,1
4,3,1


In [31]:
# Reorder rows and columns by label (the `axis` argument can be used instead
# of the index=/columns= keywords)
df1 = df.reindex(index=[1, 0, 3, 5, 4, 2], columns=['B', 'A', 'D'])
df1 
# note: labels that do not exist in df (index 5, column 'D') are filled with NaN

Unnamed: 0,B,A,D
1,1.0,3.0,
0,2.0,2.0,
3,1.0,1.0,
5,,,
4,1.0,3.0,
2,1.0,0.0,


In [32]:
# Build a reordered copy of df to use as the template for reindex_like
df2 = df.reindex(index=[1, 0, 3, 4, 2], columns=['B', 'A'])
df2

Unnamed: 0,B,A
1,1,3
0,2,2
3,1,1
4,1,3
2,1,0


In [33]:
# reindex_like: conform df to the row and column labels of df2
df.reindex_like(df2)

Unnamed: 0,B,A
1,1,3
0,2,2
3,1,1
4,1,3
2,1,0


## Drop

In [34]:
df1

Unnamed: 0,B,A,D
1,1.0,3.0,
0,2.0,2.0,
3,1.0,1.0,
5,,,
4,1.0,3.0,
2,1.0,0.0,


In [35]:
df1.drop(['D'], axis=1)

Unnamed: 0,B,A
1,1.0,3.0
0,2.0,2.0
3,1.0,1.0
5,,
4,1.0,3.0
2,1.0,0.0


## Rename / Mapping labels

In [36]:
df.rename(columns={'A': 'One', 'B': 'Two'},
          index={0:'5'})

Unnamed: 0,One,Two
5,2,2
1,3,1
2,0,1
3,1,1
4,3,1


## Iteration
1. iterrows()
2. itertuples() is faster than iterrows() and is the preferable way to iterate over the values of a DataFrame
3. iteritems()

In [37]:
# iterrows() yields (index label, row as a Series) pairs
for index, row in df.iterrows():
    print(row)  # call form of print works on both Python 2 and Python 3

A    2
B    2
Name: 0, dtype: int32
A    3
B    1
Name: 1, dtype: int32
A    0
B    1
Name: 2, dtype: int32
A    1
B    1
Name: 3, dtype: int32
A    3
B    1
Name: 4, dtype: int32


In [38]:
# iteritems() yields (column name, column as a Series) pairs.
# NOTE: iteritems() was removed in pandas 2.0; on newer versions use df.items().
for key, value in df.iteritems():
    # call form of print works on both Python 2 and Python 3
    print(key)
    print(value)

A
0    2
1    3
2    0
3    1
4    3
Name: A, dtype: int32
B
0    2
1    1
2    1
3    1
4    1
Name: B, dtype: int32


In [39]:
# itertuples() yields one namedtuple per row, with the index label as the first field
for row in df.itertuples():
    print(row)  # call form of print works on both Python 2 and Python 3

Pandas(Index=0, A=2, B=2)
Pandas(Index=1, A=3, B=1)
Pandas(Index=2, A=0, B=1)
Pandas(Index=3, A=1, B=1)
Pandas(Index=4, A=3, B=1)


## Sorting
1. by Index
2. by Values

In [40]:
# by index
df1.sort_index(axis=1) # axis = 0 or 1

Unnamed: 0,A,B,D
1,3.0,1.0,
0,2.0,2.0,
3,1.0,1.0,
5,,,
4,3.0,1.0,
2,0.0,1.0,


In [41]:
# by values: `by` lists the sort keys in priority order (A first, then B);
# `na_position` controls whether rows with NaN are placed first or last
df1.sort_values(by=['A', 'B'], na_position='first')

Unnamed: 0,B,A,D
5,,,
2,1.0,0.0,
3,1.0,1.0,
0,2.0,2.0,
1,1.0,3.0,
4,1.0,3.0,


## nsmallest, nlargest

In [42]:
df.A.nsmallest(3)

2    0
3    1
0    2
Name: A, dtype: int32

In [43]:
df.B.nlargest(3)

0    2
1    1
2    1
Name: B, dtype: int32

## object conversion
1. to_numeric()
2. to_datetime()

In [44]:
df3 = pd.DataFrame({'A': ['1', '2', '3'], 'B': ['2017-09-03', '2016-08-02', '2012-01-24'], 'C':[5, 3, 1]})
df3

Unnamed: 0,A,B,C
0,1,2017-09-03,5
1,2,2016-08-02,3
2,3,2012-01-24,1


In [45]:
df3.dtypes

A    object
B    object
C     int64
dtype: object

In [46]:
pd.to_numeric(df3.A)

0    1
1    2
2    3
Name: A, dtype: int64

In [47]:
pd.to_datetime(df3.B)

0   2017-09-03
1   2016-08-02
2   2012-01-24
Name: B, dtype: datetime64[ns]

## select_dtypes(include = , exclude = ,)

In [48]:
df3.dtypes

A    object
B    object
C     int64
dtype: object

In [49]:
df3.select_dtypes(include=['number'])

Unnamed: 0,C
0,5
1,3
2,1


In [50]:
df3.select_dtypes(exclude=['object'])

Unnamed: 0,C
0,5
1,3
2,1


## Reference
+ [Essential Basic Functionality](https://pandas.pydata.org/pandas-docs/stable/basics.html#descriptive-statistics)