In [1]:
import pandas as pd
import numpy  as np

In [2]:
# Create a Series and let Pandas create a default integer index
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [3]:
# dtype = float 64
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Creating a DataFrame by passing a NumPy array
# with a datetime index and labeled columns
# Type(dates): pandas.core.indexes.datetimes.DatetimeIndex
dates = pd.date_range('20180101', periods=6)

In [5]:
dates

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# Create a Dataframe here
df = pd.DataFrame(np.random.randn(6, 4),
                 index=dates,
                 columns=list('ABCD'))

In [7]:
df

Unnamed: 0,A,B,C,D
2018-01-01,-1.039937,2.341353,-1.262901,-1.539084
2018-01-02,-0.154367,0.852368,0.699856,-0.35087
2018-01-03,0.623281,-1.699634,-0.784289,-0.250227
2018-01-04,0.176548,-1.08637,-0.226455,0.171448
2018-01-05,-2.143135,-1.05451,-0.29226,0.817536
2018-01-06,-0.948526,-0.120819,-1.003916,-0.172403


In [8]:
# Create a DataFrame by passing a dict of objects
# that can be converted to series-like.
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'food'})

In [9]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,food
1,1.0,2013-01-02,1.0,3,train,food
2,1.0,2013-01-02,1.0,3,test,food
3,1.0,2013-01-02,1.0,3,train,food


In [10]:
# The columns of the resulting DataFrame can have different dtypes
df2.dtypes

# Only for IPython
# df2.<TAB>

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [11]:
# how to view the top and bottom rows of the frame
df.head()

Unnamed: 0,A,B,C,D
2018-01-01,-1.039937,2.341353,-1.262901,-1.539084
2018-01-02,-0.154367,0.852368,0.699856,-0.35087
2018-01-03,0.623281,-1.699634,-0.784289,-0.250227
2018-01-04,0.176548,-1.08637,-0.226455,0.171448
2018-01-05,-2.143135,-1.05451,-0.29226,0.817536


In [12]:
df.tail(3)

Unnamed: 0,A,B,C,D
2018-01-04,0.176548,-1.08637,-0.226455,0.171448
2018-01-05,-2.143135,-1.05451,-0.29226,0.817536
2018-01-06,-0.948526,-0.120819,-1.003916,-0.172403


In [13]:
# Display the index
df.index

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# Display the columns
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
# DataFrame.to_numpy() gives a Numpy representation of the underlaying data
# This can be expensive operation, because numpy array can only be 1 type.
# If your DataFrame has multiple data type, then every single element
# has to be converted into a Python object.

# In our example, it is not an issue because it is of the same type
df.to_numpy()

array([[-1.03993745,  2.34135333, -1.26290134, -1.53908352],
       [-0.1543671 ,  0.85236819,  0.69985562, -0.35087008],
       [ 0.6232809 , -1.69963362, -0.78428857, -0.25022677],
       [ 0.17654773, -1.08637028, -0.22645459,  0.17144849],
       [-2.14313549, -1.05451012, -0.29226036,  0.81753569],
       [-0.94852647, -0.12081852, -1.0039164 , -0.17240318]])

In [16]:
# For df2, the DataFrame with multiple dtypes, DataFrame.to_numpy() 
# is relatively expensive (slow)
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'food'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'food'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'food'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'food']],
      dtype=object)

In [17]:
# DataFrame.to_numpy() does not include the index or column labels
# in the output. Use describe() shows a quick statistic summary 
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.581023,-0.127935,-0.478328,-0.2206
std,0.999215,1.503716,0.703077,0.773304
min,-2.143135,-1.699634,-1.262901,-1.539084
25%,-1.017085,-1.078405,-0.949009,-0.325709
50%,-0.551447,-0.587664,-0.538274,-0.211315
75%,0.093819,0.609072,-0.242906,0.085486
max,0.623281,2.341353,0.699856,0.817536


In [18]:
# Transposing your data, capital T
df.T

Unnamed: 0,2018-01-01 00:00:00,2018-01-02 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-05 00:00:00,2018-01-06 00:00:00
A,-1.039937,-0.154367,0.623281,0.176548,-2.143135,-0.948526
B,2.341353,0.852368,-1.699634,-1.08637,-1.05451,-0.120819
C,-1.262901,0.699856,-0.784289,-0.226455,-0.29226,-1.003916
D,-1.539084,-0.35087,-0.250227,0.171448,0.817536,-0.172403


In [19]:
# Sort by an axis:
# Axis of 1 is the header row, and ascending = False means reverse
# From largest to smallest.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2018-01-01,-1.539084,-1.262901,2.341353,-1.039937
2018-01-02,-0.35087,0.699856,0.852368,-0.154367
2018-01-03,-0.250227,-0.784289,-1.699634,0.623281
2018-01-04,0.171448,-0.226455,-1.08637,0.176548
2018-01-05,0.817536,-0.29226,-1.05451,-2.143135
2018-01-06,-0.172403,-1.003916,-0.120819,-0.948526


In [20]:
# Sort by Values:
df.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2018-01-01,-1.039937,2.341353,-1.262901,-1.539084
2018-01-02,-0.154367,0.852368,0.699856,-0.35087
2018-01-06,-0.948526,-0.120819,-1.003916,-0.172403
2018-01-05,-2.143135,-1.05451,-0.29226,0.817536
2018-01-04,0.176548,-1.08637,-0.226455,0.171448
2018-01-03,0.623281,-1.699634,-0.784289,-0.250227


In [21]:
# While the standard Python / Numpy expressions works
# For production code we recommend using pandas data access method
# such as .at, .iat, .loc, and .iloc

# Getting
# Selecting a single columns and return a series
# same as using df.A
df['A']

2018-01-01   -1.039937
2018-01-02   -0.154367
2018-01-03    0.623281
2018-01-04    0.176548
2018-01-05   -2.143135
2018-01-06   -0.948526
Freq: D, Name: A, dtype: float64

In [22]:
# The same as df['A']
# Both returns a series, this one is type float64
df.A

2018-01-01   -1.039937
2018-01-02   -0.154367
2018-01-03    0.623281
2018-01-04    0.176548
2018-01-05   -2.143135
2018-01-06   -0.948526
Freq: D, Name: A, dtype: float64

In [23]:
# Selecting via [], which slices the rows
df[0:3]

Unnamed: 0,A,B,C,D
2018-01-01,-1.039937,2.341353,-1.262901,-1.539084
2018-01-02,-0.154367,0.852368,0.699856,-0.35087
2018-01-03,0.623281,-1.699634,-0.784289,-0.250227


In [24]:
# Selecting via [], which slices the values of the row
# Does it have to be sorted?
df['20180102':'20180104']

Unnamed: 0,A,B,C,D
2018-01-02,-0.154367,0.852368,0.699856,-0.35087
2018-01-03,0.623281,-1.699634,-0.784289,-0.250227
2018-01-04,0.176548,-1.08637,-0.226455,0.171448


In [25]:
# Selection by Label

# For getting a cross section using a label
# Get the cross section of a given row, 
# dates[0] is 20180102
df.loc[dates[0]]

A   -1.039937
B    2.341353
C   -1.262901
D   -1.539084
Name: 2018-01-01 00:00:00, dtype: float64

In [26]:
# Selecting on a multi-axis by label
# Select all the rows, and column and index
# df.loc[row_range, column_range]
# df.loc[row_list, column_list]
df.loc[:, ['A', 'B']]

# Label slicing, both endpoints are included
df.loc['20180102':'20180104', 'B':'D']

# By range of the rows
# The column you can't slice by index ...
# This will give an error message
# df.loc[0:3, 0:1]

Unnamed: 0,B,C,D
2018-01-02,0.852368,0.699856,-0.35087
2018-01-03,-1.699634,-0.784289,-0.250227
2018-01-04,-1.08637,-0.226455,0.171448


In [27]:
# Reduction in the dimensions of the returned object
# pandas.core.series.Series
df.loc['20180103', ['A', 'B']]

A    0.623281
B   -1.699634
Name: 2018-01-03 00:00:00, dtype: float64

In [28]:
# For getting a scalar value (Number)
df.loc[dates[0], 'A']

-1.0399374477199097

In [29]:
# For fastest access to a scala (equivalent to the prior method)
# one uses at, the slower one is loc
df.at[dates[0], 'A']

-1.0399374477199097

# Selection by Position

In [30]:
# Select via the position of the passed integers
# This is the data from row 4
print(df)
df.iloc[3]

                   A         B         C         D
2018-01-01 -1.039937  2.341353 -1.262901 -1.539084
2018-01-02 -0.154367  0.852368  0.699856 -0.350870
2018-01-03  0.623281 -1.699634 -0.784289 -0.250227
2018-01-04  0.176548 -1.086370 -0.226455  0.171448
2018-01-05 -2.143135 -1.054510 -0.292260  0.817536
2018-01-06 -0.948526 -0.120819 -1.003916 -0.172403


A    0.176548
B   -1.086370
C   -0.226455
D    0.171448
Name: 2018-01-04 00:00:00, dtype: float64

In [31]:
# By integer slices, similiar to numpy and python
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2018-01-04,0.176548,-1.08637
2018-01-05,-2.143135,-1.05451


In [32]:
# By lists of integer position locations, similiar to the numpy
# and python style.
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2018-01-02,-0.154367,0.699856
2018-01-03,0.623281,-0.784289
2018-01-05,-2.143135,-0.29226


In [33]:
# For slicing rows explicitly
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2018-01-02,-0.154367,0.852368,0.699856,-0.35087
2018-01-03,0.623281,-1.699634,-0.784289,-0.250227


In [34]:
# For slicing columns explicitly
df.iloc[:, 1:3]

Unnamed: 0,B,C
2018-01-01,2.341353,-1.262901
2018-01-02,0.852368,0.699856
2018-01-03,-1.699634,-0.784289
2018-01-04,-1.08637,-0.226455
2018-01-05,-1.05451,-0.29226
2018-01-06,-0.120819,-1.003916


In [35]:
# For getting a value explicitly
df.iloc[1, 1]

0.8523681920513916

In [36]:
# For faster access to a scalar 
df.iat[1, 1]

0.8523681920513916

# Boolean Indexing

In [37]:
# Using a single column's value to select data
df[df.A > 0]

Unnamed: 0,A,B,C,D
2018-01-03,0.623281,-1.699634,-0.784289,-0.250227
2018-01-04,0.176548,-1.08637,-0.226455,0.171448


In [38]:
# Selecting values from a DataFrame where boolean condition is met
df[df > 0]

Unnamed: 0,A,B,C,D
2018-01-01,,2.341353,,
2018-01-02,,0.852368,0.699856,
2018-01-03,0.623281,,,
2018-01-04,0.176548,,,0.171448
2018-01-05,,,,0.817536
2018-01-06,,,,


In [39]:
# using the isin() method for filtering
df2 = df.copy()

# Adding a new column by giving it value
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2018-01-01,-1.039937,2.341353,-1.262901,-1.539084,one
2018-01-02,-0.154367,0.852368,0.699856,-0.35087,one
2018-01-03,0.623281,-1.699634,-0.784289,-0.250227,two
2018-01-04,0.176548,-1.08637,-0.226455,0.171448,three
2018-01-05,-2.143135,-1.05451,-0.29226,0.817536,four
2018-01-06,-0.948526,-0.120819,-1.003916,-0.172403,three


In [40]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2018-01-03,0.623281,-1.699634,-0.784289,-0.250227,two
2018-01-05,-2.143135,-1.05451,-0.29226,0.817536,four


# Setting

In [41]:
# Setting a new column automatically aligns the data
s1 = pd.Series([1, 2, 3, 4, 5, 6], 
               index=pd.date_range('20180102', periods=6))
df['F'] = s1
s1

2018-01-02    1
2018-01-03    2
2018-01-04    3
2018-01-05    4
2018-01-06    5
2018-01-07    6
Freq: D, dtype: int64

In [42]:
# Setting values by label
df.at[dates[0], 'A'] = 0

# Setting values by position
df.iat[0, 1] = 0

# Setting by assigning with a Numpy Array:
df.loc[:, 'D'] = np.array([5] * len(df))

In [43]:
# A where operation with setting
df2 = df.copy()

# If the values of df2 is greater 0, then assign a negative values
# or multiply by -1
df[df2 > 0] = -df2

# Missing DataM

In [46]:
# Pandas's Axis, 0 = rows, 1 = columns
# https://i.stack.imgur.com/YiAo9.png

In [47]:
# Pandas use np.nan to represent missing data, this is by default not included
# in the comutations. Reindexing allows you to change/add/delete

In [48]:
df1 = df.reindex(index=dates[0:4],
                 columns=list(df.columns) + ['E'])

In [53]:
# pandas.DataFrame.loc - access a group of rows by labels or boolean array
# df1.loc[row, column] ...
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2018-01-01,0.0,0.0,-1.262901,-5,,1.0
2018-01-02,-0.154367,-0.852368,-0.699856,-5,-1.0,1.0
2018-01-03,-0.623281,-1.699634,-0.784289,-5,-2.0,
2018-01-04,-0.176548,-1.08637,-0.226455,-5,-3.0,


In [54]:
# To drop any rows that have missing data
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2018-01-02,-0.154367,-0.852368,-0.699856,-5,-1.0,1.0


In [55]:
# Filling missing data
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2018-01-01,0.0,0.0,-1.262901,-5,5.0,1.0
2018-01-02,-0.154367,-0.852368,-0.699856,-5,-1.0,1.0
2018-01-03,-0.623281,-1.699634,-0.784289,-5,-2.0,5.0
2018-01-04,-0.176548,-1.08637,-0.226455,-5,-3.0,5.0


In [56]:
# To get the boolean mask where values are nan
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2018-01-01,False,False,False,False,True,False
2018-01-02,False,False,False,False,False,False
2018-01-03,False,False,False,False,False,True
2018-01-04,False,False,False,False,False,True


# Operation

In [57]:
# Operation in general exclude missing data
# Performing a descriptive statistic:
df.mean()

A   -0.674310
B   -0.802283
C   -0.711613
D   -5.000000
F   -3.000000
dtype: float64

In [58]:
# Same operation by on the other axis:
df.mean(1)

2018-01-01   -1.565725
2018-01-02   -1.541318
2018-01-03   -2.021441
2018-01-04   -1.897875
2018-01-05   -2.497981
2018-01-06   -2.414652
Freq: D, dtype: float64

In [61]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=list(range(0, 6))).shift(2)

In [62]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2018-01-01 00:00:00,,,,,
2018-01-02 00:00:00,,,,,
2018-01-03 00:00:00,,,,,
2018-01-04 00:00:00,,,,,
2018-01-05 00:00:00,,,,,
2018-01-06 00:00:00,,,,,
0,,,,,
1,,,,,
2,,,,,
3,,,,,


# Apply

In [63]:
# Applying functions to the data:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2018-01-01,0.0,0.0,-1.262901,-5,
2018-01-02,-0.154367,-0.852368,-1.962757,-10,-1.0
2018-01-03,-0.777648,-2.552002,-2.747046,-15,-3.0
2018-01-04,-0.954196,-3.638372,-2.9735,-20,-6.0
2018-01-05,-3.097331,-4.692882,-3.26576,-25,-10.0
2018-01-06,-4.045858,-4.813701,-4.269677,-30,-15.0


In [64]:
df.apply(lambda x: x.max() - x.min())

A    2.143135
B    1.699634
C    1.036447
D    0.000000
F    4.000000
dtype: float64

In [65]:
s = pd.Series(['A', 'B', 'C', 'Aaba', np.nan, 'CABA', 'dog', 'cat'])

In [66]:
s

0       A
1       B
2       C
3    Aaba
4     NaN
5    CABA
6     dog
7     cat
dtype: object

In [68]:
# Convert the pandas series s to lower case
s.str.lower()

0       a
1       b
2       c
3    aaba
4     NaN
5    caba
6     dog
7     cat
dtype: object

# Merge

In [71]:
# Easily combining together Series, DataFrame and Pandas
# 10 rows and 4 columns
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,2.62403,-1.735929,1.116723,0.571828
1,-0.192374,1.311253,-0.759419,0.204992
2,-1.706395,-2.045635,0.02328,-0.439553
3,-2.098655,-0.687155,-2.369166,-1.55207
4,-1.146291,-0.632602,-0.337493,-0.956414
5,0.110846,1.129436,2.287677,1.402385
6,-0.253389,0.360327,0.497867,-1.6591
7,-0.313342,0.015751,0.822692,-2.116902
8,-0.349565,-1.797301,-0.500155,0.013583
9,-1.364996,0.028013,0.725263,-1.785979


In [73]:
# Break it into pieces
# Note pieces is actually list of pandas DataFrame
pieces = [df[:3], df[3:7], df[7:]]
pieces

[          0         1         2         3
 0  2.624030 -1.735929  1.116723  0.571828
 1 -0.192374  1.311253 -0.759419  0.204992
 2 -1.706395 -2.045635  0.023280 -0.439553,
           0         1         2         3
 3 -2.098655 -0.687155 -2.369166 -1.552070
 4 -1.146291 -0.632602 -0.337493 -0.956414
 5  0.110846  1.129436  2.287677  1.402385
 6 -0.253389  0.360327  0.497867 -1.659100,
           0         1         2         3
 7 -0.313342  0.015751  0.822692 -2.116902
 8 -0.349565 -1.797301 -0.500155  0.013583
 9 -1.364996  0.028013  0.725263 -1.785979]

In [76]:
type(pieces)

list

In [77]:
type(pieces[1])

pandas.core.frame.DataFrame

In [78]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,2.62403,-1.735929,1.116723,0.571828
1,-0.192374,1.311253,-0.759419,0.204992
2,-1.706395,-2.045635,0.02328,-0.439553
3,-2.098655,-0.687155,-2.369166,-1.55207
4,-1.146291,-0.632602,-0.337493,-0.956414
5,0.110846,1.129436,2.287677,1.402385
6,-0.253389,0.360327,0.497867,-1.6591
7,-0.313342,0.015751,0.822692,-2.116902
8,-0.349565,-1.797301,-0.500155,0.013583
9,-1.364996,0.028013,0.725263,-1.785979


# Join