# Pandas for Data Analysis (Part 1 of 2)

In [1]:
import numpy as np
import pandas as pd

### Creating a DatetimeIndex

In [2]:
index = pd.date_range('2018-01-01 20:30:40', periods=10, freq='2S')
index

DatetimeIndex(['2018-01-01 20:30:40', '2018-01-01 20:30:42',
               '2018-01-01 20:30:44', '2018-01-01 20:30:46',
               '2018-01-01 20:30:48', '2018-01-01 20:30:50',
               '2018-01-01 20:30:52', '2018-01-01 20:30:54',
               '2018-01-01 20:30:56', '2018-01-01 20:30:58'],
              dtype='datetime64[ns]', freq='2S')

In [3]:
index = pd.date_range('2018-01-01 00:00:00', periods=4, freq='D')
index

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04'], dtype='datetime64[ns]', freq='D')

- Commonly Used Frequency Letters:
    - A: year end
    - M: month end
    - W: weekly
    - D: calendar day
    - B: business day
    - H: hourly
    - BH: business hour
    - T: minutely
    - S: secondly
    - L: milliseconds

### Creating a DataFrame

- A `DataFrame` is a 2-dimensional labeled data structure with columns of potentially different types. 

In [4]:
df = pd.DataFrame(np.random.randn(4, 3), index=index, columns=['A','B','C'])
df

Unnamed: 0,A,B,C
2018-01-01,-0.556326,-1.619785,0.368533
2018-01-02,-1.413169,2.012528,0.632755
2018-01-03,0.384332,-0.964538,0.913506
2018-01-04,-0.191651,0.490828,0.578182


In [5]:
df.index.name = 'DateTime'
df

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,-0.556326,-1.619785,0.368533
2018-01-02,-1.413169,2.012528,0.632755
2018-01-03,0.384332,-0.964538,0.913506
2018-01-04,-0.191651,0.490828,0.578182


- We can create a `DataFrame` based on a `dict`.

In [6]:
dict1 = {'one': pd.Series([1, 2], index=['a', 'b']),
         'two': pd.Series(['t1','t2','t3'], index=['a', 'b', 'c'])}
df2 = pd.DataFrame(dict1)
df2

Unnamed: 0,one,two
a,1.0,t1
b,2.0,t2
c,,t3


- A pandas `DataFrame` can contain **heterogeneous** data;
- A numpy `ndarray` can contain **homogeneous** data only.

### Viewing the Created DataFrame

In [7]:
df  # See the entire DataFrame

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,-0.556326,-1.619785,0.368533
2018-01-02,-1.413169,2.012528,0.632755
2018-01-03,0.384332,-0.964538,0.913506
2018-01-04,-0.191651,0.490828,0.578182


In [8]:
df.head(2) # See the top 2 rows

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,-0.556326,-1.619785,0.368533
2018-01-02,-1.413169,2.012528,0.632755


In [9]:
df.tail(3) # See the bottom 3 rows

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-02,-1.413169,2.012528,0.632755
2018-01-03,0.384332,-0.964538,0.913506
2018-01-04,-0.191651,0.490828,0.578182


In [10]:
df.index  # Display the index only

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04'], dtype='datetime64[ns]', name='DateTime', freq='D')

In [11]:
df.columns  # Display the column names only

Index(['A', 'B', 'C'], dtype='object')

In [12]:
df.values  # Display the values only

array([[-0.55632588, -1.61978517,  0.36853259],
       [-1.41316939,  2.01252817,  0.63275516],
       [ 0.38433219, -0.96453825,  0.91350572],
       [-0.19165132,  0.49082757,  0.57818247]])

In [13]:
df.describe()  # Show the basic statistics

Unnamed: 0,A,B,C
count,4.0,4.0,4.0
mean,-0.444204,-0.020242,0.623244
std,0.753153,1.61695,0.224537
min,-1.413169,-1.619785,0.368533
25%,-0.770537,-1.12835,0.52577
50%,-0.373989,-0.236855,0.605469
75%,-0.047655,0.871253,0.702943
max,0.384332,2.012528,0.913506


In [14]:
df.T  # Show the transpose

DateTime,2018-01-01 00:00:00,2018-01-02 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00
A,-0.556326,-1.413169,0.384332,-0.191651
B,-1.619785,2.012528,-0.964538,0.490828
C,0.368533,0.632755,0.913506,0.578182


In [15]:
df  # Transposing is not done in place

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,-0.556326,-1.619785,0.368533
2018-01-02,-1.413169,2.012528,0.632755
2018-01-03,0.384332,-0.964538,0.913506
2018-01-04,-0.191651,0.490828,0.578182


In [16]:
df.sort_index(axis=0, ascending=False)  # Sort by index

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-04,-0.191651,0.490828,0.578182
2018-01-03,0.384332,-0.964538,0.913506
2018-01-02,-1.413169,2.012528,0.632755
2018-01-01,-0.556326,-1.619785,0.368533


In [17]:
df.sort_index(axis=1, ascending=False)  # Sort by column

Unnamed: 0_level_0,C,B,A
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,0.368533,-1.619785,-0.556326
2018-01-02,0.632755,2.012528,-1.413169
2018-01-03,0.913506,-0.964538,0.384332
2018-01-04,0.578182,0.490828,-0.191651


In [18]:
df.sort_values(by='B', ascending=True)  # Sort by values

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,-0.556326,-1.619785,0.368533
2018-01-03,0.384332,-0.964538,0.913506
2018-01-04,-0.191651,0.490828,0.578182
2018-01-02,-1.413169,2.012528,0.632755


### Data Selection (Indexing and Slicing)

In [19]:
df

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,-0.556326,-1.619785,0.368533
2018-01-02,-1.413169,2.012528,0.632755
2018-01-03,0.384332,-0.964538,0.913506
2018-01-04,-0.191651,0.490828,0.578182


In [20]:
df['A']  # Selecting a single column

DateTime
2018-01-01   -0.556326
2018-01-02   -1.413169
2018-01-03    0.384332
2018-01-04   -0.191651
Freq: D, Name: A, dtype: float64

In [21]:
df.loc['20180102':'20180104', ['B', 'C']]  # Selecting by label

Unnamed: 0_level_0,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,2.012528,0.632755
2018-01-03,-0.964538,0.913506
2018-01-04,0.490828,0.578182


In [22]:
df.iloc[1:3, [0, 2]]  # Selecting by position

Unnamed: 0_level_0,A,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,-1.413169,0.632755
2018-01-03,0.384332,0.913506


In [23]:
df

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,-0.556326,-1.619785,0.368533
2018-01-02,-1.413169,2.012528,0.632755
2018-01-03,0.384332,-0.964538,0.913506
2018-01-04,-0.191651,0.490828,0.578182


In [24]:
df.A > 0

DateTime
2018-01-01    False
2018-01-02    False
2018-01-03     True
2018-01-04    False
Freq: D, Name: A, dtype: bool

In [25]:
df[df.A > 0]  # Boolean indexing

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-03,0.384332,-0.964538,0.913506


In [26]:
df > 0

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,False,False,True
2018-01-02,False,True,True
2018-01-03,True,False,True
2018-01-04,False,True,True


In [27]:
df[df > 0]

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,,,0.368533
2018-01-02,,2.012528,0.632755
2018-01-03,0.384332,,0.913506
2018-01-04,,0.490828,0.578182


In [28]:
df2

Unnamed: 0,one,two
a,1.0,t1
b,2.0,t2
c,,t3


In [29]:
fil = df2['two'].isin(['t1','t3','t4'])  # Using isin for filtering
fil

a     True
b    False
c     True
Name: two, dtype: bool

In [30]:
df2[fil]

Unnamed: 0,one,two
a,1.0,t1
c,,t3


### Setting New Data

In [31]:
df

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,-0.556326,-1.619785,0.368533
2018-01-02,-1.413169,2.012528,0.632755
2018-01-03,0.384332,-0.964538,0.913506
2018-01-04,-0.191651,0.490828,0.578182


In [32]:
df['B'] = 1
df

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,-0.556326,1,0.368533
2018-01-02,-1.413169,1,0.632755
2018-01-03,0.384332,1,0.913506
2018-01-04,-0.191651,1,0.578182


In [33]:
df.iloc[0, 2] = 100
df

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,-0.556326,1,100.0
2018-01-02,-1.413169,1,0.632755
2018-01-03,0.384332,1,0.913506
2018-01-04,-0.191651,1,0.578182


In [34]:
df[df < 0] = -df
df

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,0.556326,1,100.0
2018-01-02,1.413169,1,0.632755
2018-01-03,0.384332,1,0.913506
2018-01-04,0.191651,1,0.578182


### Missing Data or Empty Cells

In [35]:
df.iloc[[0, 3], 1] = np.NaN  # Not a number
df

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,0.556326,,100.0
2018-01-02,1.413169,1.0,0.632755
2018-01-03,0.384332,1.0,0.913506
2018-01-04,0.191651,,0.578182


In [36]:
df.dropna(how='any')  # Drop the rows with any missing data (not in-place)

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-02,1.413169,1.0,0.632755
2018-01-03,0.384332,1.0,0.913506


In [37]:
df.fillna(value=-1)  # Fill missing data with a value (not in-place)

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,0.556326,-1.0,100.0
2018-01-02,1.413169,1.0,0.632755
2018-01-03,0.384332,1.0,0.913506
2018-01-04,0.191651,-1.0,0.578182


In [38]:
df

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,0.556326,,100.0
2018-01-02,1.413169,1.0,0.632755
2018-01-03,0.384332,1.0,0.913506
2018-01-04,0.191651,,0.578182


In [39]:
pd.isnull(df)  # Get the boolean mask where values are NaN

Unnamed: 0_level_0,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,False,True,False
2018-01-02,False,False,False
2018-01-03,False,False,False
2018-01-04,False,True,False


### Course Materials on YouTube and GitHub

- Course videos are hosted by YouTube ( http://youtube.com/yongtwang ).
- Course documents (Jupyter Notebooks and Python source code) are hosted by GitHub ( http://github.com/yongtwang ).