# 10 Minutes to pandas

This is a short introduction to pandas, geared mainly for new users. You can see more complex recipes in the Cookbook.

Customarily, we import as follows:

In [169]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Object Creation

See the Data Structure Intro section.

Creating a Series by passing a list of values, letting pandas create a default integer index:

In [170]:
# Build a Series from a plain list; pandas supplies the default integer index 0..5.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:

In [171]:
# Display the Series: integer index 0-5, float64 values, one of them NaN.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [172]:
# Six consecutive daily timestamps starting 2017-07-01.
dates = pd.date_range(start='20170701', periods=6, freq='D')

In [173]:
# Build a 6x4 frame of uniform [0, 1) random values, indexed by `dates`, with columns A-D.
# A locally seeded RandomState makes the values identical on every Restart & Run All
# (and leaves NumPy's global random state untouched). Outputs saved from earlier,
# unseeded runs will show different numbers until the notebook is re-executed.
df = pd.DataFrame(
    np.random.RandomState(42).rand(6, 4),
    index=dates,
    columns=list('ABCD'),
)

In [174]:
# Display the whole 6x4 frame: one row per date, columns A-D.
df

Unnamed: 0,A,B,C,D
2017-07-01,0.668505,0.256012,0.088454,0.366526
2017-07-02,0.372194,0.259593,0.131392,0.740838
2017-07-03,0.056692,0.708669,0.255733,0.907439
2017-07-04,0.824013,0.179509,0.795908,0.689344
2017-07-05,0.201423,0.302832,0.215712,0.437765
2017-07-06,0.939856,0.877096,0.302152,0.628505


Creating a DataFrame by passing a dict of objects that can be converted to a series-like structure:

In [64]:
# Build a frame from a dict of heterogeneous values. The scalar entries
# (A, B, F) are repeated on every row to match the four-element columns
# (C, D, E); each column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': 1,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)

In [65]:
# Display df2: the scalar entries (A, B, F) appear on every row.
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1.0,3,test,foo
1,1,2013-01-02,1.0,3,train,foo
2,1,2013-01-02,1.0,3,test,foo
3,1,2013-01-02,1.0,3,train,foo


In [42]:
# Build (and display) a one-column frame from the same float32 Series
# expression used for column C of df2; the result is not assigned to a name.
pd.DataFrame(
{
    'C' : pd.Series(1,index=list(range(4)),dtype='float32')
}
)

Unnamed: 0,C
0,1.0
1,1.0
2,1.0
3,1.0


In [43]:
# Per-column dtypes: each column of df2 keeps its own type.
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

If you’re using IPython, tab completion for column names (as well as public attributes) is automatically enabled. Here’s a subset of the attributes that will be completed:

In [167]:
# Re-display df2.
# NOTE(review): the preceding text introduces tab completion of attributes,
# but this cell only shows the frame itself.
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1.0,3,test,foo
1,1,2013-01-02,1.0,3,train,foo
2,1,2013-01-02,1.0,3,test,foo
3,1,2013-01-02,1.0,3,train,foo


In [168]:
# Display df.
# NOTE(review): the saved output has an integer index 0-5 and values that differ
# from the date-indexed frame built earlier — presumably captured from a
# different kernel state; re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,A,B,C,D
0,0.238209,0.902206,0.479699,0.642928
1,0.139293,0.454752,0.235356,0.783967
2,0.721651,0.337595,0.485675,0.633115
3,0.166322,0.67625,0.463672,0.975147
4,0.428741,0.325252,0.332834,0.805807
5,0.123604,0.418347,0.867431,0.835774


In [121]:
# Single-column frame holding the integers 1 through 6 under the column name 'data'.
num = pd.DataFrame({'data': tuple(range(1, 7))})

In [147]:
# Display df (the saved output shows an integer-indexed version of the frame).
df

Unnamed: 0,A,B,C,D
0,0.238209,0.902206,0.479699,0.642928
1,0.139293,0.454752,0.235356,0.783967
2,0.721651,0.337595,0.485675,0.633115
3,0.166322,0.67625,0.463672,0.975147
4,0.428741,0.325252,0.332834,0.805807
5,0.123604,0.418347,0.867431,0.835774


In [150]:
# First rows of the frame; with no argument, head() shows five rows.
df.head()

Unnamed: 0,A,B,C,D
0,0.238209,0.902206,0.479699,0.642928
1,0.139293,0.454752,0.235356,0.783967
2,0.721651,0.337595,0.485675,0.633115
3,0.166322,0.67625,0.463672,0.975147
4,0.428741,0.325252,0.332834,0.805807


In [152]:
# Last three rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
3,0.166322,0.67625,0.463672,0.975147
4,0.428741,0.325252,0.332834,0.805807
5,0.123604,0.418347,0.867431,0.835774


In [154]:
# Row labels of the frame.
df.index

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [156]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [157]:
# The frame's data as a plain NumPy array (row and column labels are not included).
df.values

array([[ 0.23820886,  0.90220626,  0.47969867,  0.64292815],
       [ 0.13929348,  0.45475217,  0.23535644,  0.78396716],
       [ 0.72165055,  0.33759499,  0.48567543,  0.63311514],
       [ 0.16632156,  0.67624983,  0.46367152,  0.97514682],
       [ 0.42874098,  0.32525213,  0.33283407,  0.80580689],
       [ 0.1236038 ,  0.4183468 ,  0.8674315 ,  0.83577382]])

In [158]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.30297,0.519067,0.477445,0.779456
std,0.233641,0.226398,0.215347,0.128247
min,0.123604,0.325252,0.235356,0.633115
25%,0.146051,0.357783,0.365543,0.678188
50%,0.202265,0.436549,0.471685,0.794887
75%,0.381108,0.620875,0.484181,0.828282
max,0.721651,0.902206,0.867431,0.975147


In [159]:
# Transpose: the column labels become the row labels and vice versa.
df.T

Unnamed: 0,0,1,2,3,4,5
A,0.238209,0.139293,0.721651,0.166322,0.428741,0.123604
B,0.902206,0.454752,0.337595,0.67625,0.325252,0.418347
C,0.479699,0.235356,0.485675,0.463672,0.332834,0.867431
D,0.642928,0.783967,0.633115,0.975147,0.805807,0.835774


In [161]:
# Sort the columns by column name (axis=1), in descending order: D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
0,0.642928,0.479699,0.902206,0.238209
1,0.783967,0.235356,0.454752,0.139293
2,0.633115,0.485675,0.337595,0.721651
3,0.975147,0.463672,0.67625,0.166322
4,0.805807,0.332834,0.325252,0.428741
5,0.835774,0.867431,0.418347,0.123604


In [162]:
# Sort the rows by the values in column B (ascending); the index labels travel with their rows.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
4,0.428741,0.325252,0.332834,0.805807
2,0.721651,0.337595,0.485675,0.633115
5,0.123604,0.418347,0.867431,0.835774
1,0.139293,0.454752,0.235356,0.783967
3,0.166322,0.67625,0.463672,0.975147
0,0.238209,0.902206,0.479699,0.642928


## Selection

In [163]:
# Selecting a single column by name yields a Series named after that column.
df['A']

0    0.238209
1    0.139293
2    0.721651
3    0.166322
4    0.428741
5    0.123604
Name: A, dtype: float64

In [165]:
# Slicing with [] selects rows: here the first three (positions 0, 1 and 2).
df[0:3]

Unnamed: 0,A,B,C,D
0,0.238209,0.902206,0.479699,0.642928
1,0.139293,0.454752,0.235356,0.783967
2,0.721651,0.337595,0.485675,0.633115


In [185]:
# Five daily timestamps starting 2017-08-31; the range rolls over into September.
pd.date_range(periods=5, start='2017-08-31', freq='D')

DatetimeIndex(['2017-08-31', '2017-09-01', '2017-09-02', '2017-09-03',
               '2017-09-04'],
              dtype='datetime64[ns]', freq='D')

In [178]:
# Display the date-indexed frame used in the selection examples that follow.
df

Unnamed: 0,A,B,C,D
2017-07-01,0.668505,0.256012,0.088454,0.366526
2017-07-02,0.372194,0.259593,0.131392,0.740838
2017-07-03,0.056692,0.708669,0.255733,0.907439
2017-07-04,0.824013,0.179509,0.795908,0.689344
2017-07-05,0.201423,0.302832,0.215712,0.437765
2017-07-06,0.939856,0.877096,0.302152,0.628505


### Selection by label
#### loc -> index와 컬럼명으로 데이터에 접근 할 수 있다.

In [212]:
# Select the row labelled with the first date; the result is a Series indexed by column name.
df.loc[dates[0]]

A    3.000000
B    0.256012
C    0.088454
D    0.366526
Name: 2017-07-01 00:00:00, dtype: float64

In [213]:
# All rows (":"), restricted to columns A and B.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2017-07-01,3.0,0.256012
2017-07-02,0.372194,0.259593
2017-07-03,0.056692,0.708669
2017-07-04,0.824013,0.179509
2017-07-05,0.201423,0.302832
2017-07-06,0.939856,0.877096


In [214]:
# Label slices include both endpoints: rows 2017-07-01 and 2017-07-02.
# Passing the column as a list (['A']) keeps the result a one-column DataFrame.
df.loc['2017-07-01':'2017-07-02',['A']]

Unnamed: 0,A
2017-07-01,3.0
2017-07-02,0.372194


In [215]:
# A single row label plus a single column label returns a scalar.
df.loc['2017-07-01','A']

3.0

In [216]:
# Scalar lookup by label: row 2017-07-03, column B.
df.loc['2017-07-03','B']

0.7086690004908256

In [217]:
# Set one value by label: column A of the first date becomes 3. This mutates df in place.
# NOTE(review): the saved outputs of the earlier .loc cells already show A = 3.0 for
# this row, so this assignment had presumably been executed before they were run.
df.loc[dates[0],'A'] = 3

In [218]:
# .at does the same label-based lookup as .loc for a single value, and is faster.
df.at[dates[0],'A']

3.0

### Selection by position
#### 데이터 프레임 ~ 번째 ~ 번째 값에 접근한다. 

In [207]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A    0.824013
B    0.179509
C    0.795908
D    0.689344
Name: 2017-07-04 00:00:00, dtype: float64

In [210]:
# Positional slices exclude the end: rows at positions 3-4, columns at positions 0-1 (A and B).
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2017-07-04,0.824013,0.179509
2017-07-05,0.201423,0.302832


In [211]:
# Lists of integer positions: rows 1, 2 and 4, columns 0 and 2 (A and C).
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2017-07-02,0.372194,0.131392
2017-07-03,0.056692,0.255733
2017-07-05,0.201423,0.215712


In [219]:
# Display the frame; column A of 2017-07-01 now holds the assigned value 3.0.
df

Unnamed: 0,A,B,C,D
2017-07-01,3.0,0.256012,0.088454,0.366526
2017-07-02,0.372194,0.259593,0.131392,0.740838
2017-07-03,0.056692,0.708669,0.255733,0.907439
2017-07-04,0.824013,0.179509,0.795908,0.689344
2017-07-05,0.201423,0.302832,0.215712,0.437765
2017-07-06,0.939856,0.877096,0.302152,0.628505


In [220]:
# Rows at positions 0 and 2, columns at positions 0 and 1 (A and B).
df.iloc[[0,2],[0,1]]
# NOTE(review): 0.088454 and 0.256012 are row 0's C and B values; this selection
# returns columns A and B only, so 0.088454 (column C) is not part of the result.

Unnamed: 0,A,B
2017-07-01,3.0,0.256012
2017-07-03,0.056692,0.708669
