# Pandas 실습

## 학습 목표



*   Pandas를 이용한 데이터 처리의 기본을 이해한다.
*   데이터 프레임을 만들고 활용하는 기본 원리를 이해한다.



판다스(Pandas)는 파이썬 데이터 처리를 위한 라이브러리입니다. 파이썬을 이용한 데이터 분석과 같은 작업에서 필수 라이브러리로 알려져 있습니다. 참고할 수 있는 Pandas 링크는 다음과 같습니다.

링크 : http://pandas.pydata.org/pandas-docs/stable/

In [None]:
import numpy as np
import pandas as pd

## Object Creation

In [None]:
# Build a Series from a plain list. np.nan marks a missing entry, and the
# resulting values are stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [None]:
# Six consecutive daily timestamps starting at 2013-01-01; used as the row
# index of df below.
dates = pd.date_range("20130101", periods=6)

# NOTE(review): a notebook only renders a bare name when it is the last
# expression of a cell, so this line displays nothing here.
dates

# 6x4 frame of standard-normal random values, labelled by date (rows) and
# by the letters A-D (columns).
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))

df

Unnamed: 0,A,B,C,D
2013-01-01,-0.537373,-0.541963,0.481311,-1.308247
2013-01-02,0.015091,-1.497063,0.810655,0.066823
2013-01-03,-1.181875,1.413479,0.039071,-1.358815
2013-01-04,0.150618,-0.583306,-0.076017,-1.208279
2013-01-05,0.096517,1.535287,-0.251767,0.644491
2013-01-06,0.024792,0.635045,-0.480779,-0.244599


In [None]:
# Each dict key becomes a column. The scalar values ("A", "B", "F") are
# broadcast to the length of the array-like values, so every column ends up
# with four rows.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train"] * 2),
        "F": "foo",
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [None]:
# Show the dtype of every column; each column of df2 keeps its own type.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

In [None]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,A,B,C,D
2013-01-01,-0.537373,-0.541963,0.481311,-1.308247
2013-01-02,0.015091,-1.497063,0.810655,0.066823
2013-01-03,-1.181875,1.413479,0.039071,-1.358815


In [None]:
# Preview the last three rows.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.150618,-0.583306,-0.076017,-1.208279
2013-01-05,0.096517,1.535287,-0.251767,0.644491
2013-01-06,0.024792,0.635045,-0.480779,-0.244599


In [None]:
# Row labels of df (the DatetimeIndex it was built with).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [None]:
# Convert the values to a NumPy array; index and column labels are dropped.
df.to_numpy()

array([[-0.5373727 , -0.5419633 ,  0.48131065, -1.30824691],
       [ 0.01509131, -1.49706268,  0.8106547 ,  0.06682275],
       [-1.18187463,  1.41347939,  0.03907145, -1.35881521],
       [ 0.15061756, -0.58330557, -0.07601703, -1.20827863],
       [ 0.09651745,  1.53528667, -0.25176749,  0.6444915 ],
       [ 0.02479235,  0.63504541, -0.48077945, -0.24459904]])

In [None]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.238705,0.160247,0.087079,-0.568104
std,0.524702,1.223062,0.478574,0.843924
min,-1.181875,-1.497063,-0.480779,-1.358815
25%,-0.399257,-0.57297,-0.20783,-1.283255
50%,0.019942,0.046541,-0.018473,-0.726439
75%,0.078586,1.218871,0.370751,-0.011033
max,0.150618,1.535287,0.810655,0.644491


In [None]:
# Transpose: rows become columns and columns become rows.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.537373,0.015091,-1.181875,0.150618,0.096517,0.024792
B,-0.541963,-1.497063,1.413479,-0.583306,1.535287,0.635045
C,0.481311,0.810655,0.039071,-0.076017,-0.251767,-0.480779
D,-1.308247,0.066823,-1.358815,-1.208279,0.644491,-0.244599


In [None]:
# Sort by the column labels (axis=1) in descending order.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.308247,0.481311,-0.541963,-0.537373
2013-01-02,0.066823,0.810655,-1.497063,0.015091
2013-01-03,-1.358815,0.039071,1.413479,-1.181875
2013-01-04,-1.208279,-0.076017,-0.583306,0.150618
2013-01-05,0.644491,-0.251767,1.535287,0.096517
2013-01-06,-0.244599,-0.480779,0.635045,0.024792


In [None]:
# Sort the rows by the values in column "B" (ascending by default).
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-02,0.015091,-1.497063,0.810655,0.066823
2013-01-04,0.150618,-0.583306,-0.076017,-1.208279
2013-01-01,-0.537373,-0.541963,0.481311,-1.308247
2013-01-06,0.024792,0.635045,-0.480779,-0.244599
2013-01-03,-1.181875,1.413479,0.039071,-1.358815
2013-01-05,0.096517,1.535287,-0.251767,0.644491


## Selection

In [None]:
# Select a single column; the result is a Series.
df["A"]

2013-01-01   -0.537373
2013-01-02    0.015091
2013-01-03   -1.181875
2013-01-04    0.150618
2013-01-05    0.096517
2013-01-06    0.024792
Freq: D, Name: A, dtype: float64

In [None]:
# Slicing with [] selects rows by position; the end position is exclusive.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.537373,-0.541963,0.481311,-1.308247
2013-01-02,0.015091,-1.497063,0.810655,0.066823
2013-01-03,-1.181875,1.413479,0.039071,-1.358815


In [None]:
# Label-based selection: the row whose index label is the first date.
df.loc[dates[0]]

A   -0.537373
B   -0.541963
C    0.481311
D   -1.308247
Name: 2013-01-01 00:00:00, dtype: float64

In [None]:
# All rows, but only columns "A" and "B".
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.537373,-0.541963
2013-01-02,0.015091,-1.497063
2013-01-03,-1.181875,1.413479
2013-01-04,0.150618,-0.583306
2013-01-05,0.096517,1.535287
2013-01-06,0.024792,0.635045


In [None]:
# Label slices include both endpoints: rows 2013-01-02 through 2013-01-04.
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,0.015091,-1.497063
2013-01-03,-1.181875,1.413479
2013-01-04,0.150618,-0.583306


## Selection by Position

In [None]:
# Position-based selection: the row at position 3 (the fourth row).
df.iloc[3]

A    0.150618
B   -0.583306
C   -0.076017
D   -1.208279
Name: 2013-01-04 00:00:00, dtype: float64

In [None]:
# Rows at positions 3-4 and columns at positions 0-1 (ends are exclusive).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.150618,-0.583306
2013-01-05,0.096517,1.535287


In [None]:
# Pick specific row and column positions by passing lists of integers.
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,0.015091,0.810655
2013-01-03,-1.181875,0.039071
2013-01-05,0.096517,-0.251767


In [None]:
# Rows at positions 1-2, all columns.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.015091,-1.497063,0.810655,0.066823
2013-01-03,-1.181875,1.413479,0.039071,-1.358815


In [None]:
# All rows, columns at positions 1-2.
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-0.541963,0.481311
2013-01-02,-1.497063,0.810655
2013-01-03,1.413479,0.039071
2013-01-04,-0.583306,-0.076017
2013-01-05,1.535287,-0.251767
2013-01-06,0.635045,-0.480779


In [None]:
# A single scalar: the value at row position 1, column position 1.
df.iloc[1, 1]

-1.4970626818683912

## Boolean Indexing

In [None]:
# Keep only the rows where column "A" is positive.
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.015091,-1.497063,0.810655,0.066823
2013-01-04,0.150618,-0.583306,-0.076017,-1.208279
2013-01-05,0.096517,1.535287,-0.251767,0.644491
2013-01-06,0.024792,0.635045,-0.480779,-0.244599


In [None]:
# Element-wise mask: values that are not greater than 0 become NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,0.481311,
2013-01-02,0.015091,,0.810655,0.066823
2013-01-03,,1.413479,0.039071,
2013-01-04,0.150618,,,
2013-01-05,0.096517,1.535287,,0.644491
2013-01-06,0.024792,0.635045,,


In [None]:
# Work on a copy so that adding a column does not modify df itself.
# NOTE: this rebinds the name df2 to a new frame.
df2 = df.copy()
# Add a string column "E" with one label per row.
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.537373,-0.541963,0.481311,-1.308247,one
2013-01-02,0.015091,-1.497063,0.810655,0.066823,one
2013-01-03,-1.181875,1.413479,0.039071,-1.358815,two
2013-01-04,0.150618,-0.583306,-0.076017,-1.208279,three
2013-01-05,0.096517,1.535287,-0.251767,0.644491,four
2013-01-06,0.024792,0.635045,-0.480779,-0.244599,three


In [None]:
# isin() builds a boolean mask: keep rows whose "E" value is "two" or "four".
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.181875,1.413479,0.039071,-1.358815,two
2013-01-05,0.096517,1.535287,-0.251767,0.644491,four


## Missing Data

In [None]:
# Keep only the first four dates and add a new column "E"; a column created
# by reindexing starts out entirely missing (NaN).
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
# Set "E" to 1 for the first two dates; the other rows stay NaN.
df1.loc[dates[0] : dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.537373,-0.541963,0.481311,-1.308247,1.0
2013-01-02,0.015091,-1.497063,0.810655,0.066823,1.0
2013-01-03,-1.181875,1.413479,0.039071,-1.358815,
2013-01-04,0.150618,-0.583306,-0.076017,-1.208279,


In [None]:
# Drop every row that contains at least one missing value.
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.537373,-0.541963,0.481311,-1.308247,1.0
2013-01-02,0.015091,-1.497063,0.810655,0.066823,1.0


In [None]:
# Replace missing values with 5; this returns a new frame, df1 is unchanged.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.537373,-0.541963,0.481311,-1.308247,1.0
2013-01-02,0.015091,-1.497063,0.810655,0.066823,1.0
2013-01-03,-1.181875,1.413479,0.039071,-1.358815,5.0
2013-01-04,0.150618,-0.583306,-0.076017,-1.208279,5.0


In [None]:
# Boolean mask that is True wherever a value is missing.
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


## Merge

In [None]:
# A 10x4 frame of standard-normal random values with default integer labels.
# NOTE: this rebinds the name df to a new frame.
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-0.307852,0.396048,-0.546016,1.059068
1,2.036902,1.23103,-0.913893,-0.099561
2,1.253109,-1.37786,0.51811,1.605975
3,1.88847,0.133372,1.641441,-0.088215
4,-0.427393,0.47228,-0.048346,-0.362472
5,-1.354369,0.867187,-0.164675,0.861903
6,-0.012647,-1.570644,-1.152325,-0.237159
7,0.149257,0.95012,-0.613779,-1.058151
8,-1.793587,-0.019042,0.14331,0.118587
9,1.014702,-1.2607,1.513566,0.515109


In [None]:
# Split df row-wise into three consecutive pieces (rows 0-2, 3-6 and 7 onward).
pieces = [df[:3], df[3:7], df[7:]]
# Concatenate the pieces along the rows, restoring the original row order.
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.307852,0.396048,-0.546016,1.059068
1,2.036902,1.23103,-0.913893,-0.099561
2,1.253109,-1.37786,0.51811,1.605975
3,1.88847,0.133372,1.641441,-0.088215
4,-0.427393,0.47228,-0.048346,-0.362472
5,-1.354369,0.867187,-0.164675,0.861903
6,-0.012647,-1.570644,-1.152325,-0.237159
7,0.149257,0.95012,-0.613779,-1.058151
8,-1.793587,-0.019042,0.14331,0.118587
9,1.014702,-1.2607,1.513566,0.515109


In [None]:
# Two small frames that share a "key" column; lval and rval hold the values
# that belong to each side.
left = pd.DataFrame({"key": ["foo1", "foo2"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo1", "foo2"], "rval": [4, 5]})
# Print both frames, one after the other.
print(left, right, sep="\n")

    key  lval
0  foo1     1
1  foo2     2
    key  rval
0  foo1     4
1  foo2     5


In [None]:
# Join left and right on their shared "key" column.
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo1,1,4
1,foo2,2,5


## Grouping

In [None]:
# Sample data for grouping: "A" and "B" hold repeated string keys, while
# "C" and "D" hold random numeric values to aggregate.
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.132819,-0.221794
1,bar,one,-0.292201,1.130019
2,foo,two,-2.071183,-0.664301
3,bar,three,-0.789196,-1.144451
4,foo,two,0.90819,-0.576351
5,bar,two,-0.467586,-1.140885
6,foo,one,-1.009631,0.560122
7,foo,three,0.29243,-2.106412


In [None]:
# Group the rows by "A" and sum columns "C" and "D" within each group.
df.groupby("A")[["C", "D"]].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.548983,-1.155317
foo,-2.013012,-3.008736


In [None]:
# Group by the (A, B) pair and sum; the result is indexed by both keys.
df.groupby(["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.292201,1.130019
bar,three,-0.789196,-1.144451
bar,two,-0.467586,-1.140885
foo,one,-1.14245,0.338328
foo,three,0.29243,-2.106412
foo,two,-1.162993,-1.240652


## Data Import

In [None]:
# read_csv is a module-level pandas function, not a DataFrame method, so it
# is called on pd (df.read_csv raises AttributeError). It returns a new
# DataFrame. "foo.csv" must already exist in the working directory, e.g.
# written beforehand with df.to_csv("foo.csv").
pd.read_csv("foo.csv")

In [None]:
# read_hdf is a module-level pandas function, not a DataFrame method, so it
# is called on pd (df.read_hdf raises AttributeError). The second argument is
# the key of the object stored inside the HDF5 file. "foo.h5" must already
# exist, and reading HDF5 requires the optional PyTables package.
pd.read_hdf("foo.h5", "df")

In [None]:
# read_excel is a module-level pandas function, not a DataFrame method, so it
# is called on pd (df.read_excel raises AttributeError). sheet_name selects
# the worksheet to load. "foo.xlsx" must already exist, and reading .xlsx
# files requires an Excel engine such as openpyxl.
pd.read_excel("foo.xlsx", sheet_name="Sheet1")

## 학습 정리



*   Pandas는 데이터 프레임 생성과 처리를 도와주는 패키지
*   데이터 프레임을 인덱싱, 슬라이싱, 머지, 그룹핑하는 방법을 이해



본 예제는 pandas 공식 문서의 "10 minutes to pandas" 튜토리얼을 참고하여 제작하였습니다.