# 3. DataFrame
* A DataFrame is a two-dimensional array with a `row index` and a `column index` attached.
* It is similar to a sheet in Excel.
* DataFrame = `values of a two-dimensional array` + `row index` + `column index`
* A two-dimensional NumPy array must consist of a single data type, whereas each column of a DataFrame can have its own data type.

## 3-1. Generate DataFrame

In [112]:
import pandas as pd
import numpy as np

### 1) give only value

In [113]:
# Build a DataFrame from a nested list only (one inner list per row).
# No labels are given, so rows and columns both get a default 0-based index.
data = [
    ['삼성', 2000, '스마트폰'],
    ['LG', 1000, '가전제품'],
    ['네이버', 500, '포털'],
]
d = pd.DataFrame(data=data)
d

Unnamed: 0,0,1,2
0,삼성,2000,스마트폰
1,LG,1000,가전제품
2,네이버,500,포털


In [114]:
type(d)

pandas.core.frame.DataFrame

In [115]:
d.shape

(3, 3)

In [116]:
# row index
d.index

RangeIndex(start=0, stop=3, step=1)

In [117]:
# column index
d.columns

RangeIndex(start=0, stop=3, step=1)

In [118]:
# the underlying data as a NumPy array; mixing text and number columns gives dtype=object
d.values

array([['삼성', 2000, '스마트폰'],
       ['LG', 1000, '가전제품'],
       ['네이버', 500, '포털']], dtype=object)

In [119]:
type(d.values)

numpy.ndarray

In [120]:
d.head(2)

Unnamed: 0,0,1,2
0,삼성,2000,스마트폰
1,LG,1000,가전제품


In [121]:
d.tail(2)

Unnamed: 0,0,1,2
1,LG,1000,가전제품
2,네이버,500,포털


In [122]:
# viewing the data structure of DataFrame
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       3 non-null      object
 1   1       3 non-null      int64 
 2   2       3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


In [123]:
# summary statistics (count, mean, std, min, quartiles, max);
# only the numeric column shows up in the result, the text columns are left out
d.describe()

Unnamed: 0,1
count,3.0
mean,1166.666667
std,763.762616
min,500.0
25%,750.0
50%,1000.0
75%,1500.0
max,2000.0


In [124]:
# Quartile: the cut points that split the data, sorted in ascending order,
# into four equal parts of 25% each.
# cf) Quantile: the general term for a cut point that splits sorted data into equal parts,
# so quartiles are just one kind of quantile.
# Q1 (25%), Q2 (50%, the median), Q3 (75%), Q4 (100%, the maximum)
# numeric_only=True limits the median to the numeric column. The text columns have no
# median, and pandas >= 2.0 raises TypeError on them instead of silently skipping them.
d.median(numeric_only=True)

1    1000.0
dtype: float64

### 2) give value and index together

In [125]:
# Supply explicit labels for both axes along with the values.
data = [
    ['삼성', 2000, '스마트폰'],
    ['LG', 1000, '가전제품'],
    ['네이버', 500, '포털'],
]
index = [1, 2, 3]                     # row index
columns = ['기업명', '주가', '업종']  # column index
d = pd.DataFrame(data, index=index, columns=columns)
d

Unnamed: 0,기업명,주가,업종
1,삼성,2000,스마트폰
2,LG,1000,가전제품
3,네이버,500,포털


### 3) give the data as a dictionary (created column by column)

In [126]:
# Dictionary input: each key becomes a column label and its list holds that column's values.
# No index is given, so the rows get the default 0-based index.
data = {
    '기업명': ['삼성', 'LG', '네이버'],
    '주가': [2000, 1000, 500],
    '업종': ['스마트폰', '가전제품', '포털'],
}
d2 = pd.DataFrame(data)
d2

Unnamed: 0,기업명,주가,업종
0,삼성,2000,스마트폰
1,LG,1000,가전제품
2,네이버,500,포털


In [127]:
# give the row index a name (displayed above the row labels)
d2.index.name = '순서'
d2

Unnamed: 0_level_0,기업명,주가,업종
순서,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,삼성,2000,스마트폰
1,LG,1000,가전제품
2,네이버,500,포털


In [128]:
# give the column index a name (displayed to the left of the column labels)
d2.columns.name = '항목'
d2

항목,기업명,주가,업종
순서,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,삼성,2000,스마트폰
1,LG,1000,가전제품
2,네이버,500,포털


## 3-2. DataFrame Indexing and Slicing

In [129]:
# column indexing: a single column label returns that column as a Series
d['기업명']

1     삼성
2     LG
3    네이버
Name: 기업명, dtype: object

In [130]:
type(d['기업명'])

pandas.core.series.Series

In [131]:
# attribute-style access to the same column as d['기업명']
d.기업명

1     삼성
2     LG
3    네이버
Name: 기업명, dtype: object

In [132]:
# column fancy indexing: a list of column labels returns a DataFrame with just those columns
b = d[['기업명', '업종']]
b

Unnamed: 0,기업명,업종
1,삼성,스마트폰
2,LG,가전제품
3,네이버,포털


In [133]:
type(b)

pandas.core.frame.DataFrame

In [134]:
# a one-element list keeps the result a DataFrame holding only the 기업명 column
d[['기업명']]

Unnamed: 0,기업명
1,삼성
2,LG
3,네이버


In [135]:
# goal: a DataFrame made up of 주가 and 업종, with 기업명 moved to the row index
d

Unnamed: 0,기업명,주가,업종
1,삼성,2000,스마트폰
2,LG,1000,가전제품
3,네이버,500,포털


In [136]:
d.index

Int64Index([1, 2, 3], dtype='int64')

In [137]:
# use the 기업명 values as the row index; the 기업명 column itself still remains in the frame
d.index = d['기업명']
d

Unnamed: 0_level_0,기업명,주가,업종
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,삼성,2000,스마트폰
LG,LG,1000,가전제품
네이버,네이버,500,포털


In [138]:
# remove the 기업명 column in place; the same values remain available as the row index
del d['기업명']
d

Unnamed: 0_level_0,주가,업종
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1
삼성,2000,스마트폰
LG,1000,가전제품
네이버,500,포털


In [139]:
# add column (발행주식수)
d['발행주식수'] = [30000, 20000, 10000]
d

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000
네이버,500,포털,10000


In [140]:
# row indexing (삼성): select the row whose label is 삼성

### 1) loc (Label Location) : row indexing based on the row index label.

In [141]:
# select the row labelled '삼성'; a label that does not exist in the row index raises KeyError
a = d.loc['삼성']
a

주가        2000
업종        스마트폰
발행주식수    30000
Name: 삼성, dtype: object

In [142]:
type(a)

pandas.core.series.Series

In [143]:
d.loc[['삼성', '네이버']]

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
네이버,500,포털,10000


### 2) iloc (Integer Location) : row indexing based on the 0-based integer position

In [144]:
d.iloc[0]

주가        2000
업종        스마트폰
발행주식수    30000
Name: 삼성, dtype: object

In [145]:
d.iloc[-1]

주가         500
업종          포털
발행주식수    10000
Name: 네이버, dtype: object

In [146]:
d.iloc[[0, -1]]

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
네이버,500,포털,10000


In [147]:
d

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000
네이버,500,포털,10000


In [148]:
# column indexing -> row indexing: pick the column first, then a row label inside it
d['업종']

기업명
삼성     스마트폰
LG     가전제품
네이버      포털
Name: 업종, dtype: object

In [149]:
d['업종']['삼성']

'스마트폰'

In [150]:
# Not valid: d[...] with two labels is looked up as one (tuple) column key, which does not exist -> KeyError.
# Use d.loc['삼성', '업종'] to select by row label and column label together.
#d['업종', '삼성']

In [151]:
# row indexing -> column indexing: pick the row first, then a column label inside it
d.loc['삼성']

주가        2000
업종        스마트폰
발행주식수    30000
Name: 삼성, dtype: object

In [152]:
d.loc['삼성']['업종']

'스마트폰'

In [153]:
# preferred form: a single .loc call that takes the row label and the column label together
d.loc['삼성', '업종']

'스마트폰'

In [154]:
d.iloc[0, 1]

'스마트폰'

In [155]:
d

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000
네이버,500,포털,10000


In [156]:
# row slicing
d[:2]

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000


In [157]:
d[1:]

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LG,1000,가전제품,20000
네이버,500,포털,10000


In [158]:
d[1:2]

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LG,1000,가전제품,20000


In [159]:
# label-based row slice: the end label is included, so 'LG':'LG' returns the LG row
d['LG':'LG']

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LG,1000,가전제품,20000


In [160]:
d.columns

Index(['주가', '업종', '발행주식수'], dtype='object')

In [161]:
d.columns[:2]

Index(['주가', '업종'], dtype='object')

In [162]:
# column slicing
d[d.columns[:2]]

Unnamed: 0_level_0,주가,업종
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1
삼성,2000,스마트폰
LG,1000,가전제품
네이버,500,포털


In [163]:
# Boolean indexing
d

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000
네이버,500,포털,10000


In [164]:
d['발행주식수'] >= 15000

기업명
삼성      True
LG      True
네이버    False
Name: 발행주식수, dtype: bool

In [165]:
d[d['발행주식수'] >= 15000]

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000


In [166]:
# loc slicing: label slices include the end label, so both 삼성 and LG are returned
d.loc['삼성':'LG']

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000


In [167]:
d

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000
네이버,500,포털,10000


In [168]:
d.loc['삼성':'LG', '업종':'발행주식수']

Unnamed: 0_level_0,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1
삼성,스마트폰,30000
LG,가전제품,20000


In [169]:
d.iloc[:2, 1:]

Unnamed: 0_level_0,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1
삼성,스마트폰,30000
LG,가전제품,20000


## 3-3 DataFrame Data CRUD

In [170]:
# create row
# Left disabled: ignore_index=True replaces the 기업명 row labels with 0..n-1,
# and DataFrame.append itself was removed in pandas 2.0.
#d.append({'주가':1500, '업종':'IT', '발행주식수':15000}, ignore_index=True)
#d

In [171]:
d

Unnamed: 0_level_0,주가,업종,발행주식수
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000
네이버,500,포털,10000


In [172]:
# One new row for 카카오: a scalar value per column, with the row label supplied through index.
index2 = ['카카오']
data2 = {
    '주가': 1500,
    '업종': 'IT',
    '발행주식수': 15000,
}
d2 = pd.DataFrame(data2, index=index2)
d2

Unnamed: 0,주가,업종,발행주식수
카카오,1500,IT,15000


In [173]:
# create row 1: add the one-row frame d2 to the bottom of d.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat is the
# supported replacement: it also returns a new frame and keeps d2's row label.
d = pd.concat([d, d2])
d

Unnamed: 0,주가,업종,발행주식수
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000
네이버,500,포털,10000
카카오,1500,IT,15000


In [177]:
# add d2 to the bottom of d with pd.concat
# NOTE(review): d already holds the 카카오 row from the previous cell, so running this as well
# adds a second row with the same label (the output shows 카카오 twice) — confirm this is intended.
d = pd.concat([d, d2])
d

Unnamed: 0,주가,업종,발행주식수
삼성,2000,스마트폰,30000
LG,1000,가전제품,20000
네이버,500,포털,10000
카카오,1500,IT,15000
카카오,1500,IT,15000


In [174]:
s = pd.Series(data=data2)
s

주가        1500
업종          IT
발행주식수    15000
dtype: object