# Restructuring Data into Tidy Form

## 정돈된 형태로 데이터 재구성 



## 8.4 스택된 데이터 되돌리기 

- DataFrame 에는 stack 과 melt 라는 두 가지 유사한 메서드가 있어서 수평 열 이름을 수직 열 값으로 변환할 수 있다. 이 두 연산은 각각 unstack 과 pivot 메서드를 사용해 되돌릴 수 있다.
- stack/unstack : melt/pivot 보다 더 단순한 메서드로 열/행 인덱스만 제어할 수 있다.
- melt/pivot : 어떤 열을 재구성할 것인지 선택할 수 있어 더 많은 유연성을 제공한다.

### 준비 단계

이 단계에서는 데이터셋을 stack/melt 한 후 unstack/pivot 을 사용해 즉시 원래 상태로 되돌린다.

### 방법

In [1]:
import pandas as pd
# Base directory of the data files read below.
# NOTE: absolute, machine-specific path - adjust it to your own environment.
path = 'C:/Users/HS/Documents/GitHub/Python-Study/Pandas-Cookbook/data'

In [11]:
# 1. college 데이터셋을 기관명을 인덱스로 설정하고 학부생 인종 열 그룹만 읽어들인다. 
# Column filter handed to read_csv: accept the institution-name column and
# every column whose name contains 'UGDS_'.
usecol_func = lambda name: name == 'INSTNM' or 'UGDS_' in name

college = pd.read_csv(
    path + '/college.csv',
    usecols=usecol_func,    # load only the columns accepted by the filter
    index_col='INSTNM',     # use the institution name as the row index
)
college.head()


Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [6]:
# 2. stack 메서드를 사용해 각 수평 열을 수직 인덱스 레벨로 변환한다. 

# stack() moves the column labels into a new innermost index level.
college_stacked = college.stack()
# Show the first 18 entries of the stacked result.
college_stacked.head(n=18)

INSTNM                                         
Alabama A & M University             UGDS_WHITE    0.0333
                                     UGDS_BLACK    0.9353
                                     UGDS_HISP     0.0055
                                     UGDS_ASIAN    0.0019
                                     UGDS_AIAN     0.0024
                                     UGDS_NHPI     0.0019
                                     UGDS_2MOR     0.0000
                                     UGDS_NRA      0.0059
                                     UGDS_UNKN     0.0138
University of Alabama at Birmingham  UGDS_WHITE    0.5922
                                     UGDS_BLACK    0.2600
                                     UGDS_HISP     0.0283
                                     UGDS_ASIAN    0.0518
                                     UGDS_AIAN     0.0022
                                     UGDS_NHPI     0.0007
                                     UGDS_2MOR     0.0368
                        

In [10]:
# 3. 이 스택된 데이터를 unstack Series 메서드를 사용해 원래 형태로 되돌린다. 
# Turn the innermost index level (level=-1, the default) back into column labels.
college_stacked.unstack(level=-1).head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [13]:
# 4. melt 와 pivot 을 순서대로 사용해 비슷한 연산을 수행할 수 있다. 먼저 기관명을 인덱스로 사용하지 않고 데이터를 읽어들인다. 

# Same file and column filter as before, but without index_col, so INSTNM
# stays an ordinary column.
college2 = pd.read_csv(path + '/college.csv', usecols=usecol_func)
college2.head()

Unnamed: 0,INSTNM,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
0,Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
1,University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
2,Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
3,University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
4,Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [19]:
# 5. melt 메서드를 써서 모든 인종 열을 단일 열로 전치한다.
# Keep INSTNM as the identifier column; every other column is melted into
# a ('Race', 'Percentage') pair of label/value columns.
college_melted = college2.melt(
    id_vars='INSTNM',
    var_name='Race',
    value_name='Percentage',
)
college_melted

Unnamed: 0,INSTNM,Race,Percentage
0,Alabama A & M University,UGDS_WHITE,0.0333
1,University of Alabama at Birmingham,UGDS_WHITE,0.5922
2,Amridge University,UGDS_WHITE,0.2990
3,University of Alabama in Huntsville,UGDS_WHITE,0.6988
4,Alabama State University,UGDS_WHITE,0.0158
5,The University of Alabama,UGDS_WHITE,0.7825
6,Central Alabama Community College,UGDS_WHITE,0.7255
7,Athens State University,UGDS_WHITE,0.7823
8,Auburn University at Montgomery,UGDS_WHITE,0.5328
9,Auburn University,UGDS_WHITE,0.8507


In [20]:
# 6. pivot 메서드를 사용해 앞 결과를 되돌린다. 

# Invert the melt: INSTNM becomes the index, each distinct 'Race' value
# becomes a column, and 'Percentage' supplies the cell values.
melted_inv = college_melted.pivot(
    index='INSTNM',
    columns='Race',
    values='Percentage',
)
melted_inv.head()

Race,UGDS_2MOR,UGDS_AIAN,UGDS_ASIAN,UGDS_BLACK,UGDS_HISP,UGDS_NHPI,UGDS_NRA,UGDS_UNKN,UGDS_WHITE
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A & W Healthcare Educators,0.0,0.0,0.0,0.975,0.025,0.0,0.0,0.0,0.0
A T Still University of Health Sciences,,,,,,,,,
ABC Beauty Academy,0.0,0.0,0.9333,0.0333,0.0333,0.0,0.0,0.0,0.0
ABC Beauty College Inc,0.0,0.0,0.0,0.6579,0.0526,0.0,0.0,0.0,0.2895
AI Miami International University of Art and Design,0.0018,0.0,0.0018,0.0198,0.4773,0.0,0.0025,0.4644,0.0324


In [33]:
# 7. 기관명이 인덱스로 들어갔다 나오는 사이 행의 순서가 달라진 점에 주목하자. 
# 열 이름도 원래의 순서가 아니다. 단계 4에서 시작할 때의 DataFrame 을 정확히 복제하려면
# .loc 인덱스 연산자를 사용해 행과 열을 동시에 선택하고 인덱스를 리셋해야 한다.

# Row labels in college2's original institution order, and column labels in
# college2's original order (everything after the first column).
row_order = college2['INSTNM']
col_order = college2.columns[1:]

# Select rows and columns in that order, then move the index back into a column.
college2_replication = melted_inv.loc[row_order, col_order].reset_index()

# Report whether the round trip reproduced college2.
print(college2.equals(college2_replication))
college2_replication

True


Unnamed: 0,INSTNM,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
0,Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0000,0.0059,0.0138
1,University of Alabama at Birmingham,0.5922,0.2600,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.0100
2,Amridge University,0.2990,0.4192,0.0069,0.0034,0.0000,0.0000,0.0000,0.0000,0.2715
3,University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.0350
4,Alabama State University,0.0158,0.9208,0.0121,0.0019,0.0010,0.0006,0.0098,0.0243,0.0137
5,The University of Alabama,0.7825,0.1119,0.0348,0.0106,0.0038,0.0009,0.0261,0.0268,0.0026
6,Central Alabama Community College,0.7255,0.2613,0.0044,0.0025,0.0044,0.0000,0.0000,0.0000,0.0019
7,Athens State University,0.7823,0.1200,0.0191,0.0053,0.0157,0.0010,0.0174,0.0057,0.0334
8,Auburn University at Montgomery,0.5328,0.3376,0.0074,0.0221,0.0044,0.0016,0.0297,0.0397,0.0246
9,Auburn University,0.8507,0.0704,0.0248,0.0227,0.0074,0.0000,0.0000,0.0100,0.0140


### 작동 원리 

단계 1의 결과를 얻을 수 있는 방법은 여러 가지가 있다. 여기서는 read_csv 함수의 유용성을 볼 수 있었다. usecols 매개변수는 임포트하려는 열 이름의 리스트를 받거나 이를 동적으로 결정하는 함수를 받는다. 여기서는 열 이름이 UGDS_ 를 포함하거나 INSTNM 과 같은지를 확인하는 익명 함수를 사용한다. 이 함수에는 각 열 이름이 문자열로 전달되며, 함수는 불리언을 반환해야 한다. 이 방법을 사용하면 대량의 메모리를 절약할 수 있다. 

단계 2에서 사용한 stack 메서드는 모든 열 이름을 가장 안쪽의 인덱스 레벨에 넣은 후 Series 를 반환한다. 단계 3에서 unstack 메서드는 가장 안쪽 인덱스 레벨의 값을 열 이름으로 변환함으로써 이 연산을 되돌린다. 

단계 4는 단계 1과 동일한 데이터셋을 읽어들이지만 기관명을 인덱스에 넣지 않는다. melt 메서드는 인덱스에 있는 값에 접근할 수 없기 때문이다. 단계 5는 melt 메서드를 이용해 인종 열을 모두 전치한다. 이렇게 하려면 value_vars 매개변수를 기본값인 None 으로 두면 된다. value_vars 가 지정되지 않으면 id_vars 에 없는 모든 열이 전치된다. 

단계 6은 단계 5로의 연산들