# 9장 - 데이터 타이딩과 리셰이핑
- 타이디하지 못한 데이터의 특징
1. 병합 기준 열 관계에 대한 명확성 부족
2. 일대다 관계의 일에 해당하는 쪽에 중복이 있음
3. 다대다 관계로 인해 데이터가 중복됨
4. 열 이름에 값이 저장됨
5. 하나의 변숫값에 여러 값이 저장됨
6. 데이터가 분석 단위에 맞게 구조화되지 않음

## 중복 행 제거하기
- 중복 데이터를 제거하기에 앞서 일대다 중 다에 해당하는 쪽의 정보를 집계해야 한다.

In [1]:
import pandas as pd

# Load the COVID case data from the chapter's CSV file into a dataframe.
covidcases = pd.read_csv(
    "C:/data-cleansing-main/Chapter09/data/covidcases720.csv"
)

In [4]:
# Column groups used throughout the notebook: per-day columns, "total"
# columns (plus location), and per-country descriptive columns.
dailyvars = ["casedate", "new_cases", "new_deaths"]
totvars = ["location", "total_cases", "total_deaths"]
demovars = [
    "population",
    "population_density",
    "median_age",
    "gdp_per_capita",
    "hospital_beds_per_thousand",
    "region",
]

# Show the first three rows, transposed so each column reads as a row.
preview_columns = dailyvars + totvars + demovars
covidcases[preview_columns].head(3).T

Unnamed: 0,0,1,2
casedate,2019-12-31,2020-01-01,2020-01-02
new_cases,0.0,0.0,0.0
new_deaths,0.0,0.0,0.0
location,Afghanistan,Afghanistan,Afghanistan
total_cases,0.0,0.0,0.0
total_deaths,0.0,0.0,0.0
population,38928341.0,38928341.0,38928341.0
population_density,54.422,54.422,54.422
median_age,18.6,18.6,18.6
gdp_per_capita,1803.987,1803.987,1803.987


In [5]:
# Build a dataframe that keeps only the location and the daily columns.
daily_columns = ["location", *dailyvars]
coviddaily = covidcases[daily_columns]
coviddaily.shape  # not the cell's last expression, so its value is discarded
coviddaily

Unnamed: 0,location,casedate,new_cases,new_deaths
0,Afghanistan,2019-12-31,0.0,0.0
1,Afghanistan,2020-01-01,0.0,0.0
2,Afghanistan,2020-01-02,0.0,0.0
3,Afghanistan,2020-01-03,0.0,0.0
4,Afghanistan,2020-01-04,0.0,0.0
...,...,...,...,...
29524,Zimbabwe,2020-07-08,53.0,0.0
29525,Zimbabwe,2020-07-09,98.0,0.0
29526,Zimbabwe,2020-07-10,41.0,3.0
29527,Zimbabwe,2020-07-11,16.0,1.0


In [11]:
# Select one row per country.
covidcases.location.nunique()  # number of distinct locations (value not stored)

# Order rows by location and date, keep only the final row for each
# location, and relabel its date column as 'lastdate'.
demo_columns = ["casedate"] + totvars + demovars
coviddemo = (
    covidcases[demo_columns]
    .sort_values(["location", "casedate"])
    .drop_duplicates(["location"], keep="last")
    .rename(columns={"casedate": "lastdate"})
)

In [13]:
# Preview the first three per-country rows, transposed for readability.
coviddemo.head(3).transpose()

Unnamed: 0,184,310,500
lastdate,2020-07-12,2020-07-12,2020-07-12
location,Afghanistan,Albania,Algeria
total_cases,34451.0,3371.0,18712.0
total_deaths,1010.0,89.0,1004.0
population,38928341.0,2877800.0,43851043.0
population_density,54.422,104.871,17.348
median_age,18.6,38.0,29.1
gdp_per_capita,1803.987,11803.431,13913.839
hospital_beds_per_thousand,0.5,2.89,1.9
region,South Asia,Eastern Europe,North Africa


In [15]:
# Collapse the daily rows to one row per location: sum the daily counts and
# take the 'last' value of every other selected column.
# The rows are sorted by location and date first, because the 'last'
# aggregation follows row order within each group; without the sort,
# 'lastdate' would silently depend on how the CSV happened to be ordered.
covidtotals = (
    covidcases.sort_values(["location", "casedate"])
    .groupby(["location"], as_index=False)
    .agg(
        {
            "new_cases": "sum",
            "new_deaths": "sum",
            "median_age": "last",
            "gdp_per_capita": "last",
            "region": "last",
            "casedate": "last",
            "population": "last",
        }
    )
    .rename(
        columns={
            "new_cases": "total_cases",
            "new_deaths": "total_deaths",
            "casedate": "lastdate",
        }
    )
)

In [16]:
# Preview the first three aggregated rows, transposed for readability.
covidtotals.head(3).transpose()

Unnamed: 0,0,1,2
location,Afghanistan,Albania,Algeria
total_cases,34451.0,3371.0,18712.0
total_deaths,1010.0,89.0,1004.0
median_age,18.6,38.0,29.1
gdp_per_capita,1803.987,11803.431,13913.839
region,South Asia,Eastern Europe,North Africa
lastdate,2020-07-12,2020-07-12,2020-07-12
population,38928341.0,2877800.0,43851043.0
