In [6]:
# pandas를 통해서 csv 데이터 찾아오기

import pandas as pd

doc = pd.read_csv("00_data/olist_customers_dataset.csv", encoding="utf-8-sig")
doc.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [4]:
# 찾아온 데이터의 행.열 개수 확인

doc.shape

(99441, 5)

In [5]:
# 찾아온 데이터의 행.열의 세부적인 정보를 확인

doc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [8]:
# 찾아온 데이터의 상단 필드명의 그룹을 확인하고자 할 때

doc.columns

Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state'],
      dtype='object')

In [9]:
# 데이터의 인덱스를 확인하고자 할 때

doc.index

RangeIndex(start=0, stop=99441, step=1)

In [10]:
# 데이터 내 중요한 5가지 지표 (5 number summary) 확인하고자 할 때

doc.describe()

Unnamed: 0,customer_zip_code_prefix
count,99441.0
mean,35137.474583
std,29797.938996
min,1003.0
25%,11347.0
50%,24416.0
75%,58900.0
max,99990.0


In [13]:
# 기존 로우데이터에서 원하는 컬럼만 선택해서 복사본의 데이터프레임을 새롭게 만들고자 할 때

doc2 = doc[['customer_zip_code_prefix', 'customer_city', 'customer_state']].copy()
doc2

Unnamed: 0,customer_zip_code_prefix,customer_city,customer_state
0,14409,franca,SP
1,9790,sao bernardo do campo,SP
2,1151,sao paulo,SP
3,8775,mogi das cruzes,SP
4,13056,campinas,SP
...,...,...,...
99436,3937,sao paulo,SP
99437,6764,taboao da serra,SP
99438,60115,fortaleza,CE
99439,92120,canoas,RS


In [16]:
# 복제한 데이터 프레임에서 특정 컬럼 내 일치하는 값만 추출해서 갯수를 확인하고자 할 때

doc3 = doc2[doc2["customer_city"] == "sao paulo"]
doc3.shape

(15540, 3)

In [18]:
# 특정 컬럼 내 동일한 값을 가지고 있는 요소들의 개수를 명세서 형태로 확인

doc2["customer_city"].value_counts()

customer_city
sao paulo            15540
rio de janeiro        6882
belo horizonte        2773
brasilia              2131
curitiba              1521
                     ...  
bequimao                 1
andarai                  1
vargem grande            1
curvelandia              1
eugenio de castro        1
Name: count, Length: 4119, dtype: int64

In [21]:
# 특정 컬럼을 그룹화한 후 나머지 컬럼의 갯수를 집계할 때 사용 -> 그룹화가되면 자동으로 인덱스 값으로 변환!

doc4 = doc2.groupby("customer_city").count()
doc4

Unnamed: 0_level_0,customer_zip_code_prefix,customer_state
customer_city,Unnamed: 1_level_1,Unnamed: 2_level_1
abadia dos dourados,3,3
abadiania,1,1
abaete,12,12
abaetetuba,11,11
abaiara,2,2
...,...,...
xinguara,9,9
xique-xique,3,3
zacarias,2,2
ze doca,5,5


In [24]:
# 특정 데이터베이스 내 인덱스의 갯수를 확인하고자 할 때

len(doc4.index)

4119

In [25]:
doc4

Unnamed: 0_level_0,customer_zip_code_prefix,customer_state
customer_city,Unnamed: 1_level_1,Unnamed: 2_level_1
abadia dos dourados,3,3
abadiania,1,1
abaete,12,12
abaetetuba,11,11
abaiara,2,2
...,...,...
xinguara,9,9
xique-xique,3,3
zacarias,2,2
ze doca,5,5


In [27]:
doc4 = doc.groupby("customer_city").count()
doc4.head()

Unnamed: 0_level_0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_state
customer_city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abadia dos dourados,3,3,3,3
abadiania,1,1,1,1
abaete,12,12,12,12
abaetetuba,11,11,11,11
abaiara,2,2,2,2


In [32]:
# 기존 데이터 프레임 안에서 특정 컬럼을 기준으로 정렬하고자 할 때 (오름차순 디폴트, 내림차순 => False)

doc4["customer_id"].sort_values(ascending=False).head(1)

customer_city
sao paulo    15540
Name: customer_id, dtype: int64

In [35]:
# 데이터프레임 안에서 특정 컬럼을 인덱스로 적용하고 싶을 때

doc.set_index("customer_city")

Unnamed: 0_level_0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_state
customer_city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
franca,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,SP
sao bernardo do campo,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,SP
sao paulo,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,SP
mogi das cruzes,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,SP
campinas,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,SP
...,...,...,...,...
sao paulo,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937,SP
taboao da serra,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764,SP
fortaleza,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115,CE
canoas,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,RS


In [37]:
# 특정 컬럼을 기준으로 인덱스 처리 후 정렬하고자 할 때

doc.set_index("customer_city").sort_index()

Unnamed: 0_level_0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_state
customer_city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abadia dos dourados,a23e3f9a2b656b23b7e52075964b42cd,afddf43a03a9941624ed42c0b2c17280,38540,MG
abadia dos dourados,9e01f714a2b3b8962c222cf2b74c20dc,e1feae9083c4c2895ddf6dc80526a85d,38540,MG
abadia dos dourados,f11eb8f0b8b87510a93e3e1aa10b0ade,64ee476500a01beb94df40f97a108c50,38540,MG
abadiania,576d71ddb21b21763cfedce73b902180,8d76b559181609308fcae630ea64cd61,72940,GO
abaete,5e9e1ae42e02df93e9a591e86fd531a3,28af9604f7830ef6d1230fb575c39eb1,35620,MG
...,...,...,...,...
ze doca,1052e334b46dd1e9876946d6372a363e,c933181d09535cb6bdfb1c0160323bd9,65365,MA
ze doca,d8b3ef6f73de33ae716e94d2478cc080,9ef06526b8e57a567ed6f1dce7ab0320,65365,MA
ze doca,fc87e5c06780700952aa42998a22968f,d4798889fe74026a739ca0be9d2dccb8,65365,MA
zortea,0a466d490a3c2fcc1e7177f4809ec7dd,d5a5ceb83234eeb855268d7fce7f54ea,89633,SC


In [41]:
doc2["customer_state"].value_counts().shape

(27,)

In [42]:
doc4

Unnamed: 0_level_0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_state
customer_city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abadia dos dourados,3,3,3,3
abadiania,1,1,1,1
abaete,12,12,12,12
abaetetuba,11,11,11,11
abaiara,2,2,2,2
...,...,...,...,...
xinguara,9,9,9,9
xique-xique,3,3,3,3
zacarias,2,2,2,2
ze doca,5,5,5,5


In [45]:
# 기존 데이터 프레임에서 인덱스를 해제하고자 할 때

doc4 = doc4.reset_index()
doc4

Unnamed: 0,index,customer_city,customer_id,customer_unique_id,customer_zip_code_prefix,customer_state
0,0,abadia dos dourados,3,3,3,3
1,1,abadiania,1,1,1,1
2,2,abaete,12,12,12,12
3,3,abaetetuba,11,11,11,11
4,4,abaiara,2,2,2,2
...,...,...,...,...,...,...
4114,4114,xinguara,9,9,9,9
4115,4115,xique-xique,3,3,3,3
4116,4116,zacarias,2,2,2,2
4117,4117,ze doca,5,5,5,5


In [50]:
doc4 = doc.groupby("customer_city").count()
doc4.head()

Unnamed: 0_level_0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_state
customer_city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abadia dos dourados,3,3,3,3
abadiania,1,1,1,1
abaete,12,12,12,12
abaetetuba,11,11,11,11
abaiara,2,2,2,2


In [55]:
# 특정 컬럼 안에 있는 값들을 비교연산자로 비교한 후 데이터의 형태를 출력하고자 할 때

doc4[doc4["customer_id"] > 1000].shape

(9, 4)

In [56]:
# 결측치 = Null (*채워지지 않은값)
# 분산되어있는 수치형 데이터의 집계 (더하기, 평균, 최소, 최대값 등) => 값이 없는 경우 에러
# 문자열 데이터의 경우, 결측치 값이 존재 => 그룹화, 조인 등의 값을 하나로 합쳐야하는 경우

doc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [59]:
# 해당 데이터프레임 내부에 결측치값의 합계

doc.isnull().sum()

customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

In [63]:
# 데이터프레임에서 특정 컬럼을 기준으로 중복되는 값을 1개만 살리고, 해당 컬럼을 리스트 자료구조로 변환하고자 할 때

doc.drop_duplicates(subset="customer_city", keep="last")["customer_city"].to_list()

['vargem grande',
 'agua fria de goias',
 'taperuaba',
 'cajueiro',
 'canapi',
 'itanhem',
 'sao jose da coroa grande',
 'sao jorge do ivai',
 'nova laranjeiras',
 'ivora',
 'anaurilandia',
 'pedro velho',
 'glaura',
 'sao miguel do aleixo',
 'santo antonio do caiua',
 'fernao',
 'sao jose da tapera',
 'cordeiros',
 'ananas',
 'godoy moreira',
 'japi',
 'barauna',
 'sandolandia',
 'inga',
 'muritiba',
 'desembargador otoni',
 'ipiranga do norte',
 'felipe guerra',
 'rio formoso',
 'igaratinga',
 'cedro',
 'sao joao da urtiga',
 'bom jesus do querendo',
 'formosa da serra negra',
 'rainha do mar',
 'banabuiu',
 'chorrocho',
 'cipo-guacu',
 'itabaianinha',
 'ibirapua',
 'francisco santos',
 'guimaraes',
 'tapera',
 'rio doce',
 'tocos do moji',
 'humberto de campos',
 'riversul',
 'lindolfo collor',
 'santana do sobrado',
 'central',
 'nova independencia',
 'sacra familia do tingua',
 'unistalda',
 'sobralia',
 'divinopolis de goias',
 'sao joao de petropolis',
 'independencia',
 'lagoa 

In [64]:
# pivot => 기존의 어떤 데이터를 보완해서 새롭게 만든다는 의미!

import pandas as pd

doc_covid = pd.read_csv("COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/04-01-2020.csv", encoding="utf-8-sig")
doc_covid.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.223334,-82.461707,4,0,0,4,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-04-01 21:58:49,30.295065,-92.414197,47,1,0,46,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-04-01 21:58:49,37.767072,-75.632346,7,0,0,7,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-04-01 21:58:49,43.452658,-116.241552,195,3,0,192,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-04-01 21:58:49,41.330756,-94.471059,1,0,0,1,"Adair, Iowa, US"


In [76]:
# 일반적인 데이터 프레임 복제 VS 피봇테이블 생성
# 특정 컬럼을 기준으로 집계 및 연산처리 가능
# 총계 확인 가능

doc_covid2 = pd.pivot_table(doc_covid, index=["Country_Region"], values=["Confirmed", "Deaths"], aggfunc={"Confirmed" : "sum", "Deaths": "mean"}, fill_value=0, margins=True, margins_name="Total")
doc_covid2[doc_covid2.index == "Total"]

Unnamed: 0_level_0,Confirmed,Deaths
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Total,938959,20.477002
