# Dacon Covid 데이터 분석 3주차

## Kaggle에 공개된 외부 데이터를 사용해 분석

- 한국뿐만 아니라 세계국가 데이터 존재
- 필요시 한국 데이터만 추출해 사용
- 한국과 다른 나라들 간의 상황 비교 분석

In [1]:
import pandas as pd
import numpy as np

In [21]:
# Load the COVID-19 dataset from the local CSV file into a DataFrame.
csv_path = "C:/Users/joyh1/Desktop/GitRepo/data/covid-19-all.csv"
data = pd.read_csv(csv_path)

In [3]:
# Check the number of rows and columns in the data.
data.shape

(74352, 8)

In [4]:
# Check the dtype of each column (info() also prints the non-null counts).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74352 entries, 0 to 74351
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country/Region  74352 non-null  object 
 1   Province/State  67247 non-null  object 
 2   Latitude        74351 non-null  float64
 3   Longitude       74351 non-null  float64
 4   Confirmed       74333 non-null  float64
 5   Recovered       73964 non-null  float64
 6   Deaths          73911 non-null  float64
 7   Date            74352 non-null  object 
dtypes: float64(5), object(3)
memory usage: 4.5+ MB


In [6]:
# Summary statistics for the count columns (latitude and longitude excluded).
count_columns = ['Confirmed', 'Recovered', 'Deaths']
data[count_columns].describe()

# The gap between the 75th percentile and the maximum is very large, which
# suggests that the spread of COVID-19 may have accelerated abruptly.

Unnamed: 0,Confirmed,Recovered,Deaths
count,74333.0,73964.0,73911.0
mean,367.382508,90.369126,18.83472
std,4334.365424,1725.06155,365.755142
min,0.0,0.0,0.0
25%,1.0,0.0,0.0
50%,4.0,0.0,0.0
75%,28.0,0.0,1.0
max,166831.0,64281.0,19899.0


In [7]:
# Summary statistics for the object-dtype columns.
data.describe(include=object)

Unnamed: 0,Country/Region,Province/State,Date
count,74352,67247,74352
unique,212,298,82
top,US,Texas,2020-03-30
freq,62707,4381,3439


In [8]:
# Check how many rows exist for each country.
data['Country/Region'].value_counts()

US               62707
China             2606
Canada             562
Australia          492
France             345
                 ...  
East Timor           1
Cape Verde           1
North Ireland        1
St. Martin           1
Saint Martin         1
Name: Country/Region, Length: 212, dtype: int64

In [9]:
# Look up whether South Korea is present among the country names.
data['Country/Region'].unique()

# 'South Korea' appears on the second line of the output.

array(['China', 'Hong Kong', 'Macau', 'Taiwan', 'US', 'Japan', 'Thailand',
       'South Korea', 'Singapore', 'Philippines', 'Malaysia', 'Vietnam',
       'Australia', 'Mexico', 'Brazil', 'Colombia', 'France', 'Nepal',
       'Canada', 'Cambodia', 'Sri Lanka', 'Ivory Coast', 'Germany',
       'Finland', 'United Arab Emirates', 'India', 'Italy',
       'United Kingdom', 'Russia', 'Sweden', 'Spain', 'Belgium', 'Others',
       'Egypt', 'Iran', 'Israel', 'Lebanon', 'Iraq', 'Oman',
       'Afghanistan', 'Bahrain', 'Kuwait', 'Austria', 'Algeria',
       'Croatia', 'Switzerland', 'Pakistan', 'Georgia', 'Greece',
       'North Macedonia', 'Norway', 'Romania', 'Denmark', 'Estonia',
       'Netherlands', 'San Marino', 'Azerbaijan', 'Belarus', 'Iceland',
       'Lithuania', 'New Zealand', 'Nigeria', 'North Ireland', 'Ireland',
       'Luxembourg', 'Monaco', 'Qatar', 'Ecuador', 'Czech Republic',
       'Armenia', 'Dominican Republic', 'Indonesia', 'Portugal',
       'Andorra', 'Latvia', 'Morocco'

In [10]:
# See how many records exist for South Korea.
is_south_korea = data['Country/Region'].eq('South Korea')
data.loc[is_south_korea]

# Only 82 rows exist.

Unnamed: 0,Country/Region,Province/State,Latitude,Longitude,Confirmed,Recovered,Deaths,Date
37,South Korea,,36.000000,128.000000,1.0,,,2020-01-22
75,South Korea,,36.000000,128.000000,1.0,,,2020-01-23
121,South Korea,,36.000000,128.000000,2.0,,,2020-01-24
162,South Korea,,36.000000,128.000000,2.0,,,2020-01-25
208,South Korea,,36.000000,128.000000,3.0,,,2020-01-26
...,...,...,...,...,...,...,...,...
62451,South Korea,,35.907757,127.766922,10384.0,6776.0,200.0,2020-04-08
65362,South Korea,,35.907757,127.766922,10423.0,6973.0,204.0,2020-04-09
68302,South Korea,,35.907757,127.766922,10450.0,7117.0,208.0,2020-04-10
71268,South Korea,,35.907757,127.766922,10480.0,7243.0,211.0,2020-04-11


In [13]:
# Count the missing values in each column before replacing them.

data.isnull().sum()

Country/Region       0
Province/State    7105
Latitude             1
Longitude            1
Confirmed           19
Recovered          388
Deaths             441
Date                 0
dtype: int64

In [23]:
# Replace missing values in Confirmed / Recovered / Deaths with 0.
filled_columns = ['Confirmed', 'Recovered', 'Deaths']
data[filled_columns] = data[filled_columns].fillna(0)
data.head()

Unnamed: 0,Country/Region,Province/State,Latitude,Longitude,Confirmed,Recovered,Deaths,Date
0,China,Anhui,31.8257,117.2264,1.0,0.0,0.0,2020-01-22
1,China,Beijing,40.1824,116.4142,14.0,0.0,0.0,2020-01-22
2,China,Chongqing,30.0572,107.874,6.0,0.0,0.0,2020-01-22
3,China,Fujian,26.0789,117.9874,1.0,0.0,0.0,2020-01-22
4,China,Gansu,37.8099,101.0583,0.0,0.0,0.0,2020-01-22


In [24]:
# Rename the columns to Korean labels: country, province/state, latitude,
# longitude, cumulative confirmed, cumulative recovered, cumulative deaths, date.
korean_labels = {
    'Country/Region': '국가',
    'Province/State': '시도',
    'Latitude': '위도',
    'Longitude': '경도',
    'Confirmed': '확진자누적수',
    'Recovered': '완치자누적수',
    'Deaths': '사망자누적수',
    'Date': '날짜',
}
data.rename(columns=korean_labels, inplace=True)
data.head()

Unnamed: 0,국가,시도,위도,경도,확진자누적수,완치자누적수,사망자누적수,날짜
0,China,Anhui,31.8257,117.2264,1.0,0.0,0.0,2020-01-22
1,China,Beijing,40.1824,116.4142,14.0,0.0,0.0,2020-01-22
2,China,Chongqing,30.0572,107.874,6.0,0.0,0.0,2020-01-22
3,China,Fujian,26.0789,117.9874,1.0,0.0,0.0,2020-01-22
4,China,Gansu,37.8099,101.0583,0.0,0.0,0.0,2020-01-22


In [29]:
# This is a worldwide comparison, so drop the province/state column,
# which has many missing values.
data = data.drop(columns=['시도']).copy()
data.head()

Unnamed: 0,국가,위도,경도,확진자누적수,완치자누적수,사망자누적수,날짜
0,China,31.8257,117.2264,1.0,0.0,0.0,2020-01-22
1,China,40.1824,116.4142,14.0,0.0,0.0,2020-01-22
2,China,30.0572,107.874,6.0,0.0,0.0,2020-01-22
3,China,26.0789,117.9874,1.0,0.0,0.0,2020-01-22
4,China,37.8099,101.0583,0.0,0.0,0.0,2020-01-22


In [35]:
# Build a DataFrame with the maximum cumulative confirmed count for each
# (country, latitude, longitude) combination, with the keys as plain columns.
group_keys = ['국가', '위도', '경도']
df_country = data.groupby(group_keys, as_index=False)['확진자누적수'].max()
df_country

Unnamed: 0,국가,위도,경도,확진자누적수
0,Afghanistan,33.000000,65.000000,7.0
1,Afghanistan,33.939100,67.710000,24.0
2,Afghanistan,33.939110,67.709953,607.0
3,Albania,41.153300,20.168300,446.0
4,Algeria,28.033900,1.659600,1914.0
...,...,...,...,...
4180,Zambia,-13.133900,27.849300,2.0
4181,Zambia,-13.133897,27.849332,43.0
4182,Zimbabwe,-20.000000,30.000000,1.0
4183,Zimbabwe,-19.015438,29.154857,14.0


In [36]:
import folium

In [37]:
# Mean latitude and mean longitude over all rows of the data.
lat, long = (data[axis_name].mean() for axis_name in ('위도', '경도'))

In [46]:
# Draw a clustered marker map: one marker per row of df_country, with a
# tooltip showing the country name and its confirmed count, then save the
# map to an HTML file and display it.
from folium.plugins import MarkerCluster

m = folium.Map([lat, long], zoom_start=1)
mark_cluster = MarkerCluster().add_to(m)

# Format the count while building each tooltip instead of overwriting the
# column with strings, so df_country['확진자누적수'] stays numeric for any
# later analysis.
marker_rows = zip(
    df_country['국가'],
    df_country['위도'],
    df_country['경도'],
    df_country['확진자누적수'],
)
for country, sub_lat, sub_long, confirmed in marker_rows:
    # int() drops the trailing ".0" of the float count in the tooltip text.
    tooltip = f"{country}의 확진자 수:{int(confirmed)}"

    folium.Marker(location=[sub_lat, sub_long],
                  tooltip=tooltip).add_to(mark_cluster)

m.save("encoding.html")
m

In [None]:
# TODO: try a Choropleth map next.