In [2]:
# data > gapminder.tsv 파일로부터 데이터를 불러와서 EDA 방식으로 데이터 살펴보기

# Pandas 불러오기
import pandas as pd

# Load the tab-separated gapminder dataset into a DataFrame
df = pd.read_csv(
    "./data/gapminder.tsv",
    delimiter="\t",  # `delimiter` is an alias for `sep`; the file is tab-separated
)
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [5]:
# Preview the first rows of the dataset.
# head() shows 5 rows by default; pass a count to choose how many are shown.
df.head(10)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439
7,Afghanistan,Asia,1987,40.822,13867957,852.395945
8,Afghanistan,Asia,1992,41.674,16317921,649.341395
9,Afghanistan,Asia,1997,41.763,22227415,635.341351


In [7]:
# Preview the last rows of the dataset (5 is also what tail() uses by default)
df.tail(5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [8]:
# Check the Python type of the loaded object
print(type(df))
# DataFrame -> the basic data structure of pandas

<class 'pandas.core.frame.DataFrame'>


In [11]:
# Shape (size) of the DataFrame: df.shape is a (row count, column count) tuple.
# A bare `df.shape` in the middle of a cell is never displayed (only a cell's
# last expression is), so the tuple is simply read inside the f-string below.
print(f"{df.shape[0]}행 x {df.shape[1]}열")

1704행 x 6열


In [12]:
# Column labels contained in the DataFrame
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [13]:
# Data type (dtype) of each column
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [14]:
# Overall summary of the DataFrame: index range, per-column non-null counts
# and dtypes, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [16]:
# Pull out a single column: indexing with one label gives back a Series
countries = df["country"]
print("type:", type(countries))
print("dtype:", countries.dtype)
countries.head(5)

type: <class 'pandas.core.series.Series'>
dtype: object


0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

In [17]:
# Pull out several columns at once: indexing with a list of labels
# gives back a DataFrame rather than a Series
subset = df[["country", "continent", "year"]]
print("type:", type(subset))
subset.head()

type: <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [18]:
# Extracting a single row (observation)
# loc  -> selects by index label
# iloc -> selects by integer position (0-based row number)
# When no index is specified, pandas numbers the rows 0, 1, 2, ... itself,
# so label 3 and position 3 refer to the same row in this frame.
row_by_label = df.loc[3]      # row whose index label is 3
row_by_position = df.iloc[3]  # row at position 3
print(row_by_label)
print(row_by_position)

country      Afghanistan
continent           Asia
year                1967
lifeExp            34.02
pop             11537966
gdpPercap     836.197138
Name: 3, dtype: object
country      Afghanistan
continent           Asia
year                1967
lifeExp            34.02
pop             11537966
gdpPercap     836.197138
Name: 3, dtype: object


In [19]:
# Slice-based selection with loc: `:` keeps every row,
# the list picks the year and pop columns
subset2 = df.loc[:, ["year", "pop"]]
subset2.head()

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460


In [20]:
# From all rows, take the columns at positions 2, 4 and -1 (the last one)
# Negative positions count from the end with the position-based iloc indexer;
# loc would look for a label named -1 instead
subset3 = df.iloc[:, [2, 4, -1]]
subset3.head()

Unnamed: 0,year,pop,gdpPercap
0,1952,8425333,779.445314
1,1957,9240934,820.85303
2,1962,10267083,853.10071
3,1967,11537966,836.197138
4,1972,13079460,739.981106


In [21]:
# Column selection driven by a range object: range([start,] stop[, step])
# Stepping by 2 from position 0 keeps the columns at even 0-based positions
# (0, 2, 4, ...), i.e. the 1st, 3rd, 5th, ... columns.
subset4 = df.iloc[:, range(0, len(df.columns), 2)]
subset4.head()

Unnamed: 0,country,year,pop
0,Afghanistan,1952,8425333
1,Afghanistan,1957,9240934
2,Afghanistan,1962,10267083
3,Afghanistan,1967,11537966
4,Afghanistan,1972,13079460


In [22]:
# Group the rows of df by the values of the year column
grouped_year_df = df.groupby(by="year")
print(type(grouped_year_df))  # DataFrameGroupBy
grouped_year_df

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000214EAF32650>

In [23]:
# Mean life expectancy within each year group
grouped_year_df["lifeExp"].mean()

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [24]:
grouped_year_df.describe() # summary-statistics method (count, mean, std, min, quartiles, max per group)

Unnamed: 0_level_0,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,pop,pop,pop,pop,pop,gdpPercap,gdpPercap,gdpPercap,gdpPercap,gdpPercap,gdpPercap,gdpPercap,gdpPercap
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1952,142.0,49.05762,12.225956,28.801,39.059,45.1355,59.765,72.67,142.0,16950400.0,...,9168197.75,556263500.0,142.0,3725.276046,9321.064786,298.846212,864.752389,1968.528344,3913.492777,108382.3529
1957,142.0,51.507401,12.231286,30.332,41.2475,48.3605,63.03675,73.47,142.0,18763410.0,...,9817598.0,637408000.0,142.0,4299.408345,9869.662202,335.997115,930.540819,2173.220291,4876.356362,113523.1329
1962,142.0,53.609249,12.097245,31.997,43.4685,50.881,65.2345,73.68,142.0,20421010.0,...,10980084.5,665770000.0,142.0,4725.812342,8667.362525,355.203227,1059.149171,2335.439533,5709.381428,95458.11176
1967,142.0,55.67829,11.718858,34.02,46.03375,53.825,67.4195,74.16,142.0,22658300.0,...,12614584.75,754550000.0,142.0,5483.653047,8095.315431,349.0,1151.245103,2678.33474,7075.932943,80894.88326
1972,142.0,57.647386,11.381953,35.4,48.50025,56.53,69.2475,74.72,142.0,25189980.0,...,14679199.5,862030000.0,142.0,6770.082815,10614.383403,357.0,1257.193853,3339.129407,9508.839304,109347.867
1977,142.0,59.570157,11.227229,31.22,50.4755,59.672,70.3825,76.11,142.0,27676380.0,...,16670227.0,943455000.0,142.0,7313.166421,8362.48915,371.0,1357.257252,3798.609244,11204.102423,59265.47714
1982,142.0,61.533197,10.770618,38.445,52.94,62.4415,70.92125,77.11,142.0,30207300.0,...,18407324.75,1000281000.0,142.0,7518.901673,7733.845006,424.0,1363.338985,4216.228428,12347.953722,33693.17525
1987,142.0,63.212613,10.556285,39.906,54.94075,65.834,71.87725,78.67,142.0,33038570.0,...,20947542.5,1084035000.0,142.0,7900.920218,8288.281304,385.0,1327.469823,4280.300366,11994.052795,31540.9748
1992,142.0,64.160338,11.22738,23.599,56.12175,67.703,72.5825,79.36,142.0,35990920.0,...,22705382.5,1164970000.0,142.0,8158.608521,9031.84608,347.0,1270.660958,4386.085502,10684.35187,34932.91959
1997,142.0,65.014676,11.559439,36.087,55.63375,69.394,74.16975,80.69,142.0,38839470.0,...,24311369.75,1230075000.0,142.0,9090.175363,10171.493263,312.188423,1366.837958,4781.825478,12022.867188,41283.16433


In [25]:
# Arithmetic mean of several columns at once:
# lifeExp and gdpPercap averaged per year
grouped_year_df[["lifeExp", "gdpPercap"]].mean()

Unnamed: 0_level_0,lifeExp,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1952,49.05762,3725.276046
1957,51.507401,4299.408345
1962,53.609249,4725.812342
1967,55.67829,5483.653047
1972,57.647386,6770.082815
1977,59.570157,7313.166421
1982,61.533197,7518.901673
1987,63.212613,7900.920218
1992,64.160338,8158.608521
1997,65.014676,9090.175363


In [27]:
# Distinct countries within each continent group:
# unique() lists the country names, nunique() counts them.
countries_by_continent = df.groupby("continent")["country"]
print(countries_by_continent.unique())
countries_by_continent.nunique()

continent
Africa      [Algeria, Angola, Benin, Botswana, Burkina Fas...
Americas    [Argentina, Bolivia, Brazil, Canada, Chile, Co...
Asia        [Afghanistan, Bahrain, Bangladesh, Cambodia, C...
Europe      [Albania, Austria, Belgium, Bosnia and Herzego...
Oceania                              [Australia, New Zealand]
Name: country, dtype: object


continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64

In [None]:
# EDA에서 Visualization은 아주 중요한 절차다
