In [60]:
!pip install html5lib
!pip install bs4
!pip install requests

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [61]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO


def make_df(timeout=30):
    """Scrape the IMF WEO (April 2024) report page and return the 2024 column.

    The request URL selects the ``NGDPD`` series for the years 2022-2029 for a
    fixed list of country codes. The first ``<table class="fluid">`` on the
    page is parsed with pandas and reduced to two columns.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout ``requests`` may block forever.

    Returns:
        A new DataFrame with the columns ``"Country"`` and ``"2024"``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the expected table is not present in the page.
    """
    url = "https://www.imf.org/en/Publications/WEO/weo-database/2024/April/weo-report?c=512,914,612,171,614,311,213,911,314,193,122,912,313,419,513,316,913,124,339,638,514,218,963,616,223,516,918,748,618,624,522,622,156,626,628,228,924,233,632,636,634,238,662,960,423,935,128,611,321,243,248,469,253,642,643,939,734,644,819,172,132,646,648,915,134,652,174,328,258,656,654,336,263,268,532,944,176,534,536,429,433,178,436,136,343,158,439,916,664,826,542,967,443,917,544,941,446,666,668,672,946,137,546,674,676,548,556,678,181,867,682,684,273,868,921,948,943,686,688,518,728,836,558,138,196,278,692,694,962,142,449,564,565,283,853,288,293,566,964,182,359,453,968,922,714,862,135,716,456,722,942,718,724,576,936,961,813,726,199,733,184,524,361,362,364,732,366,144,146,463,528,923,738,578,537,742,866,369,744,186,925,869,746,926,466,112,111,298,927,846,299,582,487,474,754,698,&s=NGDPD,&sy=2022&ey=2029&ssm=0&scsm=1&scc=0&ssd=1&ssc=0&sic=0&sort=country&ds=.&br=1"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Locate the data table; fail loudly here instead of letting pandas try to
    # parse the literal string "None" when the page layout has changed.
    table = soup.find("table", class_="fluid")
    if table is None:
        raise ValueError("No <table class='fluid'> found in the IMF WEO response")

    # read_html returns a list of frames; the single table we passed is first.
    df = pd.read_html(StringIO(str(table)))[0]

    # Return an independent copy so later column assignments by the caller do
    # not operate on a slice of the parsed frame.
    return df[["Country", "2024"]].copy()


# Scrape the table, then apply all preprocessing steps in one chain.
df = (
    make_df()
    # Rename the '2024' column to 'GDP_USD_billion'.
    .rename(columns={'2024': 'GDP_USD_billion'})
    # Discard countries that have no value in that column.
    .dropna(subset=["GDP_USD_billion"])
    # Largest value first, with a fresh 0..n-1 index.
    .sort_values(by='GDP_USD_billion', ascending=False)
    .reset_index(drop=True)
)
print(df)

                              Country  GDP_USD_billion
0                       United States        28781.083
1                               China        18532.633
2                             Germany         4591.100
3                               Japan         4110.452
4                               India         3937.011
5                      United Kingdom         3495.261
6                              France         3130.014
7                              Brazil         2331.391
8                               Italy         2328.028
9                              Canada         2242.182
10                             Russia         2056.844
11                             Mexico         2017.025
12                          Australia         1790.348
13                              Korea         1760.947
14                              Spain         1647.114
15                          Indonesia         1475.690
16                        Netherlands         1142.513
17        

In [62]:
def countries_by_continent(timeout=30):
    """Scrape Wikipedia's by-continent country list into six name lists.

    The first six ``sortable`` tables on the page are taken, in page order, to
    be Africa, Asia, Europe, North America, Oceania and South America. Each
    table's rows are handed to ``each_continent`` to extract the names.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A 6-tuple of lists of country names:
        ``(africa, asia, europe, north_america, oceania, south_america)``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If fewer than six sortable tables are found.
    """
    url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_continent"
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors directly rather than parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # One sortable table per continent, in the order listed in the docstring.
    continent_count = 6
    tables = soup.find_all("table", {"class": "sortable"})
    if len(tables) < continent_count:
        raise IndexError(
            f"expected at least {continent_count} sortable tables, found {len(tables)}"
        )

    return tuple(
        each_continent(continent_table.find_all("tr"))
        for continent_table in tables[:continent_count]
    )


def each_continent(rows):
    """Extract country names from the ``<tr>`` rows of one continent table.

    The first three rows are skipped (presumably header rows; verify against
    the page layout). For every remaining row, the text of the first link
    inside the first ``<td>`` cell is collected.

    Args:
        rows: Sequence of row elements exposing ``find_all("td")``.

    Returns:
        A list of stripped link texts, in row order. Rows without any ``<td>``
        cell, or whose first cell contains no link, are skipped.
    """
    data = []
    for row in rows[3:]:
        cols = row.find_all("td")
        # Rows made only of <th> cells have no <td>; indexing cols[0] on them
        # would raise IndexError, so skip them.
        if not cols:
            continue
        link = cols[0].find("a")
        if link is None:
            continue
        data.append(link.text.strip())
    return data


# Fetch the six per-continent country-name lists in one call.
(
    africa,
    asia,
    europe,
    north_america,
    oceania,
    south_america,
) = countries_by_continent()

In [63]:
# Change pandas display settings: print every row and every column.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

# Add the Region column, starting with every country labelled 'Other'.
df['Region'] = 'Other'

# Label the countries found in each continent list. The order matters: a
# country present in several lists ends up with the label assigned last.
continent_members = [
    ('Africa', africa),
    ('Asia', asia),
    ('Europe', europe),
    ('North America', north_america),
    ('Oceania', oceania),
    ('South America', south_america),
]
for region_label, members in continent_members:
    df.loc[df['Country'].isin(members), 'Region'] = region_label

print(df)

                              Country  GDP_USD_billion         Region
0                       United States        28781.083  North America
1                               China        18532.633           Asia
2                             Germany         4591.100         Europe
3                               Japan         4110.452           Asia
4                               India         3937.011           Asia
5                      United Kingdom         3495.261         Europe
6                              France         3130.014         Europe
7                              Brazil         2331.391  South America
8                               Italy         2328.028         Europe
9                              Canada         2242.182  North America
10                             Russia         2056.844         Europe
11                             Mexico         2017.025  North America
12                          Australia         1790.348        Oceania
13                  

In [64]:
# Countries whose Region is still the default 'Other' label.
is_unassigned = df['Region'].eq('Other')
other_countries = df.loc[is_unassigned]
print(other_countries)

                              Country  GDP_USD_billion Region
13                              Korea         1760.947  Other
17                            Türkiye         1113.561  Other
21           Taiwan Province of China          802.958  Other
34           Islamic Republic of Iran          464.181  Other
38                      Hong Kong SAR          406.775  Other
60                    Slovak Republic          140.808  Other
75                      Côte d'Ivoire           86.911  Other
83   Democratic Republic of the Congo           73.761  Other
89                          Macao SAR           54.677  Other
134                 Brunei Darussalam           15.510  Other
135                 Republic of Congo           15.501  Other
136                        Lao P.D.R.           15.190  Other
137                       The Bahamas           14.390  Other
139                   Kyrgyz Republic           13.599  Other
164                        Cabo Verde            2.718  Other
165     

In [65]:
# Manual continent assignments for country names that were still labelled
# 'Other' after matching against the scraped continent lists.
region_mapping = {
    'Korea': 'Asia',
    'Türkiye': 'Europe',
    'Taiwan Province of China': 'Asia',
    'Islamic Republic of Iran': 'Asia',
    'Hong Kong SAR': 'Asia',
    'Slovak Republic': 'Europe',
    "Côte d'Ivoire": 'Africa',
    'Democratic Republic of the Congo': 'Africa',
    'Macao SAR': 'Asia',
    'Brunei Darussalam': 'Asia',
    'Republic of Congo': 'Africa',
    'Lao P.D.R.': 'Asia',
    'The Bahamas': 'North America',
    'Kyrgyz Republic': 'Asia',
    'Cabo Verde': 'Africa',
    'The Gambia': 'Africa',
    'St. Lucia': 'North America',
    'St. Kitts and Nevis': 'North America',
    'Timor-Leste': 'Asia',
    'St. Vincent and the Grenadines': 'North America'
}

# Update the Region column for the rows that are still 'Other'.
# Series.map yields NaN for any country missing from region_mapping; fillna
# keeps those rows labelled 'Other' so a later "Region == 'Other'" check still
# reveals them instead of silently turning them into NaN.
still_other = df['Region'] == 'Other'
df.loc[still_other, 'Region'] = (
    df.loc[still_other, 'Country'].map(region_mapping).fillna('Other')
)

print(df)

                              Country  GDP_USD_billion         Region
0                       United States        28781.083  North America
1                               China        18532.633           Asia
2                             Germany         4591.100         Europe
3                               Japan         4110.452           Asia
4                               India         3937.011           Asia
5                      United Kingdom         3495.261         Europe
6                              France         3130.014         Europe
7                              Brazil         2331.391  South America
8                               Italy         2328.028         Europe
9                              Canada         2242.182  North America
10                             Russia         2056.844         Europe
11                             Mexico         2017.025  North America
12                          Australia         1790.348        Oceania
13                  

In [66]:
# Re-check after the manual mapping: list any rows still labelled 'Other'.
other_countries = df.loc[df['Region'].isin(['Other'])]
print(other_countries)

Empty DataFrame
Columns: [Country, GDP_USD_billion, Region]
Index: []


In [73]:
# Explicit per-country region overrides.
# NOTE(review): presumably these two names appear in more than one continent
# list, so the list-based assignment gave them the wrong label; confirm.
manual_regions = {'Egypt': 'Africa', 'Indonesia': 'Asia'}
for country_name, region_name in manual_regions.items():
    df.loc[df['Country'] == country_name, 'Region'] = region_name

- GDP가 100B USD 이상인 국가만 구해서 화면에 출력해야 합니다.

In [74]:
# Keep only the rows whose 'GDP_USD_billion' value is at least 100.
meets_threshold = df['GDP_USD_billion'].ge(100)
filtered_df = df.loc[meets_threshold]
print(filtered_df)

                     Country  GDP_USD_billion         Region
0              United States        28781.083  North America
1                      China        18532.633           Asia
2                    Germany         4591.100         Europe
3                      Japan         4110.452           Asia
4                      India         3937.011           Asia
5             United Kingdom         3495.261         Europe
6                     France         3130.014         Europe
7                     Brazil         2331.391  South America
8                      Italy         2328.028         Europe
9                     Canada         2242.182  North America
10                    Russia         2056.844         Europe
11                    Mexico         2017.025  North America
12                 Australia         1790.348        Oceania
13                     Korea         1760.947           Asia
14                     Spain         1647.114         Europe
15                 Indon

 - 각 Region별로 top5 국가의 GDP 평균을 구해서 화면에 출력해야 합니다.

In [75]:
# Five largest economies of every region, ordered by region name and then by
# descending GDP.
# This avoids groupby(...).apply(...), which warns about operating on the
# grouping column; once pandas excludes grouping columns from apply, 'Region'
# would be lost and the per-region average computed afterwards would break.
top5_per_region = (
    df.dropna(subset=["Region"])  # groupby ignores rows with a missing key anyway
    .sort_values(["Region", "GDP_USD_billion"], ascending=[True, False])
    .groupby("Region")
    .head(5)  # first five rows of each group, i.e. the five largest GDP values
    .reset_index(drop=True)  # rebuild a 0..n-1 index; drop=True discards the old one
)
top5_per_region

  df.groupby("Region")


Unnamed: 0,Country,GDP_USD_billion,Region
0,South Africa,373.233,Africa
1,Egypt,347.594,Africa
2,Algeria,266.78,Africa
3,Nigeria,252.738,Africa
4,Ethiopia,205.13,Africa
5,China,18532.633,Asia
6,Japan,4110.452,Asia
7,India,3937.011,Asia
8,Korea,1760.947,Asia
9,Indonesia,1475.69,Asia


In [76]:
# Mean GDP of the selected top countries in each region, rounded to two
# decimals, returned as a two-column frame (Region, GDP_USD_billion).
average_gdp_per_region = (
    top5_per_region
    .groupby("Region")["GDP_USD_billion"]
    .mean()
    .round(2)
    .reset_index()
)
average_gdp_per_region

Unnamed: 0,Region,GDP_USD_billion
0,Africa,289.09
1,Asia,5963.35
2,Europe,3120.25
3,North America,6657.08
4,Oceania,417.44
5,South America,787.59
