# <font color=blue>Сбор датасета из таблиц википедии</font>

В датасете будут данные о населении, ВВП (номинал и ППС), протяженности автомобильных и железных дорог, уровне промышленного производства.

Датасет соберем из таблиц, выложенных в английской Википедии.

In [1455]:
import math
import re
import requests

import lxml.html as lh
import numpy as np
import pandas as pd
import seaborn as sns


sns.set()


def get_col_names(col_names):
    """Flatten DataFrame column labels into a list of plain names.

    When a table header spans more than one row, `pandas.read_html()`
    returns the columns as a MultiIndex whose labels are tuples. Each
    tuple is collapsed into a single string: repeated elements are
    dropped (the first occurrence is kept) and the remaining ones are
    joined with '/'. For example `('Rank', 'Rank')` becomes `'Rank'` and
    `('Length', 'Electrified')` becomes `'Length/Electrified'`.
    Labels that are already `str` or `int` are returned unchanged.

    Parameters:
        col_names: iterable of column labels (e.g. `df.columns`).

    Returns:
        list with one flattened name per input label.

    Raises:
        ValueError: if a label is neither a tuple, a `str` nor an `int`.
    """
    names = []
    for label in col_names:
        if isinstance(label, tuple):
            # Keep only the first occurrence of every header level,
            # preserving the original order.
            parts = []
            for part in label:
                if part not in parts:
                    parts.append(part)
            # str() lets tuples with non-string levels (e.g. integers)
            # be joined instead of failing on string concatenation.
            names.append('/'.join(str(part) for part in parts))
        elif isinstance(label, (str, int)):
            names.append(label)
        else:
            raise ValueError(
                "Unsupported column name type {}\n"
                "Only types `str`, `int` and `Tuple[str]` "
                "are supported.\n"
                "column name: {}".format(type(label), label)
            )
    return names


def remove_excess_info(s):
    """Strip footnote markers and similar noise from a table cell.

    For a string, the following are removed: fragments in square
    brackets, fragments in parentheses (both together with the
    brackets), and the characters `*`, `~` and `%`. Runs of spaces are
    then collapsed into one space and the result is stripped of leading
    and trailing whitespace.

    Parameters:
        s: any object. Non-string objects are returned unchanged.

    Returns:
        the cleaned string, or `s` itself when it is not a string.
    """
    if not isinstance(s, str):
        return s
    without_notes = re.sub(
        r'((\[[^\]]*])|(\*)|(~)|(%)|(\([^\)]*\)))', '', s)
    single_spaced = re.sub(' +', ' ', without_notes)
    return single_spaced.strip()

        
def tofloat(x):
    """Convert a table cell to `float`, mapping missing values to NaN.

    A cell counts as missing when it is a NaN float or a string that,
    once surrounding whitespace is stripped, is empty or consists only
    of the dash characters '—' / '–'. For other strings, commas and any
    whitespace (including non-breaking spaces) are removed before the
    conversion, so '4,846,500' and '1 234' are both accepted.

    Parameters:
        x: the cell value; a string, a number, or NaN.

    Returns:
        float: the parsed number, or NaN for a missing value.

    Raises:
        ValueError: if the value cannot be interpreted as a number.
    """
    if isinstance(x, float) and math.isnan(x):
        return float('nan')
    if isinstance(x, str):
        # Explicit "missing" test: nothing is left once whitespace and
        # dash placeholders are stripped away (covers '' as well).
        if not x.strip().strip('—–'):
            return float('nan')
        # Drop thousands separators: commas and any kind of whitespace.
        x = re.sub(r'[,\s]+', '', x)
    try:
        return float(x)
    except ValueError as err:
        # Keep the original failure attached as the cause.
        raise ValueError(
            "Cannot convert {} to `float`\nx=={}".format(
                type(x), x)) from err
    


def get_tables_from_html(url, timeout=30):
    """Download a web page and return every table found on it.

    Every `<table>` element in the document is parsed, including tables
    nested inside other tables, so a nested table appears both as part
    of its parent and as a separate entry. Multi-row headers are
    flattened with `get_col_names`.

    Parameters:
        url: address of the page, a string.
        timeout: seconds to wait for the server before giving up.

    Returns:
        list of `pandas.DataFrame` objects, in document order.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    import io  # local import: needed only to wrap markup for read_html

    # Download the page; fail loudly instead of parsing an error page,
    # and never wait forever on an unresponsive server.
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    # Turn the downloaded page into an html document.
    doc = lh.fromstring(page.content.decode('utf-8'))
    dfs = []
    # Extract every table from the document.
    for table in doc.xpath('//table'):
        # read_html expects a file-like object rather than raw markup.
        markup = lh.tostring(table, encoding='unicode')
        df = pd.read_html(io.StringIO(markup))[0]
        # Flatten multi-row headers into plain column names.
        df.columns = get_col_names(df.columns)
        dfs.append(df)
    return dfs


def prepare_wiki_table(df, int_cols):
    """Clean a scraped table and convert the given columns to numbers.

    Every cell is first passed through `remove_excess_info`; after that
    each column listed in `int_cols` is converted element-wise with
    `tofloat`.

    Parameters:
        df: `pandas.DataFrame` with the raw table.
        int_cols: iterable of column labels to convert with `tofloat`.

    Returns:
        the cleaned `pandas.DataFrame` (the frame produced by
        `applymap`, with the listed columns replaced).
    """
    cleaned = df.applymap(remove_excess_info)
    for column in int_cols:
        numeric = cleaned[column].map(tofloat)
        cleaned[column] = numeric
    return cleaned


# Maps alternative country spellings that occur in the scraped tables to
# one canonical name. It is applied to the index of every table via
# `rename(index=country_rename)` so that the same country gets the same
# label in all tables before they are concatenated.
# NOTE(review): the key 'Saint Helena, Ascensionand Tristan da Cunha'
# looks like it is missing a space; presumably it mirrors the text as
# scraped -- confirm before changing it.
country_rename = {
    'Bahamas, The': 'Bahamas',
    'Brunei Darussalam': 'Brunei',
    'Cabo Verde': 'Cape Verde',
    'Congo, Democratic Republic of': 'Democratic Republic of the Congo',
    'Congo, Democratic Republic of the': 'Democratic Republic of the Congo',
    'Congo, Republic of': 'Republic of the Congo',
    'Congo, Republic of the': 'Republic of the Congo',
    'Congo': 'Republic of the Congo',
    'Republic of Congo': 'Republic of the Congo',
    'Côte d\'Ivoire': 'Cote d\'Ivoire',
    'Ivory Coast': 'Cote d\'Ivoire',
    'Curaçao': 'Curacao',
    'Gambia, The': 'Gambia',
    'Micronesia, Federated States of': 'Federated States of Micronesia',
    'Saint Helena, Ascensionand Tristan da Cunha': 'Saint Helena',
    'Swaziland': 'Eswatini',
    'Slovak Republic': 'Slovakia',
    'St. Kitts and Nevis': 'Saint Kitts and Nevis',
    'St. Lucia': 'Saint Lucia',
    'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'São Tomé and Príncipe': 'Sao Tome and Principe',
    'Korea, South': 'South Korea',
    'Korea, North': 'North Korea',
    'Timor-Leste': 'East Timor', 
    'West Bank and Gaza': 'Palestine',
    'West Bank': 'Palestine',
}

### Автодороги

In [1456]:
url = "https://en.wikipedia.org/wiki/List_of_countries_by_road_network_size"
dfs = get_tables_from_html(url)

In [1457]:
len(dfs)

3

In [1458]:
dfs[1].head()

Unnamed: 0,Rank,Country,Length (km)/Roads,Length (km)/Expressways,Date of information
0,1,United States,6853024,108394,2017[1]
1,—,European Union[n 1],6250547,84190,2014-2018[n 2]
2,2,India,5903293,1583,2019[3]
3,3,China,"4,846,500[4]",142500,2018[5]
4,4,Brazil,1751868,11000,2013


In [1459]:
roads_df = prepare_wiki_table(dfs[1], ['Rank', 'Length (km)/Roads', 'Length (km)/Expressways'])


In [1460]:
roads_df.head()

Unnamed: 0,Rank,Country,Length (km)/Roads,Length (km)/Expressways,Date of information
0,1.0,United States,6853024.0,108394.0,2017
1,,European Union,6250547.0,84190.0,2014-2018
2,2.0,India,5903293.0,1583.0,2019
3,3.0,China,4846500.0,142500.0,2018
4,4.0,Brazil,1751868.0,11000.0,2013


In [1461]:
print(roads_df.dtypes)

Rank                       float64
Country                     object
Length (km)/Roads          float64
Length (km)/Expressways    float64
Date of information         object
dtype: object


In [1462]:
roads_df = roads_df.set_index('Country')
roads_df = roads_df.rename(index=country_rename)

### Железные дороги

In [1463]:
url = "https://en.wikipedia.org/wiki/List_of_countries_by_rail_transport_network_size"
dfs = get_tables_from_html(url)

In [1464]:
len(dfs)

3

In [1465]:
dfs[1].head()

Unnamed: 0,Rank,Country,Length(km),Electrified length(km),Historic peak length(km),Area (km2) per km track,Population per km track,Nationalised or Private,Data year,Notes
0,—,European Union[n 1],218783.0,116561,"189,297[n 2]",20.46,2347,Both,2016-17,[3]
1,1,United States,149910.0,"1,600[4]","409,000[5]",65.55,2060,Private,2017,[3]
2,2,China,131000.0,"90,000[6]",,75.6[6],"10,945[6]",Nationalised,2017,[6]
3,3,Russia,85500.0,43700,,199.98,1678,Nationalised,2017,[7]
4,4,India,68442.0,35488,,48.8,19656,Nationalised,2019,[8]


In [1466]:
rail = prepare_wiki_table(
    dfs[1],
    ['Rank', 'Length(km)', 'Electrified length(km)', 'Area (km2) per km track', 'Population per km track']
) 

In [1467]:
rail.describe()

Unnamed: 0,Rank,Length(km),Electrified length(km),Area (km2) per km track,Population per km track
count,147.0,149.0,67.0,149.0,149.0
mean,73.986395,18390.13,7268.353731,1207.739262,36598.14
std,42.579175,114562.2,18789.504775,6606.3149,150365.2
min,1.0,0.3,0.0,1.18,674.0
25%,37.5,622.0,313.0,33.8,2347.0
50%,74.0,1931.0,1587.0,123.04,6604.0
75%,110.0,5459.0,4212.5,330.45,16534.0
max,147.0,1370782.0,116561.0,59200.0,1557550.0


In [1468]:
rail = rail.set_index('Country')
rail = rail.rename(index=country_rename)

### Население

In [1469]:
url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
dfs = get_tables_from_html(url)

In [1470]:
len(dfs)

2

In [1471]:
dfs[0].head()

Unnamed: 0,Rank,Country(or dependent territory),Population,Date,% of worldpopulation,Source
0,1,China[b],1400260000,2 December 2019,18.1%,Official population clock[3]
1,2,India,1355550000,2 December 2019,17.5%,Official population clock[4]
2,3,United States[c],330343000,2 December 2019,4.26%,Official population clock[5]
3,4,Indonesia,266911900,1 July 2019,,Official annual projection[6]
4,5,Pakistan,217802000,2 December 2019,2.81%,Official population clock[7]


In [1472]:
population = prepare_wiki_table(dfs[0], ['Rank', 'Population', '% of worldpopulation'])

In [1473]:
population.head()

Unnamed: 0,Rank,Country(or dependent territory),Population,Date,% of worldpopulation,Source
0,1.0,China,1400260000.0,2 December 2019,18.1,Official population clock
1,2.0,India,1355550000.0,2 December 2019,17.5,Official population clock
2,3.0,United States,330343000.0,2 December 2019,4.26,Official population clock
3,4.0,Indonesia,266911900.0,1 July 2019,,Official annual projection
4,5.0,Pakistan,217802000.0,2 December 2019,2.81,Official population clock


In [1474]:
# Look the 'World' row up by name instead of a hard-coded row label,
# which breaks as soon as the source table gains or loses a row.
is_world = population["Country(or dependent territory)"] == 'World'
world_population = population.loc[is_world, 'Population'].iloc[0]

In [1475]:
world_population

7751517000.0

In [1476]:
# NOTE: despite the column name, the value stored here is a fraction of
# the world population (0..1), not a percentage: the ratio is not
# multiplied by 100.
population['% of worldpopulation'] = population['Population'] / world_population

In [1477]:
population.head()

Unnamed: 0,Rank,Country(or dependent territory),Population,Date,% of worldpopulation,Source
0,1.0,China,1400260000.0,2 December 2019,0.180643,Official population clock
1,2.0,India,1355550000.0,2 December 2019,0.174875,Official population clock
2,3.0,United States,330343000.0,2 December 2019,0.042617,Official population clock
3,4.0,Indonesia,266911900.0,1 July 2019,0.034434,Official annual projection
4,5.0,Pakistan,217802000.0,2 December 2019,0.028098,Official population clock


In [1478]:
population = population.set_index('Country(or dependent territory)')
population = population.rename(index=country_rename)

### ВВП по номиналу

In [1479]:
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
dfs = get_tables_from_html(url)

In [1480]:
len(dfs)

10

In [1481]:
dfs[0].head()

Unnamed: 0,0
0,Largest economies by nominal GDP in 2019[1]


In [1482]:
dfs[1].head()

Unnamed: 0,0,1,2
0,>$20 trillion $10–$20 trillion $5–$10 trillion...,$750 billion–$1 trillion $500–$750 billion $25...,$50–$100 billion $25–$50 billion $5–$25 billio...


In [1483]:
# Здесь таблица, объединяющая в себе 3 другие таблицы. 
# Наш обработчик не умеет такое парсить.
dfs[2].head(10)

Unnamed: 0,0,1,2
0,Per the International Monetary Fund (2019 esti...,Per the World Bank (2018)[20],Per the United Nations (2017)[21][22]
1,Rank Country/Territory GDP(US$million) World[...,Rank Country/Territory GDP(US$million) World ...,Rank Country/Territory GDP(US$million) World[...
2,Rank,Country/Territory,GDP(US$million)
3,,World[19],87265226
4,1,United States,21439453
5,—,European Union[23][n 1],18705132
6,2,China[n 2],14140163
7,3,Japan,5154475
8,4,Germany,3863344
9,5,India,2935570


In [1484]:
# по оценкам МВФ
dfs[3].head()

Unnamed: 0,Rank,Country/Territory,GDP(US$million)
0,,World[19],87265226
1,1,United States,21439453
2,—,European Union[23][n 1],18705132
3,2,China[n 2],14140163
4,3,Japan,5154475


In [1485]:
# по оценкам Всемирного банка
dfs[4].head()

Unnamed: 0,Rank,Country/Territory,GDP(US$million)
0,,World,85804391
1,1.0,United States,20494100
2,2.0,China[n 5],13608152
3,3.0,Japan,4970916
4,4.0,Germany,3996759


In [1486]:
# по оценкам ООН
dfs[5].head()

Unnamed: 0,Rank,Country/Territory,GDP(US$million)
0,,World[25],80501413
1,1.0,United States,19485394
2,2.0,China[n 5],12234781
3,3.0,Japan,4872415
4,4.0,Germany,3693204


Работаем с оценками Всемирного банка (таблица `dfs[4]`).

In [1487]:
gdp_nominal = prepare_wiki_table(dfs[4], ['Rank', 'GDP(US$million)'])

In [1488]:
gdp_nominal.shape

(191, 3)

In [1489]:
gdp_nominal.describe()

Unnamed: 0,Rank,GDP(US$million)
count,185.0,191.0
mean,93.0,893798.7
std,53.549043,6451909.0
min,1.0,43.0
25%,47.0,9374.5
50%,93.0,40288.0
75%,139.0,242250.5
max,185.0,85804390.0


In [1490]:
gdp_nominal.columns

Index(['Rank', 'Country/Territory', 'GDP(US$million)'], dtype='object')

In [1491]:
gdp_nominal = gdp_nominal.set_index('Country/Territory')
gdp_nominal = gdp_nominal.rename(index=country_rename)

### ВВП по ППС

In [1492]:
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)"
dfs = get_tables_from_html(url)

In [1493]:
len(dfs)

10

In [1494]:
dfs[3].head()

Unnamed: 0,Rank,Country/Territory,GDP(millions of current Int$),GDP(millions of current Int$).1
0,,World,141859625,[9]
1,1,China[n 2],27308857,
2,—,European Union[n 3],22774165,[11]
3,2,United States,21439453,
4,3,India,11325669,


In [1495]:
gdp_ppp = prepare_wiki_table(dfs[3], ['Rank', 'GDP(millions of current Int$)'])

In [1496]:
gdp_ppp.head()

Unnamed: 0,Rank,Country/Territory,GDP(millions of current Int$),GDP(millions of current Int$).1
0,,World,141859625.0,
1,1.0,China,27308857.0,
2,,European Union,22774165.0,
3,2.0,United States,21439453.0,
4,3.0,India,11325669.0,


In [1497]:
gdp_ppp.describe()

Unnamed: 0,Rank,GDP(millions of current Int$)
count,187.0,193.0
mean,94.0,1586323.0
std,54.126395,10623850.0
min,1.0,47.0
25%,47.5,22179.0
50%,94.0,79591.0
75%,140.5,413119.0
max,187.0,141859600.0


In [1498]:
gdp_ppp = gdp_ppp.set_index('Country/Territory')
gdp_ppp = gdp_ppp.rename(index=country_rename)

### Промышленное производство

In [1499]:
url = "https://en.wikipedia.org/wiki/Manufacturing"
dfs = get_tables_from_html(url)

In [1500]:
dfs[1].head()

Unnamed: 0,Rank,Country/Region,Millions of $US,Year
0,,World,13171000,2017
1,1.0,China,4002752,2018
2,2.0,United States,2173319,2017
3,3.0,Japan,1007330,2017
4,4.0,Germany,832431,2018


In [1501]:
manufacturing = prepare_wiki_table(dfs[1], ['Rank', 'Millions of $US'])

In [1502]:
manufacturing.head()

Unnamed: 0,Rank,Country/Region,Millions of $US,Year
0,,World,13171000.0,2017
1,1.0,China,4002752.0,2018
2,2.0,United States,2173319.0,2017
3,3.0,Japan,1007330.0,2017
4,4.0,Germany,832431.0,2018


In [1503]:
manufacturing = manufacturing.set_index('Country/Region')
manufacturing = manufacturing.rename(index=country_rename)

### Объединение экономических показателей в один датасет

In [1504]:
# Pick the indicators of interest from every table and give them short,
# unambiguous names. `rename` returns a new frame, so the source tables
# are left untouched and no labels are assigned on a selection.
roads_ = roads_df[['Length (km)/Roads', 'Length (km)/Expressways']].rename(
    columns={
        'Length (km)/Roads': 'roads, km',
        'Length (km)/Expressways': 'expressways, km',
    }
)

rail_ = rail[
    [
        'Length(km)',
        'Electrified length(km)',
        'Area (km2) per km track'
    ]
].rename(
    columns={
        'Length(km)': 'railways, km',
        'Electrified length(km)': 'electrified railways, km',
        'Area (km2) per km track': 'area (km2) per km track',
    }
)

population_ = population[['Population', '% of worldpopulation']].rename(
    columns={
        'Population': 'population, persons',
        '% of worldpopulation': 'population, %',
    }
)

nominal_ = gdp_nominal[['GDP(US$million)']].rename(
    columns={'GDP(US$million)': 'nominal GDP, US$million'}
)

ppp_ = gdp_ppp[['GDP(millions of current Int$)']].rename(
    columns={
        'GDP(millions of current Int$)': 'GDP PPP, millions of current Int$',
    }
)

manufacturing_ = manufacturing[['Millions of $US']].rename(
    columns={'Millions of $US': 'manufacturing, millions of $US'}
)


# The tables cover different sets of countries, so join on the union of
# their indexes. `sort=True` pins the sorted row order explicitly: pandas
# warns that the default for non-aligned indexes is changing to unsorted.
stats = pd.concat(
    [
        roads_,
        rail_,
        population_,
        nominal_,
        ppp_,
        manufacturing_,
    ],
    axis=1,
    join='outer',
    sort=True,
)



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [1505]:
stats.head()


Unnamed: 0,"roads, km","expressways, km","railways, km","electrified railways, km",area (km2) per km track,"population, persons","population, %","nominal GDP, US$million","GDP PPP, millions of current Int$","manufacturing, millions of $US"
Abkhazia,,,,,,244832.0,3.2e-05,,,
Afghanistan,43916.0,,75.0,,8696.4,32225560.0,0.004157,19363.0,76486.0,
Albania,18000.0,177.0,334.0,,86.07,2862427.0,0.000369,15059.0,40151.0,
Algeria,113655.0,1394.0,4440.0,283.0,536.43,42200000.0,0.005444,180689.0,681396.0,
American Samoa,241.0,,,,,56700.0,7e-06,,,


In [1507]:
import os


fn = 'datasets/wiki_countries.tsv'
os.makedirs(os.path.split(fn)[0], exist_ok=True)

stats.to_csv(fn, sep='\t')

In [998]:
stats.shape

(243, 10)

In [1437]:
data = stats[['roads, km', 'railways, km', 'population, persons', 'nominal GDP, US$million']]

In [1438]:
world = data.loc[['World'], :]

In [1439]:
world

Unnamed: 0,"roads, km","railways, km","population, persons","nominal GDP, US$million"
World,64285009.0,1370782.0,7751517000.0,85804391.0


In [1440]:
data = data.drop(index='World')

In [1441]:
data = data.sort_values('nominal GDP, US$million', ascending=False)

In [1347]:
data.head(10)

Unnamed: 0,"roads, km","railways, km","population, persons","nominal GDP, US$million"
United States,6853024.0,149910.0,330343000.0,20494100.0
China,4846500.0,131000.0,1400260000.0,13608152.0
Japan,1215000.0,27311.0,126140000.0,4970916.0
Germany,644480.0,38594.0,83073100.0,3996759.0
United Kingdom,397039.0,16320.0,66435600.0,2825208.0
France,965446.0,29273.0,67081000.0,2777535.0
India,5903293.0,68442.0,1355550000.0,2726323.0
Italy,487700.0,16788.0,60262700.0,2073902.0
Brazil,1751868.0,38743.0,210795000.0,1868626.0
Canada,1042300.0,49452.0,37836400.0,1712510.0


In [1348]:
data = data.drop(['China', 'India'], axis=0)

In [1349]:
data.shape

(240, 4)

In [1350]:
# A missing road or railway length is treated as "no network" (0).
# Fill both columns in one call and rebind `data`: an in-place fillna on
# a column selection may operate on a temporary copy and leave `data`
# unchanged.
data = data.fillna({'roads, km': 0, 'railways, km': 0})

In [1351]:
data.shape

(240, 4)

In [1352]:
# Drop the rows (countries) that still have a missing value; the axis
# is passed by keyword because the positional form is deprecated.
dt = data.dropna(axis=0)

In [1353]:
dt = dt.divide(world.iloc[0], axis='columns')

In [1354]:
dt = dt.sample(frac=1)

In [1355]:
train = dt.iloc[:-50, :]
test = dt.iloc[-50:, :]

In [1365]:
from sklearn import datasets
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsRegressor

In [1453]:
regressor = KNeighborsRegressor(4)
regressor.fit(train.iloc[:, :-1], train.iloc[:, -1:])
score = regressor.score(test.iloc[:, :-1], test.iloc[:, -1:])

In [1454]:
score

0.502224650694365

In [1434]:
# Drop the rows with missing values; the axis is passed by keyword
# because the positional form is deprecated.
dt = data.dropna(axis=0)
# Express every indicator as a share of the world total.
dt = dt.divide(world.iloc[0], axis='columns')
# Skip the first 6 rows and split the rest alternately:
# even positions go to train, odd positions go to test.
train = dt.iloc[6::2, :]
test = dt.iloc[7::2, :]

In [1442]:
from sklearn.preprocessing import MinMaxScaler

In [1443]:
scaler = MinMaxScaler()
# fit_transform returns a bare array; pass the index along with the
# column names so the scaled rows keep their country labels.
df_scaled = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,
)

In [1446]:
# Drop the rows with missing values; the axis is passed by keyword
# because the positional form is deprecated.
dt = df_scaled.dropna(axis=0)
# NOTE(review): the values are already min-max scaled at this point;
# dividing them by the raw world totals as well looks unintended --
# confirm whether this step should be kept.
dt = dt.divide(world.iloc[0], axis='columns')
# Skip the first 4 rows and split the rest alternately.
train = dt.iloc[4::2, :]
test = dt.iloc[5::2, :]

# Predict the last column from all the other columns.
regressor = KNeighborsRegressor(10)
regressor.fit(train.iloc[:, :-1], train.iloc[:, -1:])
regressor.score(test.iloc[:, :-1], test.iloc[:, -1:])

0.4339724262194802