<div class="alert alert-info">
    <div style="font-weight:bold">
        <h1>
            <a href="https://www.globalfirepower.com/">GFP</a>
            Scraper
        </h1>
    </div>
</div>

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import re

from functools import reduce

In [2]:
def get_countriesAttributes(url, indicator, class_="picTrans recordsetContainer boxShadow zoom", timeout=30):
    """
    Given an URL from https://www.globalfirepower.com/ on an indicator, retrieve the
    attributes (long name, short name and indicator value) of every country listed on it.

    # Params
    ---
    :param url: string
        The url of the indicator to scrape information from.
    :param indicator: string
        The name of the indicator; it becomes the name of the value column.
    :param class_: str
        The class name to find the <div> elements with the information (one per country).
    :param timeout: float
        Seconds to wait for the server before the request is aborted.

    # Returns
    ---
    pandas.DataFrame
        One row per matched element with the columns 'country_longName',
        'country_shortName' and `indicator`. The frame is empty (but still has these
        columns) when no element matches `class_`.

    # Raises
    ---
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not answer within `timeout` seconds.
    """

    # Get the html file with the information. A timeout prevents a stalled connection
    # from hanging the notebook, and the status check stops an error page from being
    # parsed as if it were an (empty) indicator table.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever parser happens
    # to be installed, and the parse tree may differ between environments.
    soup = BeautifulSoup(r.content, 'html.parser')

    # Get all the found elements i.e. countries/rows with information
    attributes = [get_attributes(el) for el in soup.find_all('div', class_=class_)]

    # Get a dataframe. The columns are fixed so that an empty result still carries the
    # country key columns that downstream merges rely on.
    countriesAttributes = (
        pd.DataFrame(attributes, columns=['country_longName', 'country_shortName', 'value'])
        .rename(columns={'value': indicator})
    )

    return countriesAttributes

def get_attributes(el, regex=r'[(\t)(\n)(\r)(\s)(km)(bbl)(\$)(,)]'):
    """
    For an element that contains the information of a variable for a given country,
    return a dictionary with the relevant information: the long and short name of the
    country, and the actual value.

    # Params
    ---
    :param el: class BeautifulSoup(bs4.element.Tag)
        The beautifulsoup element to manipulate.
    :param regex: str
        A regex whose matches are removed from the value text in order to clean it into
        pure numbers. The default is a single character class, so it removes every
        whitespace character, parenthesis, comma, dollar sign and each of the letters
        k, m, b and l individually.

    # Returns
    ---
    dict
        Keys 'country_longName', 'country_shortName' and 'value'. Each entry is the
        stripped text of the matching child <div>, or None when `el` has no such child.

    # Raises
    ---
    re.error
        If `regex` is not a valid regular expression.
    """
    attributes = {}
    attributes_request = [
        ('country_longName', 'div', 'longFormName'),
        ('country_shortName', 'div', 'shortFormName'),
        ('value', 'div', 'valueContainer')
    ]

    for name, tag, class_ in attributes_request:
        node = el.find(tag, class_=class_)

        # A missing child is the only expected failure; report it as None instead of
        # hiding every possible error behind a blanket `except`.
        if node is None:
            attributes[name] = None
            continue

        attribute = node.text.strip()

        if name == 'value':
            attribute = re.sub(regex, '', attribute)

        attributes[name] = attribute

    return attributes

In [3]:
# Global Fire Power dictionary: section name -> {indicator title -> indicator page URL}.
# Built one section at a time; the insertion order is the order the sections are scraped in.
gfp_dict = {}

gfp_dict['Manpower'] = {
    'Total population': 'https://www.globalfirepower.com/total-population-by-country.php',
    'Reaching Military Age': 'https://www.globalfirepower.com/manpower-reaching-military-age-annually.php',
    'Active Service': 'https://www.globalfirepower.com/active-military-manpower.php',
    'Active Reserves': 'https://www.globalfirepower.com/active-reserve-military-manpower.php',
    'Paramilitary': 'https://www.globalfirepower.com/manpower-paramilitary.php',
}

gfp_dict['Equipment'] = {
    # Airpower
    'Fighters/Interceptors': 'https://www.globalfirepower.com/aircraft-total-fighters.php',
    'Attack/Strike': 'https://www.globalfirepower.com/aircraft-total-attack-types.php',
    'Helicopter Fleets': 'https://www.globalfirepower.com/aircraft-helicopters-total.php',
    # Landpower
    'Armored Fighting Vehicles': 'https://www.globalfirepower.com/armor-apc-total.php',
    'Towed Artillery': 'https://www.globalfirepower.com/armor-towed-artillery-total.php',
    'Rocket Projectors': 'https://www.globalfirepower.com/armor-mlrs-total.php',
    # Navy
    'Submarines': 'https://www.globalfirepower.com/navy-submarines.php',
    'Frigates': 'https://www.globalfirepower.com/navy-frigates.php',
    'Corvettes': 'https://www.globalfirepower.com/navy-corvettes.php',
}

# Every indicator in this section is expressed in USD
gfp_dict['Finances'] = {
    'Defense Budgets': 'https://www.globalfirepower.com/defense-spending-budget.php',
    'External Debt': 'https://www.globalfirepower.com/external-debt-by-country.php',
    'Purchasing Power Parity': 'https://www.globalfirepower.com/purchasing-power-parity.php',
    'Reserves of Foreign Exchange & Gold': 'https://www.globalfirepower.com/reserves-of-foreign-exchange-and-gold.php',
}

gfp_dict['Logistics'] = {
    'Airports': 'https://www.globalfirepower.com/major-serviceable-airports-by-country.php',
    'Labor Force': 'https://www.globalfirepower.com/labor-force-by-country.php',
    'Merchant Marine Strength': 'https://www.globalfirepower.com/merchant-marine-strength-by-country.php',
    # Unit: km
    'Railway Coverage': 'https://www.globalfirepower.com/railway-coverage.php',
    # Unit: km
    'Roadway Coverage': 'https://www.globalfirepower.com/roadway-coverage.php',
}

gfp_dict['Natural Resources and Geography'] = {
    # Unit: barrels per day (bbl/day)
    'Oil Production': 'https://www.globalfirepower.com/oil-production-by-country.php',
    # Unit: barrels per day (bbl/day)
    'Oil Consumption': 'https://www.globalfirepower.com/oil-consumption-by-country.php',
    # Unit: km2
    'Square Land Area': 'https://www.globalfirepower.com/square-land-area.php',
}

## Get data

In [4]:
# Collect one dataframe per indicator, walking the catalogue section by section
dfs = []
for section, indicators in gfp_dict.items():
    print('* %s' % section.upper())

    for indicator_title in indicators:
        indicator_url = indicators[indicator_title]
        print('  - %s:' % indicator_title, indicator_url)

        countriesAttributes = get_countriesAttributes(indicator_url, indicator_title)
        dfs += [countriesAttributes]

    # Blank line to close the section's listing, then a 15-second pause
    # (executed after every section, including the last one)
    print()
    time.sleep(15)

* MANPOWER
  - Total population: https://www.globalfirepower.com/total-population-by-country.php
  - Reaching Military Age: https://www.globalfirepower.com/manpower-reaching-military-age-annually.php
  - Active Service: https://www.globalfirepower.com/active-military-manpower.php
  - Active Reserves: https://www.globalfirepower.com/active-reserve-military-manpower.php
  - Paramilitary: https://www.globalfirepower.com/manpower-paramilitary.php

* EQUIPMENT
  - Fighters/Interceptors: https://www.globalfirepower.com/aircraft-total-fighters.php
  - Attack/Strike: https://www.globalfirepower.com/aircraft-total-attack-types.php
  - Helicopter Fleets: https://www.globalfirepower.com/aircraft-helicopters-total.php
  - Armored Fighting Vehicles: https://www.globalfirepower.com/armor-apc-total.php
  - Towed Artillery: https://www.globalfirepower.com/armor-towed-artillery-total.php
  - Rocket Projectors: https://www.globalfirepower.com/armor-mlrs-total.php
  - Submarines: https://www.globalfirepo

In [5]:
%%time

# Rename some columns with units.
# The keys must be exactly the indicator titles used as column names (the keys of the
# sections of gfp_dict): DataFrame.rename silently ignores keys that match no column,
# so a misspelled key leaves its column without the unit suffix and raises no error.
columns_dict_byunits = {
    'Defense Budgets': 'Defense Budget (USD)',
    'External Debt': 'External Debt (USD)',
    'Purchasing Power Parity': 'Purchasing Power Parity (USD)',
    'Reserves of Foreign Exchange & Gold': 'Reserves of Foreign Exchange & Gold (USD)',
    'Railway Coverage': 'Railway Coverage (km)',
    'Roadway Coverage': 'Roadway Coverage (km)',
    'Oil Production': 'Oil Production (bbl)',
    'Oil Consumption': 'Oil Consumption (bbl)',
    'Square Land Area': 'Square Land Area (km2)',
    }

# Outer-join every per-indicator dataframe on the two country name columns, so a country
# missing from some indicator page is kept; the resulting gaps are filled with 0.
gfp_countries_indicators = (reduce(lambda df1, df2: df1.merge(df2, on=['country_longName', 'country_shortName'], how='outer'), dfs)
                            .fillna(0)
                            .rename(columns=columns_dict_byunits)
                           )
gfp_countries_indicators

CPU times: user 39.6 ms, sys: 132 μs, total: 39.7 ms
Wall time: 38.2 ms


Unnamed: 0,country_longName,country_shortName,Total population,Reaching Military Age,Active Service,Active Reserves,Paramilitary,Fighters/Interceptors,Attack/Strike,Helicopter Fleets,...,Purchasing Power Parity (USD),Reserves of Foreign Exchange & Gold (USD),Airports,Labor Force,Merchant Marine Strength,Railway Coverage (km),Roadway Coverage (km),Oil Production (bbl),Oil Consumption (bbl),Square Land Area (km2)
0,Afghanistan,AFG,39232003,823872,0,0,80000,0,0,11,...,60803000000,8500000000,46,9390000,0,0,34903,0,25000,652230
1,Albania,ALB,3101621,62032,6600,2000,500,0,0,19,...,40822000000,5635000000,3,1378000,69,424,3945,16100,26500,28748
2,Algeria,ALG,44758398,716134,325000,135000,150000,102,42,298,...,487716000000,56211000000,149,12312000,119,4020,104000,1415000,450500,2381740
3,Angola,ANG,35981281,359813,107000,0,10000,57,26,116,...,203868000000,14468000000,102,14462000,58,2761,26000,1200000,135000,1246700
4,Argentina,ARG,46621847,699328,108000,0,20000,24,10,90,...,986134000000,39653000000,916,21206000,203,17866,281290,700000,700000,2780400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,Venezuela,VEN,30518260,610365,109000,8000,220000,40,0,75,...,269068000000,10397000000,444,10245000,272,447,96189,605000,470500,912050
141,Vietnam,VET,104799174,1781586,600000,5000000,250000,41,33,99,...,1036000000000,109371000000,45,56203000,1975,2600,195468,200000,500000,331210
142,Yemen,YEM,31585062,600126,66700,0,20000,53,23,61,...,73630000000,595723500,57,7299000,30,0,71300,70200,76000,527968
143,Zambia,ZAM,20216029,363889,15150,0,1200,8,0,28,...,63030000000,2754000000,88,8113000,2,3126,67671,0,25500,752618


In [6]:
# Save the dataframe to a CSV file (relative path), leaving out the row index
gfp_countries_indicators.to_csv('gfp_countries_indicators.csv', index=False)