## Web Scraping 1: BeautifulSoup
[BeautifulSoup documentation](http://www.crummy.com/software/BeautifulSoup/bs4/doc/)

I'll start Project Luther by webscraping data from worldbank.org. The data consists of financial and societal welfare data by country and year (roughly 10 year intervals). 

In [2]:
from __future__ import print_function, division

In [3]:
########################
###### TIDY :) #########
########################

import requests
from bs4 import BeautifulSoup

def scrape_three_letter():
    """
    Returns a list of three letter country codes from the hardcoded website.
    url is hardcoded because the scraping is specific to the site.
    <h4 class="margin-clear">ALB</h4>
    
    scrape_three_letter() ==> ['AFG','ALA','ALB','DZA',]
    """
    url = 'https://laendercode.net/en/3-letter-list.html'
    
    try:
        response = requests.get(url)
        assert response.status_code == 200
    except:
        print(f'response code: {response.status_code}')  
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    country_abv = [el.text for el in soup.find_all(class_='margin-clear')]
    return(country_abv)

tlc_list = scrape_three_letter()

In [4]:
tlc_list

['AFG',
 'ALA',
 'ALB',
 'DZA',
 'ASM',
 'AND',
 'AGO',
 'AIA',
 'ATA',
 'ATG',
 'ARG',
 'ARM',
 'ABW',
 'AUS',
 'AUT',
 'AZE',
 'BHS',
 'BHR',
 'BGD',
 'BRB',
 'BLR',
 'BEL',
 'BLZ',
 'BEN',
 'BMU',
 'BTN',
 'BOL',
 'BES',
 'BIH',
 'BWA',
 'BVT',
 'BRA',
 'IOT',
 'VGB',
 'BRN',
 'BGR',
 'BFA',
 'BDI',
 'KHM',
 'CMR',
 'CAN',
 'CPV',
 'CYM',
 'CAF',
 'TCD',
 'CHL',
 'CHN',
 'CXR',
 'CCK',
 'COL',
 'COM',
 'COK',
 'CRI',
 'HRV',
 'CUB',
 'CUW',
 'CYP',
 'CZE',
 'COD',
 'DNK',
 'DJI',
 'DMA',
 'DOM',
 'TLS',
 'ECU',
 'EGY',
 'SLV',
 'GNQ',
 'ERI',
 'EST',
 'ETH',
 'FLK',
 'FRO',
 'FJI',
 'FIN',
 'FRA',
 'GUF',
 'PYF',
 'ATF',
 'GAB',
 'GMB',
 'GEO',
 'DEU',
 'GHA',
 'GIB',
 'GRC',
 'GRL',
 'GRD',
 'GLP',
 'GUM',
 'GTM',
 'GGY',
 'GIN',
 'GNB',
 'GUY',
 'HTI',
 'HMD',
 'HND',
 'HKG',
 'HUN',
 'ISL',
 'IND',
 'IDN',
 'IRN',
 'IRQ',
 'IRL',
 'IMN',
 'ISR',
 'ITA',
 'CIV',
 'JAM',
 'JPN',
 'JEY',
 'JOR',
 'KAZ',
 'KEN',
 'KIR',
 'XKX',
 'KWT',
 'KGZ',
 'LAO',
 'LVA',
 'LBN',
 'LSO',
 'LBR',


In [5]:
import pandas as pd
import numpy as np
#second website with country data, needs 3-letter-code added to url

########################
###### TIDY :) #########
########################


a_country = tlc_list[0]

def process_country(a_country):
    """
    process_country(a_country) ==> DataFrame (df.shape() ==> 4 rows x 59 columns)
    Scrapes worldbank.org website for country related data starting with an input three letter
    country code as string. The website has 4 years of data for a country, all data is retreived. 
    This returns a pandas dataframe with 4 rows. 
    """
    #make beautifulsoup object from web data for a_country
    url_1 = f'http://databank.worldbank.org/data/views/reports/reportwidget.aspx?Report_Name=CountryProfile&Id=b450fd57&tbar=y&dd=y&inf=n&zm=n&country='
    url_2 = url_1 + a_country
    response = requests.get(url_2)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    
    #retreive year info from headers
    header_table=soup.find(id="table-ddpreport-header")
    years = [x.text for x in header_table.find_all(class_="spacer2")]    
    
    #retreive data
    data_table = soup.find(id="table-ddpreport-data")        
    rows = data_table.find_all(True, {"class":["rowdata level-0 ", "rowdata level-0 alternate"]})
    
    #Construct dataframe object COUTNRY YEAR F1 F2 F3...
    #I need to double check that this assembles the table correctly (compare to web data)
    df = pd.DataFrame()
    for row in rows: 
        row_data = [x.text for x in row.find_all('td')]
        df[row_data[0]] = row_data[1:]
    df.insert(0,'COUNTRY',a_country)
    df.insert(1,'YEAR',years)
    return df

#simple test for function above
process_country(a_country)



Unnamed: 0,COUNTRY,YEAR,"Population, total (millions)",Population growth (annual %),Surface area (sq. km) (thousands),Population density (people per sq. km of land area),Poverty headcount ratio at national poverty lines (% of population),Poverty headcount ratio at $1.90 a day (2011 PPP) (% of population),"GNI, Atlas method (current US$) (billions)","GNI per capita, Atlas method (current US$)",...,High-technology exports (% of manufactured exports),Statistical Capacity score (Overall average),Merchandise trade (% of GDP),Net barter terms of trade index (2000 = 100),"External debt stocks, total (DOD, current US$) (millions)","Total debt service (% of exports of goods, services and primary income)",Net migration (thousands),"Personal remittances, received (current US$) (millions)","Foreign direct investment, net inflows (BoP, current US$) (millions)",Net official development assistance received (current US$) (millions)
0,AFG,1990,12.25,3.9,652.9,18.8,..,..,..,..,...,..,..,..,..,0,..,2327,..,0,121.7
1,AFG,2000,20.09,3.5,652.9,30.8,..,..,..,..,...,..,..,72,100,0,..,929,..,0,136.0
2,AFG,2010,28.8,2.8,652.9,44.1,35.8,..,14.43,500,...,..,52,35,145,2425,0.4,448,362,191,6470.4
3,AFG,2017,35.53,2.5,652.9,54.4,..,..,20.18,570,...,..,51,41,161,2404,3.5,-300,378,53,4064.3


In [7]:
########################
###### TIDY :) #########
########################

#Running this cell can take up to 15 min. to scrape!!!!

def process_world(list_three_letter_abr):
    """
    uses webscraper helper function process_country() to create individual dataframes 
    which are appended into a large dataframe. 
    
    process_world(list_three_letter_abr) ==> df
    """
    total=pd.DataFrame() 

    for x in list_three_letter_abr : #remove slicing to generate whole list
        total = total.append(process_country(x))
    
    #after generating the whole set, relabel the indexes    
    total.index = range(total.shape[0])
    return total

#short test for function above
#pw1 = process_world(tlc_list[0:2])
#pw1

#for all countries use
pw1 = process_world(tlc_list)

In [9]:
#mapping dictionary for renaming columns
nick = {'COUNTRY': 'CNT',
'YEAR': 'YR',              
'Population, total (millions)': 'POP_TOT' ,        
'Population growth (annual %)': 'POP_GRO%',        
'Surface area (sq. km) (thousands)': 'AREA',            
'Population density (people per sq. km of land area)': 'POP_DEN',         
'Poverty headcount ratio at national poverty lines (% of population)': 'POV%',            
'Poverty headcount ratio at $1.90 a day (2011 PPP) (% of population)': 'POV_190%', 
'GNI, Atlas method (current US$) (billions)': 'GNI_ATL',
'GNI per capita, Atlas method (current US$)': 'GPC_ATL',
'GNI, PPP (current international $) (billions)': 'GNI_INT',
'GNI per capita, PPP (current international $)': 'GNI_PPP',
'Income share held by lowest 20%': 'INC_LOW20', 
'Life expectancy at birth, total (years)': 'LIF_EXP',
'Fertility rate, total (births per woman)': 'FER_RATE',
'Adolescent fertility rate (births per 1,000 women ages 15-19)': 'TEEN_FER_RATE',
'Contraceptive prevalence, any methods (% of women ages 15-49)': 'CONTR%',
'Births attended by skilled health staff (% of total)': 'HOSP_BIRTH%',
'Mortality rate, under-5 (per 1,000 live births)': 'MORT_RATE',
'Prevalence of underweight, weight for age (% of children under 5)'	: 'UNDER_WGT_BIRTH', 
'Immunization, measles (% of children ages 12-23 months)': 'IMM_MEAS%',
'Primary completion rate, total (% of relevant age group)': 'ELEM_GRAD%',
'School enrollment, primary (% gross)': 'SCH_ENR_PR%',
'School enrollment, secondary (% gross)': 'SCH_ENR_SEC%',
'School enrollment, primary and secondary (gross), gender parity index (GPI)': 'SCH_ENR_PR&SEC',
'Prevalence of HIV, total (% of population ages 15-49)': 'HIV%',
'Forest area (sq. km) (thousands)': 'FORR_AREA',
'Terrestrial and marine protected areas (% of total territorial area)': 'PROT_AREA%',
'Annual freshwater withdrawals, total (% of internal resources)': 'FRESH_WITH%',
'Urban population growth (annual %)': 'URB_GRO%',
'Energy use (kg of oil equivalent per capita)': 'ENRG_PERCAP',
'CO2 emissions (metric tons per capita)': 'CO2_PERCAP',
'Electric power consumption (kWh per capita)': 'ELEC_PERCAP',
'GDP (current US$) (billions)': 'GDP',
'GDP growth (annual %)': 'GDP_GROW%',
'Inflation, GDP deflator (annual %)': 'INF%',
'Agriculture, forestry, and fishing, value added (% of GDP)': 'AG_FOR_FISH%GDP',
'Industry (including construction), value added (% of GDP)': 'IND%GDP',
'Exports of goods and services (% of GDP)': 'EXP%GDP',
'Imports of goods and services (% of GDP)': 'IMP%GDP',
'Gross capital formation (% of GDP)': 'GCF%GDP',
'Revenue, excluding grants (% of GDP)': 'REV_NO_GRANT%GDP',
'Net lending (+) / net borrowing (-) (% of GDP)': 'LOANS%GDP',
'Time required to start a business (days)': 'TIME_START_BUS',
'Domestic credit provided by financial sector (% of GDP)': 'DOM_CRED%GDP',
'Tax revenue (% of GDP)': 'TAX_REV%GDP',
'Military expenditure (% of GDP)': 'MILIT%GDP',
'Mobile cellular subscriptions (per 100 people)': 'CELL_SUB%',
'Individuals using the Internet (% of population)': 'INTERNET%',
'High-technology exports (% of manufactured exports)': 'HIGH_TECH_EXP%',
'Statistical Capacity score (Overall average)': 'STAT_CAP',
'Merchandise trade (% of GDP)': 'MERCH%GDP',
'Net barter terms of trade index (2000 = 100)': 'NET_BART',
'External debt stocks, total (DOD, current US$) (millions)'	: 'EXT_DEBT_STOCK',
'Total debt service (% of exports of goods, services and primary income)': 'TOT_SEBT_SERV%',
'Net migration (thousands)': 'NET_MIG',
'Personal remittances, received (current US$) (millions)': 'REMIT',
'Foreign direct investment, net inflows (BoP, current US$) (millions)': 'FOR_INV_NET',
'Net official development assistance received (current US$) (millions):': 'OFF_DEV_ASS_NET'}

In [798]:
#CLEAN COMMAS and ".." in dataframe

def remove_commas(df):
    """
    removes commas from numbers with commas in a df.
    remove_commas(df) ==> df
    """
    df = df.applymap(lambda x: x.replace(',', ''))
    return df
    
remove_commas(pw1)    

def clean_df(df):
    """
    data entries include commas in numbers (need to remove), and '..' for data which is NA 
    (should replace with NaN). Also change number data to float and change column headers 
    to shortened nicknames. 

    clean_df(df) ==> df
    """
    

NameError: name 'pw1' is not defined

In [484]:
#total = total.reset_index()
#total.index = range(total.shape[0])

In [494]:
  #<==returns content of column 1

#total.iloc[:, [1,5]]  #<==returns Series version of column 1

nick = {'CNT'        :'COUNTRY',
 'YR'                : 'YEAR',
 'POP_TOT'           : 'Population, total (millions)',
 'POP_GRO%'          : 'Population growth (annual %)',
 'AREA'              : 'Surface area (sq. km) (thousands)',
 'POP_DEN'           :'Population density (people per sq. km of land area)',
 'POV%'              : 'Poverty headcount ratio at national poverty lines (% of population)',
 'POV_190%' : 'Poverty headcount ratio at $1.90 a day (2011 PPP) (% of population)',
 'GNI_ATL' : 'GNI, Atlas method (current US$) (billions)',
 'GPC_ATL' : 'GNI per capita, Atlas method (current US$)',
 'GNI_INT' : 'GNI, PPP (current international $) (billions)',
 'GNI_PPP' : 'GNI per capita, PPP (current international $)',
 'INC_LOW20' : 'Income share held by lowest 20%',
 'LIF_EXP' : 'Life expectancy at birth, total (years)',
 'FER_RATE' : 'Fertility rate, total (births per woman)',
 'TEEN_FER_RATE' : 'Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'CONTR%' : 'Contraceptive prevalence, any methods (% of women ages 15-49)',
 'HOSP_BIRTH%' : 'Births attended by skilled health staff (% of total)',
 'MORT_RATE' : 'Mortality rate, under-5 (per 1,000 live births)',
 'UNDER_WGT_BIRTH' : 'Prevalence of underweight, weight for age (% of children under 5)',
 'IMM_MEAS%' : 'Immunization, measles (% of children ages 12-23 months)',
 'ELEM_GRAD%' : 'Primary completion rate, total (% of relevant age group)',
 'SCH_ENR_PR%' : 'School enrollment, primary (% gross)',
 'SCH_ENR_SEC%' : 'School enrollment, secondary (% gross)',
 'SCH_ENR_PR&SEC' : 'School enrollment, primary and secondary (gross), gender parity index (GPI)',
 'HIV%' : 'Prevalence of HIV, total (% of population ages 15-49)',
 'FORR_AREA' : 'Forest area (sq. km) (thousands)',
 'PROT_AREA%': 'Terrestrial and marine protected areas (% of total territorial area)',
 'FRESH_WITH%' :'Annual freshwater withdrawals, total (% of internal resources)',
 'URB_GRO%':'Urban population growth (annual %)',
 'ENRG_PERCAP':'Energy use (kg of oil equivalent per capita)',
 'CO2_PERCAP':'CO2 emissions (metric tons per capita)',
 'ELEC_PERCAP':'Electric power consumption (kWh per capita)',
 'GDP':'GDP (current US$) (billions)',
 'GDP_GROW%':'GDP growth (annual %)',
 'INF%':'Inflation, GDP deflator (annual %)',
 'AG_FOR_FISH%GDP':'Agriculture, forestry, and fishing, value added (% of GDP)',
 'IND%GDP':'Industry (including construction), value added (% of GDP)',
 'EXP%GDP':'Exports of goods and services (% of GDP)',
 'IMP%GDP':'Imports of goods and services (% of GDP)',
 'GCF%GDP':'Gross capital formation (% of GDP)',
 'REV_NO_GRANT%GDP':'Revenue, excluding grants (% of GDP)',
 'LOANS%GDP':'Net lending (+) / net borrowing (-) (% of GDP)',
 'TIME_START_BUS':'Time required to start a business (days)',
 'DOM_CRED%GDP':'Domestic credit provided by financial sector (% of GDP)',
 'TAX_REV%GDP':'Tax revenue (% of GDP)',
 'MILIT%GDP':'Military expenditure (% of GDP)',
 'CELL_SUB%':'Mobile cellular subscriptions (per 100 people)',
 'INTERNET%':'Individuals using the Internet (% of population)',
 'HIGH_TECH_EXP%':'High-technology exports (% of manufactured exports)',
 'STAT_CAP':'Statistical Capacity score (Overall average)',
 'MERCH%GDP':'Merchandise trade (% of GDP)',
 'NET_BART':'Net barter terms of trade index (2000 = 100)',
 'EXT_DEBT_STOCK':'External debt stocks, total (DOD, current US$) (millions)',
 'TOT_SEBT_SERV%':'Total debt service (% of exports of goods, services and primary income)',
 'NET_MIG':'Net migration (thousands)',
 'REMIT':'Personal remittances, received (current US$) (millions)',
 'FOR_INV_NET':'Foreign direct investment, net inflows (BoP, current US$) (millions)',
 'OFF_DEV_ASS_NET':'Net official development assistance received (current US$) (millions)'}

total#.loc[:,[nick['CNT'], nick['YR'],nick['ELEC_PERCAP'],nick['MILIT%GDP'], nick['TAX_REV%GDP'], nick['GDP'],nick['POV_190%']]]

Unnamed: 0,COUNTRY,YEAR,"Population, total (millions)",Population growth (annual %),Surface area (sq. km) (thousands),Population density (people per sq. km of land area),Poverty headcount ratio at national poverty lines (% of population),Poverty headcount ratio at $1.90 a day (2011 PPP) (% of population),"GNI, Atlas method (current US$) (billions)","GNI per capita, Atlas method (current US$)",...,High-technology exports (% of manufactured exports),Statistical Capacity score (Overall average),Merchandise trade (% of GDP),Net barter terms of trade index (2000 = 100),"External debt stocks, total (DOD, current US$) (millions)","Total debt service (% of exports of goods, services and primary income)",Net migration (thousands),"Personal remittances, received (current US$) (millions)","Foreign direct investment, net inflows (BoP, current US$) (millions)",Net official development assistance received (current US$) (millions)
0,AFG,1990,12.25,3.9,652.9,18.8,..,..,..,..,...,..,..,..,..,0,..,2327,..,0,121.7
1,AFG,2000,20.09,3.5,652.9,30.8,..,..,..,..,...,..,..,72,100,0,..,929,..,0,136.0
2,AFG,2010,28.80,2.8,652.9,44.1,35.8,..,14.43,500,...,..,52,35,145,2425,0.4,448,362,191,6470.4
3,AFG,2017,35.53,2.5,652.9,54.4,..,..,20.18,570,...,..,51,41,161,2404,3.5,-300,378,53,4064.3
4,ALA,1990,5288.10,1.7,134042.4,40.8,..,35.9,22236.07,4205,...,18,..,31,..,..,..,0,68441,196315,58242.2
5,ALA,2000,6121.68,1.3,134112.3,47.2,..,28.6,33517.78,5475,...,24,..,39,..,..,..,0,121611,1460994,49803.6
6,ALA,2010,6932.87,1.2,134311.4,53.4,..,15.7,65033.46,9380,...,17,..,47,..,..,..,0,417811,1860424,130685.4
7,ALA,2017,7530.36,1.2,134325.1,58.0,..,10.0,78060.79,10366,...,18,..,52,..,..,..,0,573286,1862732,157676.2
8,ALB,1990,3.29,1.8,28.8,119.9,..,..,2.22,680,...,..,..,29,..,0,4.3,-443,152,20,11.1
9,ALB,2000,3.09,-0.6,28.8,112.7,25.4,2.0,3.60,1170,...,1,..,37,100,1122,4.5,-177,598,143,317.9


In [512]:
import pickle
tot_2017 = total[total['YEAR']=='2017']
tot_2017.to_pickle("./tot_2017.pkl")

In [608]:
#total[total.COUNTRY=='JPN'] #this works
total[total.columns[total.columns.str.contains(r'pop|HIV')]]

def get_col(reg_ex, df):
    return df[df.columns[df.columns.str.contains(reg_ex)]]
    
import re
#search_terms=re.compile(r'YEAR|COUN|meas|dens|grow|Contr', re.IGNORECASE)
search_terms=re.compile(r'COUNTRY|YEAR|Population growth|density', re.IGNORECASE)
little_table = get_col(search_terms,total)
little_table[little_table['YEAR']=='2017']

Unnamed: 0,COUNTRY,YEAR,Population growth (annual %),Population density (people per sq. km of land area),"Life expectancy at birth, total (years)",Urban population growth (annual %)
3,AFG,2017,2.5,54.4,64,3.4
7,ALA,2017,1.2,58.0,72,2.0
11,ALB,2017,-0.1,104.9,78,1.5
15,DZA,2017,1.7,17.3,76,2.6
19,ASM,2017,0.1,278.2,..,0.0
23,AND,2017,-0.4,163.8,..,-0.5
27,AGO,2017,3.3,23.9,62,4.4
31,AIA,2017,1.2,58.0,72,2.0
35,ATA,2017,1.2,58.0,72,2.0
39,ATG,2017,1.0,231.8,76,0.5


In [719]:
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
figure(num=None, figsize=(8, 8), dpi=80, facecolor='w', edgecolor='g')

tbl = get_col(r'Population growth|density|COUNTRY',total)
tbl
#x = list(tbl.iloc[:,1].values)
#y = list(tbl.iloc[:,2].values)
#plt.axis([0, 10, 0, 80])
#plt.scatter(x[0:100],y[0:100])
#plt.show()


Unnamed: 0,COUNTRY,Population growth (annual %),Population density (people per sq. km of land area)
0,AFG,3.9,18.8
1,AFG,3.5,30.8
2,AFG,2.8,44.1
3,AFG,2.5,54.4
4,ALA,1.7,40.8
5,ALA,1.3,47.2
6,ALA,1.2,53.4
7,ALA,1.2,58.0
8,ALB,1.8,119.9
9,ALB,-0.6,112.7


<Figure size 640x640 with 0 Axes>

In [709]:
# tbl = tbl[~tbl.iloc[:, 0].str.contains(r'\.\.')].copy()
# tbl = tbl[~tbl.iloc[:, 1].str.contains(r'\.\.')].copy()
tbl = tbl.applymap(lambda x: x.replace(',', ''))

In [710]:
tbl[tbl.iloc[:, 0].str.contains(r'\.\.')].copy()

Unnamed: 0,COUNTRY,Population growth (annual %),Population density (people per sq. km of land area)


In [712]:
tbl.iloc[:, 1] = pd.to_numeric(tbl.iloc[:, 1], errors='coerce')
tbl.iloc[:, 2] = pd.to_numeric(tbl.iloc[:, 2], errors='coerce')

In [720]:
tbl.dropna(inplace=True)
tbl

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,COUNTRY,Population growth (annual %),Population density (people per sq. km of land area)
0,AFG,3.9,18.8
1,AFG,3.5,30.8
2,AFG,2.8,44.1
3,AFG,2.5,54.4
4,ALA,1.7,40.8
5,ALA,1.3,47.2
6,ALA,1.2,53.4
7,ALA,1.2,58.0
8,ALB,1.8,119.9
9,ALB,-0.6,112.7


In [756]:
s = tbl[tbl.columns[1]]
pd.to_numeric?

In [None]:
tbl

In [767]:
tbl.iloc[:, [1, 2]]

Unnamed: 0,Population growth (annual %),Population density (people per sq. km of land area)
0,3.9,18.8
1,3.5,30.8
2,2.8,44.1
3,2.5,54.4
4,1.7,40.8
5,1.3,47.2
6,1.2,53.4
7,1.2,58.0
8,1.8,119.9
9,-0.6,112.7


In [785]:
numeric = tbl.iloc[:, [1, 2]]
numeric = numeric.applymap(lambda x: x.replace('..', 'NaN'))
numeric = numeric.applymap(lambda x: x.replace(',', ''))
numeric.astype(float)
#tbl.iloc[:, [1, 2]] = numeric


Unnamed: 0,COUNTRY,Population growth (annual %),Population density (people per sq. km of land area)
444,JPN,0.3,338.8
445,JPN,0.2,348.0
446,JPN,0.0,351.3
447,JPN,-0.2,347.8


In [759]:
tbl.astype(float, errors='ignore')

COUNTRY                                                object
Population growth (annual %)                           object
Population density (people per sq. km of land area)    object
dtype: object

In [749]:

#fig, ax = plt.subplots()
#tbl.plot(x=tbl.columns[1], y=tbl.columns[2], kind='scatter', ax=ax)
tbl.apply
tbl.apply(lambda x: x, tbl.columns[1], axis=1)
#tbl.info
#for k, v in tbl.iterrows():
    #ax.annotate(k, v)
#    print(k,v)

TypeError: apply() got multiple values for argument 'axis'

In [657]:
tbl#.plot(x=0,y=1)
#plt.plot(get_col(r'Population growth|density',total))

Unnamed: 0,Population growth (annual %),Population density (people per sq. km of land area)
0,3.9,18.8
1,3.5,30.8
2,2.8,44.1
3,2.5,54.4
4,1.7,40.8
5,1.3,47.2
6,1.2,53.4
7,1.2,58.0
8,1.8,119.9
9,-0.6,112.7


In [638]:
s = total['COUNTRY']
s.isin(['JPN', 'USA'])

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
978     False
979     False
980     False
981     False
982     False
983     False
984     False
985     False
986     False
987     False
988     False
989     False
990     False
991     False
992     False
993     False
994     False
995     False
996     False
997     False
998     False
999     False
1000    False
1001    False
1002    False
1003    False
1004    False
1005    False
1006    False
1007    False
Name: COUNTRY, Length: 1008, dtype: bool

In [572]:
total[s.str.contains(r'JP')]

Unnamed: 0,COUNTRY,YEAR,"Population, total (millions)",Population growth (annual %),Surface area (sq. km) (thousands),Population density (people per sq. km of land area),Poverty headcount ratio at national poverty lines (% of population),Poverty headcount ratio at $1.90 a day (2011 PPP) (% of population),"GNI, Atlas method (current US$) (billions)","GNI per capita, Atlas method (current US$)",...,High-technology exports (% of manufactured exports),Statistical Capacity score (Overall average),Merchandise trade (% of GDP),Net barter terms of trade index (2000 = 100),"External debt stocks, total (DOD, current US$) (millions)","Total debt service (% of exports of goods, services and primary income)",Net migration (thousands),"Personal remittances, received (current US$) (millions)","Foreign direct investment, net inflows (BoP, current US$) (millions)",Net official development assistance received (current US$) (millions)
444,JPN,1990,123.54,0.3,377.8,338.8,..,..,3440.78,27850,...,24,..,17,..,..,..,46,508,1777,..
445,JPN,2000,126.84,0.2,377.8,348.0,..,..,4595.16,36230,...,29,..,18,100,..,..,164,773,10688,..
446,JPN,2010,128.07,0.0,378.0,351.3,..,0.2,5562.86,43440,...,18,..,26,87,..,..,358,1684,7441,..
447,JPN,2017,126.79,-0.2,378.0,347.8,..,..,4888.12,38550,...,16,..,28,95,..,..,250,4440,18838,..


In [566]:
total.query('COUNTRY == "USA" or COUNTRY == "JPN"')

Unnamed: 0,COUNTRY,YEAR,"Population, total (millions)",Population growth (annual %),Surface area (sq. km) (thousands),Population density (people per sq. km of land area),Poverty headcount ratio at national poverty lines (% of population),Poverty headcount ratio at $1.90 a day (2011 PPP) (% of population),"GNI, Atlas method (current US$) (billions)","GNI per capita, Atlas method (current US$)",...,High-technology exports (% of manufactured exports),Statistical Capacity score (Overall average),Merchandise trade (% of GDP),Net barter terms of trade index (2000 = 100),"External debt stocks, total (DOD, current US$) (millions)","Total debt service (% of exports of goods, services and primary income)",Net migration (thousands),"Personal remittances, received (current US$) (millions)","Foreign direct investment, net inflows (BoP, current US$) (millions)",Net official development assistance received (current US$) (millions)
444,JPN,1990,123.54,0.3,377.8,338.8,..,..,3440.78,27850,...,24,..,17,..,..,..,46,508,1777,..
445,JPN,2000,126.84,0.2,377.8,348.0,..,..,4595.16,36230,...,29,..,18,100,..,..,164,773,10688,..
446,JPN,2010,128.07,0.0,378.0,351.3,..,0.2,5562.86,43440,...,18,..,26,87,..,..,358,1684,7441,..
447,JPN,2017,126.79,-0.2,378.0,347.8,..,..,4888.12,38550,...,16,..,28,95,..,..,250,4440,18838,..
956,USA,1990,249.62,1.1,9629.1,27.3,..,0.5,6029.53,24150,...,33,..,15,103,..,..,4517,1170,48490,..
957,USA,2000,282.16,1.1,9632.0,30.8,..,0.7,10178.5,36070,...,34,..,20,100,..,..,5207,4395,350066,..
958,USA,2010,309.34,0.8,9831.5,33.8,..,1.0,15143.14,48950,...,20,..,22,97,..,..,4500,5930,259344,..
959,USA,2017,325.72,0.7,9831.5,35.6,..,1.2,18980.26,58270,...,20,..,20,100,..,..,4500,6621,348674,..


In [562]:
(total['COUNTRY'] == 'JPN') | (total['COUNTRY'] == 'USA')

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
978     False
979     False
980     False
981     False
982     False
983     False
984     False
985     False
986     False
987     False
988     False
989     False
990     False
991     False
992     False
993     False
994     False
995     False
996     False
997     False
998     False
999     False
1000    False
1001    False
1002    False
1003    False
1004    False
1005    False
1006    False
1007    False
Name: COUNTRY, Length: 1008, dtype: bool

In [558]:
mask = any((
            total['COUNTRY']=='JPN',
            total['COUNTRY']=='USA',
            ))
# total[mask]
mask
       
#total[mask] #or total.filter(like='Population')

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [515]:

unpickled_df = pd.read_pickle("./tot_2017.pkl")
unpickled_df[]

Unnamed: 0,COUNTRY,YEAR,"Population, total (millions)",Population growth (annual %),Surface area (sq. km) (thousands),Population density (people per sq. km of land area),Poverty headcount ratio at national poverty lines (% of population),Poverty headcount ratio at $1.90 a day (2011 PPP) (% of population),"GNI, Atlas method (current US$) (billions)","GNI per capita, Atlas method (current US$)",...,High-technology exports (% of manufactured exports),Statistical Capacity score (Overall average),Merchandise trade (% of GDP),Net barter terms of trade index (2000 = 100),"External debt stocks, total (DOD, current US$) (millions)","Total debt service (% of exports of goods, services and primary income)",Net migration (thousands),"Personal remittances, received (current US$) (millions)","Foreign direct investment, net inflows (BoP, current US$) (millions)",Net official development assistance received (current US$) (millions)
3,AFG,2017,35.53,2.5,652.9,54.4,..,..,20.18,570,...,..,51,41,161,2404,3.5,-300,378,53,4064.3
7,ALA,2017,7530.36,1.2,134325.1,58.0,..,10.0,78060.79,10366,...,18,..,52,..,..,..,0,573286,1862732,157676.2
11,ALB,2017,2.87,-0.1,28.8,104.9,..,..,12.42,4320,...,1,89,58,93,8437,15.2,-40,1311,1033,168.5
15,DZA,2017,41.32,1.7,2381.7,17.3,..,..,163.52,3960,...,0,61,48,84,5466,1.0,-50,2093,1637,157.4
19,ASM,2017,0.06,0.1,0.2,278.2,..,..,..,..,...,..,..,108,170,..,..,..,..,..,..
23,AND,2017,0.08,-0.4,0.5,163.8,..,..,..,..,...,19,..,..,104,..,..,..,..,..,..
27,AGO,2017,29.78,3.3,1246.7,23.9,..,..,99.11,3330,...,..,46,42,132,35365,26.5,0,4,4104,206.8
31,AIA,2017,7530.36,1.2,134325.1,58.0,..,10.0,78060.79,10366,...,18,..,52,..,..,..,0,573286,1862732,157676.2
35,ATA,2017,7530.36,1.2,134325.1,58.0,..,10.0,78060.79,10366,...,18,..,52,..,..,..,0,573286,1862732,157676.2
39,ATG,2017,0.10,1.0,0.4,231.8,..,..,1.45,14170,...,..,54,43,58,..,..,0,31,49,0.1
