In [37]:
import pandas as pd
import altair as alt

In [2]:
def draw_lines(df, y_column, color_column='Entity'):
    df['Year'] = pd.to_datetime(df['Year'], format='%Y')

    return alt.Chart(df).mark_line(
        point=alt.OverlayMarkDef(color='black',size=60)
    ).encode(
        x='Year',
        y=y_column,
        color=color_column,
        tooltip='Year'
    ).interactive().properties(
        width=600,
        height=400
    )

def extract_relevant_countries(df: pd.DataFrame, relevant_countries: list) -> pd.DataFrame:
    return df[df['Entity'].isin(relevant_countries)]

## TODO: enhance tooltips, fix dot charts

In [3]:
rev_df = pd.read_csv('data/government-revenues-national-income.csv')
rev_df.head()

Unnamed: 0,Entity,Code,Year,National Gov Revenues (Wallis (2000)),Local Gov Revenues (Wallis (2000)),State Gov Revenues (Wallis (2000))
0,United States,USA,1902,3.0,4.0,0.8
1,United States,USA,1913,2.4,4.2,0.9
2,United States,USA,1922,5.8,5.2,1.7
3,United States,USA,1927,4.7,6.0,2.1
4,United States,USA,1934,6.0,7.6,3.8


In [4]:
rev_df = rev_df.rename(columns={'Local Gov Revenues (Wallis (2000))': 'Local', 'State Gov Revenues (Wallis (2000))': 'State', 'National Gov Revenues (Wallis (2000))': 'National'})
rev_df['Year'] = pd.to_datetime(rev_df['Year'], format='%Y')

alt.Chart(rev_df).mark_area().encode(
    x='Year',
    y=alt.Y('Revenue:Q', stack=True),
    color='Level:N'
).transform_fold(
    ['National', 'Local', 'State'],
    as_=['Level', 'Revenue']
)

In [5]:
vat_df = pd.read_csv('data/number-of-countries-having-implemented-a-vat.csv')
vat_df.head()

Unnamed: 0,Entity,Code,Year,Countries with VAT (OECD (2016))
0,Countries with VAT,,1960,1
1,Countries with VAT,,1964,3
2,Countries with VAT,,1967,4
3,Countries with VAT,,1968,7
4,Countries with VAT,,1969,9


In [6]:
vat_df['Year'] = pd.to_datetime(vat_df['Year'], format='%Y')

alt.Chart(vat_df).mark_line(
    point=alt.OverlayMarkDef(color='black',size=60)
).encode(
    x='Year',
    y='Countries with VAT (OECD (2016))',
    tooltip=['Year']
).properties(
    width=1000,
    height=500,
).interactive()

In [7]:
df = pd.read_csv('data/relative-weight-of-two-forms-of-consumption-taxation.csv')
df.head()

Unnamed: 0,Entity,Code,Year,Taxes on specific goods and services (OECD (2016)),Value Added Taxes (OECD (2016))
0,OECD average,,1965,24.28,2.218
1,OECD average,,1968,21.468,3.518
2,OECD average,,1969,20.925,4.467
3,OECD average,,1970,20.668,6.356
4,OECD average,,1971,19.01,7.933


In [8]:
df['Year'] = pd.to_datetime(df['Year'], format='%Y')
alt.Chart(df).mark_area().encode(    
    x='Year',
    y=alt.Y('Weight:Q', stack=True),
    color='Type:N'
).transform_fold(
    ['Taxes on specific goods and services (OECD (2016))', 'Value Added Taxes (OECD (2016))'],
    as_=['Type', 'Weight']
)

In [9]:
euro_df = pd.read_csv('data/income-taxes-share-of-revenue-europe.csv')
euro_df.head()

Unnamed: 0,Entity,Code,Year,Share of Revenue from Income tax (Flora (1983) and ICTD (2016))
0,Austria,AUT,1905,4.2
1,Austria,AUT,1925,14.8
2,Austria,AUT,1926,13.9
3,Austria,AUT,1927,14.4
4,Austria,AUT,1928,14.0


In [10]:
relevant_countries = extract_relevant_countries(euro_df, ['Denmark', 'Italy', 'Germany', 'France'])

draw_lines(relevant_countries, 'Share of Revenue from Income tax (Flora (1983) and ICTD (2016))')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = pd.to_datetime(df['Year'], format='%Y')


In [11]:
stat_df = pd.read_csv('data/statutory-corporate-income-tax-rate.csv')
stat_df.head()

Unnamed: 0,Entity,Code,Year,"Combined corporate income tax rate (CTSD, OECD (2019))"
0,Andorra,AND,2000,0.0
1,Andorra,AND,2001,0.0
2,Andorra,AND,2002,0.0
3,Andorra,AND,2003,0.0
4,Andorra,AND,2004,0.0


In [12]:
relevant_countries = extract_relevant_countries(stat_df, ['India', 'Argentina', 'Japan', 'Peru', 'Spain', 'Thailand', 'Singapore'])

draw_lines(relevant_countries, 'Combined corporate income tax rate (CTSD, OECD (2019))')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = pd.to_datetime(df['Year'], format='%Y')


In [13]:
red_df = pd.read_csv('data/tax-reduction-in-income-inequality.csv')
red_df.head()

Unnamed: 0,Entity,Code,Year,Percentage reduction in Gini (OECD (2019))
0,Australia,AUS,2007,22.601374
1,Australia,AUS,2008,22.601374
2,Australia,AUS,2010,23.183921
3,Australia,AUS,2012,23.869408
4,Australia,AUS,2014,24.654378


In [14]:
relevant_countries = extract_relevant_countries(red_df, ['Ireland', 'Denmark', 'France', 'Netherlands', 'United Kingdom', 'United States', 'Switzerland', 'Japan', 'South Korea', 'Turkey', 'Mexico'])

draw_lines(relevant_countries, 'Percentage reduction in Gini (OECD (2019))')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = pd.to_datetime(df['Year'], format='%Y')


In [15]:
df = pd.read_csv('data/tax-revenue-national-income-longrun.csv')
df.head()

Unnamed: 0,Entity,Code,Year,Tax Revenue (Piketty (2014))
0,France,FRA,1870,8.0
1,France,FRA,1880,8.0
2,France,FRA,1890,8.0
3,France,FRA,1896,8.9
4,France,FRA,1897,9.5


In [16]:
rel_df = extract_relevant_countries(df, ['France', 'Sweden', 'United Kingdom', 'United States'])
draw_lines(rel_df, 'Tax Revenue (Piketty (2014))')

In [17]:
df = pd.read_csv('data/tax-revenue-share-gdp-oecd-grsd.csv')
df.head()

Unnamed: 0,Entity,Code,Year,Total tax (% of GDP) (OECD (2018))
0,Africa,,2000,13.147724
1,Africa,,2001,13.832754
2,Africa,,2002,14.25786
3,Africa,,2003,14.578921
4,Africa,,2004,15.109871


In [18]:
relevant_countries = extract_relevant_countries(df, ['Brazil', 'Argentina', 'United States', 'Bolivia', 'Colombia', 'Cameroon', 'Indonesia', 'Democratic Republic of Congo'])

draw_lines(relevant_countries, 'Total tax (% of GDP) (OECD (2018))')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = pd.to_datetime(df['Year'], format='%Y')


In [19]:
df = pd.read_csv('data/country-level-taxes-vs-income.csv')
df['Continent'].unique()
new_df = df[(df['Continent'] == 'Asia')]
new_df['Year'].unique()

array([2015])

In [20]:
df_2015 = df[df['Year'] == 2015]
df_2015.head()

Unnamed: 0,Entity,Code,Year,"GDP per capita, PPP (constant 2017 international $)",Total tax revenue (% of GDP) (ICTD (2021)),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
14,Afghanistan,AFG,2015,2068.265869,7.874765,34413600.0,Asia
512,Africa,,2015,,,1182439000.0,
519,Akrotiri and Dhekelia,OWID_AKD,2015,,,,Asia
545,Albania,ALB,2015,11878.454102,23.985617,2890524.0,Europe


In [21]:
alt.Chart(df_2015).mark_circle().encode(
    x='Total tax revenue (% of GDP) (ICTD (2021)):Q',
    y='GDP per capita, PPP (constant 2017 international $):Q',
    color='Continent:N',
    size='Population (historical estimates):Q',
    tooltip=['Entity']
).properties(
    width=800,
    height=600
).interactive()

In [22]:
## TODO: clean this up, fix scales

alt.Chart(df_2015).mark_circle().encode(
    x=alt.X('Total tax revenue (% of GDP) (ICTD (2021)):Q', scale=alt.Scale(type='log')),
    y=alt.Y('GDP per capita, PPP (constant 2017 international $):Q', scale=alt.Scale(type='log')),
    color='Continent:N',
    size='Population (historical estimates):Q',
    tooltip=['Entity']
).properties(
    width=800,
    height=600
).interactive()

In [23]:
## TODO stacked area chart for
df = pd.read_csv('data/tax-revenues-by-source-gdp-LA.csv')
df.head()

Unnamed: 0,Entity,Code,Year,Resources (Arroyo-Abad and Lindert (2016)),Consumption (Arroyo-Abad and Lindert (2016)),Income and wealth (Arroyo-Abad and Lindert (2016)),Trade (Arroyo-Abad and Lindert (2016))
0,Colombia,COL,1905,1.8,,,2.5
1,Colombia,COL,1907,0.9,,,5.0
2,Colombia,COL,1908,1.0,,,3.7
3,Colombia,COL,1909,1.1,,,3.6
4,Colombia,COL,1910,0.8,,,3.6


In [24]:
peru = df[df['Entity'] == 'Peru']

alt.Chart(peru).mark_area().encode(
    x='Year',
    y='Share:Q',
    color='Source:N'
).transform_fold(
    ['Resources (Arroyo-Abad and Lindert (2016))', 'Consumption (Arroyo-Abad and Lindert (2016))', 'Income and wealth (Arroyo-Abad and Lindert (2016))', 'Trade (Arroyo-Abad and Lindert (2016))'],
    as_=['Source', 'Share']
)

In [25]:
df = pd.read_csv('data/taxation-vs-liberal-democracy.csv')
df.head()

Unnamed: 0,Entity,Code,Year,libdem_vdem_owid,Total tax revenue (% of GDP) (ICTD (2021)),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1789,0.035,,,
2,Afghanistan,AFG,1790,0.035,,3066156.0,
3,Afghanistan,AFG,1791,0.035,,,
4,Afghanistan,AFG,1792,0.035,,,


In [26]:
df_2015 = df[df['Year'] == 2015]
df_2015.head()

Unnamed: 0,Entity,Code,Year,libdem_vdem_owid,Total tax revenue (% of GDP) (ICTD (2021)),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
227,Afghanistan,AFG,2015,0.231,7.874765,34413600.0,Asia
496,Africa,,2015,0.299411,,1182439000.0,
539,Akrotiri and Dhekelia,OWID_AKD,2015,,,,Asia
641,Albania,ALB,2015,0.43,23.985617,2890524.0,Europe


In [27]:
alt.Chart(df_2015).mark_circle().encode(
    x='Total tax revenue (% of GDP) (ICTD (2021)):Q',
    y='libdem_vdem_owid:Q',
    color='Continent:N',
    size='Population (historical estimates):Q',
    tooltip=['Entity']
).properties(
    height=600,
    width=800
).interactive()

In [28]:
df = pd.read_csv('data/taxes-on-goods-and-services-gdp.csv')
df.head()

Unnamed: 0,Entity,Code,Year,Total taxes on goods and services (% of GDP) (ICTD (2021))
0,Afghanistan,AFG,2003,0.144814
1,Afghanistan,AFG,2004,0.503076
2,Afghanistan,AFG,2005,0.559229
3,Afghanistan,AFG,2006,1.721414
4,Afghanistan,AFG,2007,1.533667


In [29]:
rel_df = extract_relevant_countries(df, ['Austria', 'Germany', 'India', 'Venezuela', 'Afghanistan', 'Nigeria'])
draw_lines(rel_df, 'Total taxes on goods and services (% of GDP) (ICTD (2021))')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = pd.to_datetime(df['Year'], format='%Y')


In [30]:
df = pd.read_csv('data/proportion-of-domestic-budget-funded-by-domestic-taxes-of-gdp.csv')
df.head()

Unnamed: 0,Entity,Code,Year,Tax revenue (% of GDP)
0,Afghanistan,AFG,2006,6.967597
1,Afghanistan,AFG,2007,5.283457
2,Afghanistan,AFG,2008,6.087965
3,Afghanistan,AFG,2009,8.481758
4,Afghanistan,AFG,2010,9.169752


In [31]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my_user_agent")

df['shape'] = geolocator.geocode(df['Entity'])

In [32]:
df.head()

Unnamed: 0,Entity,Code,Year,Tax revenue (% of GDP),shape
0,Afghanistan,AFG,2006,6.967597,
1,Afghanistan,AFG,2007,5.283457,
2,Afghanistan,AFG,2008,6.087965,
3,Afghanistan,AFG,2009,8.481758,
4,Afghanistan,AFG,2010,9.169752,


In [33]:
geolocator.geocode('Afghanistan')[1]

(33.7680065, 66.2385139)

In [34]:
from vega_datasets import data
data.us_10m.url

'https://vega.github.io/vega-datasets/data/us-10m.json'

In [35]:
data.world_110m

<vega_datasets.core.World_110M at 0x1365d8070>

In [36]:
rev_df = pd.read_csv('data/government-revenues-national-income.csv', parse_dates=True)
rev_df.head()

Unnamed: 0,Entity,Code,Year,National Gov Revenues (Wallis (2000)),Local Gov Revenues (Wallis (2000)),State Gov Revenues (Wallis (2000))
0,United States,USA,1902,3.0,4.0,0.8
1,United States,USA,1913,2.4,4.2,0.9
2,United States,USA,1922,5.8,5.2,1.7
3,United States,USA,1927,4.7,6.0,2.1
4,United States,USA,1934,6.0,7.6,3.8
