In [None]:
from brokenaxes import brokenaxes
import jsonstat
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Nomis API endpoint for dataset NM_31_1, requested in JSON-stat format.
url = 'https://www.nomisweb.co.uk/api/v01/dataset/NM_31_1.jsonstat.json'
# Fetched over the network on every run, so the years available can change between runs.
data = jsonstat.from_url(url)

# Plain-text summary of what was downloaded.
print(data)

In [None]:
# Flatten the downloaded collection into a DataFrame and turn its index back
# into ordinary columns in a single chained expression.
# NOTE(review): 'geography' is presumably the dimension used as the index - confirm
# against the jsonstat.py documentation.
df = data.to_data_frame('geography').reset_index()
df.head()

1. A table showing the male, female and total population in columns, with one row per UK region plus a row for the UK total, for the most recent year.

In [None]:
# The most recent year is the maximum value of the date column.
latest_year = df.date.max()

# Keep only the all-ages head counts (not percentages) for that year.
is_latest_all_ages_count = (
    (df.date == latest_year)
    & (df.age == 'All ages')
    & (df.measures == 'Value')
)

# Rename through DataFrame.rename, which returns a new frame, instead of writing
# into `columns.values`: that mutated the Index's backing array behind pandas'
# back on a filtered slice of `df`, and fails outright when the array is read-only.
question_1 = (
    df.loc[is_latest_all_ages_count, ['geography', 'sex', 'Value']]
    .rename(columns={'geography': 'region', 'Value': 'value'})
)

In [None]:
def get_england_population_stats(sex, population=None):
    """Derive England's population for one sex as 'England and Wales' minus 'Wales'.

    Parameters
    ----------
    sex : str
        Value of the ``sex`` column to select (e.g. 'Male', 'Female', 'Total').
    population : pandas.DataFrame, optional
        Frame with ``region``, ``sex`` and ``value`` columns. Defaults to the
        notebook-level ``question_1`` frame, which keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    dict
        A record ``{'region': 'England', 'sex': sex, 'value': ...}`` ready to be
        added to the table as a new row.

    Raises
    ------
    IndexError
        If the frame has no 'England and Wales' or no 'Wales' row for ``sex``.
    """
    if population is None:
        population = question_1

    def _value_for(region):
        # First matching row for this region and sex.
        rows = population[(population.region == region) & (population.sex == sex)]
        return rows.iloc[0]['value']

    population_by_sex = _value_for('England and Wales') - _value_for('Wales')
    return {'region': 'England', 'sex': sex, 'value': population_by_sex}

In [None]:
# Add one derived 'England' row per sex. DataFrame.append was removed in
# pandas 2.0, so the new rows are collected first and attached with one concat.
england_rows = pd.DataFrame(
    [get_england_population_stats(sex) for sex in df.sex.unique()]
)
question_1 = pd.concat([question_1, england_rows], ignore_index=True)

In [None]:
# 'England and Wales' is a combined entry; England and Wales each have their own
# rows in the table, so the combined rows are removed.
is_combined_region = question_1.region == 'England and Wales'
question_1 = question_1[~is_combined_region]

In [None]:
# UK figure for each sex = sum of that sex's value over every region currently in
# the table. DataFrame.append was removed in pandas 2.0, so the three rows are
# built up front and attached with a single concat.
uk_rows = pd.DataFrame([
    {
        'region': 'UK',
        'sex': sex,
        'value': question_1.loc[question_1.sex == sex, 'value'].sum(),
    }
    for sex in ('Male', 'Female', 'Total')
])
question_1 = pd.concat([question_1, uk_rows], ignore_index=True)

In [None]:
# Regions as rows, sexes as columns.
population_by_region_and_sex = question_1.groupby(['region', 'sex']).sum()
population_by_region_and_sex.unstack(level='sex')

In [None]:
# England's population as a proportion of UK's population
is_total = question_1.sex == 'Total'
england_total = question_1[(question_1.region == 'England') & is_total].iloc[0]['value']
uk_total = question_1[(question_1.region == 'UK') & is_total].iloc[0]['value']
england_total / uk_total

In [None]:
# Female:Male ratio by region
def get_female_to_male_ration(region):
    """Return the female value divided by the male value for ``region``.

    Reads the notebook-level ``question_1`` frame and uses the first matching
    row for each sex; raises IndexError when a row is missing.
    """
    in_region = question_1.region == region
    female_value = question_1[in_region & (question_1.sex == 'Female')].iloc[0]['value']
    male_value = question_1[in_region & (question_1.sex == 'Male')].iloc[0]['value']
    return female_value / male_value

# Print the ratio for each region, in the same order as before.
for region in ('England', 'Northern Ireland', 'Scotland', 'Wales', 'UK'):
    print(region, get_female_to_male_ration(region))

2. Exploratory data analysis showing how the population changed over time, by region and by age group.

In [None]:
# All-ages, both-sexes head counts: one row per region and year.
is_overall_count = (
    (df.age == 'All ages')
    & (df.sex == 'Total')
    & (df.measures == 'Value')
)
total_population_by_region = df.loc[is_overall_count, ['geography', 'date', 'Value']]

In [None]:
# Derive England's total for every year as 'England and Wales' minus 'Wales'.
# DataFrame.append was removed in pandas 2.0 (and re-copied the whole frame on
# every iteration), so the records are collected and attached with one concat.
england_records = []
for year in total_population_by_region.date.unique():
    in_year = total_population_by_region[total_population_by_region.date == year]
    england_and_wales_value = in_year[in_year.geography == 'England and Wales'].iloc[0]['Value']
    wales_value = in_year[in_year.geography == 'Wales'].iloc[0]['Value']
    england_records.append({
        'geography': 'England',
        'date': year,
        'Value': england_and_wales_value - wales_value,
    })

total_population_by_region = pd.concat(
    [total_population_by_region, pd.DataFrame(england_records)],
    ignore_index=True,
)

In [None]:
# Show the first rows of the per-region totals.
total_population_by_region.head()

In [None]:
# Show the last rows of the per-region totals.
total_population_by_region.tail()

In [None]:
# Population for Northern Ireland only available from 1992
has_missing_value = total_population_by_region['Value'].isna()
total_population_by_region[has_missing_value]

In [None]:
# One frame per region; the names are kept because later cells read them.
region_columns = ['geography', 'date', 'Value']
england, northern_ireland, scotland, wales = (
    total_population_by_region.loc[total_population_by_region.geography == region, region_columns]
    for region in ('England', 'Northern Ireland', 'Scotland', 'Wales')
)

fig = plt.figure(figsize=(10, 8))

# The y-axis is broken into two ranges, (0, 0.75e7) and (4e7, 6e7), so regions
# of very different size can share one figure.
bax = brokenaxes(ylims=((0, 0.75e7), (4e7, 6e7)), hspace=.08)

# Same plotting order as before, so each region keeps its line colour.
region_series = (
    ('England', england),
    ('Northern Ireland', northern_ireland),
    ('Scotland', scotland),
    ('Wales', wales),
)
for region_label, region_frame in region_series:
    bax.plot(region_frame.date, region_frame.Value, label=region_label)

bax.set_xlabel('Year')
bax.set_ylabel('Population (10s millions)')

plt.xticks(rotation='vertical')
plt.title('Population Change by UK Region from 1981 to 2017', fontsize=20)
bax.legend(loc=5)

In [None]:
# Relative population increase between the first and last observed years
def get_percentage_increase(df):
    """Return the relative change in ``Value`` between the earliest and latest dates.

    Only rows with a non-null ``Value`` are considered, so a series that starts
    with missing years (Northern Ireland has no figures before 1992) is measured
    from its first available year instead of yielding NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``date`` and ``Value`` columns for a single region.

    Returns
    -------
    float
        ``(last - first) / first`` as a fraction (0.25 means +25%), or NaN when
        the frame has no non-null ``Value`` at all.
    """
    observed = df.dropna(subset=['Value'])
    if observed.empty:
        return float('nan')

    first_value = observed[observed.date == observed.date.min()].iloc[0].Value
    last_value = observed[observed.date == observed.date.max()].iloc[0].Value
    return (last_value - first_value) / first_value

# Print the increase for each region, in the same order as before.
increase_inputs = (
    ('England', england),
    ('Northern Ireland', northern_ireland),
    ('Scotland', scotland),
    ('Wales', wales),
)
for region_label, region_frame in increase_inputs:
    print(region_label, get_percentage_increase(region_frame))

In [None]:
# UK population by age group

In [None]:
# Percentage figures per age group for Wales (both sexes), one row per year and age group.
wales_demographics = df[(df.geography == 'Wales') & (df.sex == 'Total') & (df.age != 'All ages') & (df.measures == 'Percent')]

# Pick the plotted age groups by name rather than by position in unique(), so
# the series and their legend labels cannot drift apart.
age_groups = ['Aged 0 - 15', 'Aged 16 - 64', 'Aged 65 and over']

# One row per year, one column per age group. pivot aligns values by date and
# sizes itself from the data, so the chart no longer assumes exactly 37 years
# (the dataset is downloaded live and grows over time).
chart_data = (
    wales_demographics[wales_demographics.age.isin(age_groups)]
    .pivot(index='date', columns='age', values='Value')
    .reindex(columns=age_groups)
    .reset_index()
)

plt.stackplot(chart_data.date, *[chart_data[age_group] for age_group in age_groups], labels=age_groups)
plt.xticks(rotation='vertical')
plt.legend(loc='upper left')
plt.margins(0,0)
plt.title('Proportion change of Wales\' Population by Age Group')
plt.xlabel('Year')
plt.ylabel('Percentage (%)')

In [None]:
# Percentage figures per age group for Scotland (both sexes), one row per year and age group.
scotland_demographics = df[(df.geography == 'Scotland') & (df.sex == 'Total') & (df.age != 'All ages') & (df.measures == 'Percent')]

# Pick the plotted age groups by name rather than by position in unique(), so
# the series and their legend labels cannot drift apart.
age_groups = ['Aged 0 - 15', 'Aged 16 - 64', 'Aged 65 and over']

# One row per year, one column per age group. pivot aligns values by date and
# sizes itself from the data, so the chart no longer assumes exactly 37 years
# (the dataset is downloaded live and grows over time).
chart_data = (
    scotland_demographics[scotland_demographics.age.isin(age_groups)]
    .pivot(index='date', columns='age', values='Value')
    .reindex(columns=age_groups)
    .reset_index()
)

plt.stackplot(chart_data.date, *[chart_data[age_group] for age_group in age_groups], labels=age_groups)
plt.xticks(rotation='vertical')
plt.legend(loc='upper left')
plt.margins(0,0)
plt.title('Proportion change of Scotland\'s Population by Age Group')
plt.xlabel('Year')
plt.ylabel('Percentage (%)')

In [None]:
# Percentage figures per age group for Northern Ireland (both sexes), one row per year and age group.
northern_ireland_demographics = df[(df.geography == 'Northern Ireland') & (df.sex == 'Total') & (df.age != 'All ages') & (df.measures == 'Percent')]

# Pick the plotted age groups by name rather than by position in unique(), so
# the series and their legend labels cannot drift apart.
age_groups = ['Aged 0 - 15', 'Aged 16 - 64', 'Aged 65 and over']

# One row per year, one column per age group. pivot aligns values by date and
# sizes itself from the data, so the chart no longer assumes exactly 37 years
# (the dataset is downloaded live and grows over time).
chart_data = (
    northern_ireland_demographics[northern_ireland_demographics.age.isin(age_groups)]
    .pivot(index='date', columns='age', values='Value')
    .reindex(columns=age_groups)
    .reset_index()
)

plt.stackplot(chart_data.date, *[chart_data[age_group] for age_group in age_groups], labels=age_groups)
plt.xticks(rotation='vertical')
plt.legend(loc='upper left')
plt.margins(0,0)
plt.title('Proportion change of Northern Ireland\'s Population by Age Group')
plt.xlabel('Year')
plt.ylabel('Percentage (%)')

In [None]:
# Percentage figures per age group (both sexes). The rows selected here are the
# combined 'England and Wales' geography, not England alone; the variable name
# is kept as-is in case other cells refer to it.
england_demographics = df[(df.geography == 'England and Wales') & (df.sex == 'Total') & (df.age != 'All ages') & (df.measures == 'Percent')]

# Pick the plotted age groups by name rather than by position in unique(), so
# the series and their legend labels cannot drift apart.
age_groups = ['Aged 0 - 15', 'Aged 16 - 64', 'Aged 65 and over']

# One row per year, one column per age group. pivot aligns values by date and
# sizes itself from the data, so the chart no longer assumes exactly 37 years
# (the dataset is downloaded live and grows over time).
chart_data = (
    england_demographics[england_demographics.age.isin(age_groups)]
    .pivot(index='date', columns='age', values='Value')
    .reindex(columns=age_groups)
    .reset_index()
)

plt.stackplot(chart_data.date, *[chart_data[age_group] for age_group in age_groups], labels=age_groups)
plt.xticks(rotation='vertical')
plt.legend(loc='upper left')
plt.margins(0,0)
# Title names the geography that is actually plotted; axis labels match the
# other regional charts.
plt.title('Proportion change of England and Wales\' Population by Age Group')
plt.xlabel('Year')
plt.ylabel('Percentage (%)')

In [None]:
# Head counts (not percentages) per age group for the combined
# 'England and Wales' geography, both sexes.
is_england_and_wales_age_count = (
    (df.geography == 'England and Wales')
    & (df.sex == 'Total')
    & (df.age != 'All ages')
    & (df.measures == 'Value')
)
england_and_wales_demographics = df[is_england_and_wales_age_count]
england_and_wales_demographics.head()

In [None]:
# For each year and age group
# * Find the "Value" for England only ("England and Wales" - "Wales") for that year and age group
# * Calculate percentage by dividing newly found "Value" by "Value" for "All ages"

In [None]:
# Keep only rows that actually have a head count.
england_and_wales_demographics = england_and_wales_demographics[england_and_wales_demographics.Value.notnull()]

# Wales head counts in the same unit as the frame above. The notebook-level
# `wales_demographics` frame holds measures == 'Percent', so subtracting it from
# head counts mixed percentages with people; the counts are selected here instead.
wales_counts = df[(df.geography == 'Wales') & (df.sex == 'Total') & (df.age != 'All ages') & (df.measures == 'Value')]

# Only the original combined rows are used for lookups, so re-running the cell
# never reads back rows it added itself.
combined_counts = england_and_wales_demographics[england_and_wales_demographics.geography == 'England and Wales']
age_groups = england_and_wales_demographics.age.unique()[-3:]

# England = 'England and Wales' - 'Wales' for every year and age group.
# DataFrame.append was removed in pandas 2.0, so the rows are collected and
# attached with a single concat instead of growing the frame inside the loop.
england_records = []
for year in england_and_wales_demographics.date.unique():
    for age_group in age_groups:
        england_and_wales_pop = combined_counts[(combined_counts.date == year) & (combined_counts.age == age_group)].iloc[0].Value
        wales_pop = wales_counts[(wales_counts.date == year) & (wales_counts.age == age_group)].iloc[0].Value
        england_records.append({
            'geography': 'England',
            'date': year,
            'sex': 'Total',
            'age': age_group,
            'measures': 'Value',
            'Value': england_and_wales_pop - wales_pop,
        })

england_and_wales_demographics = pd.concat(
    [england_and_wales_demographics, pd.DataFrame(england_records)],
    ignore_index=True,
)
        

In [None]:
# Show only the rows whose geography is 'England'.
england_and_wales_demographics[england_and_wales_demographics.geography == 'England']