In [47]:
import pandas as pd
import json
import networkx as nx
from networkx_query import search_nodes, search_edges
import numpy as np

In [48]:
# Reference tables: the list of places, the area-type lookup and the TF-IDF table.
options = pd.read_csv('./csv/lists/places_2020.csv')
areaType = pd.read_csv('/Users/theojolliffe/Documents/Census Data/censusAreaLookup.csv')
tfidf = pd.read_csv('/Users/theojolliffe/Documents/Wayback BBC/BBCRegionalTFIDF.csv')

# Map each area name to its group name; if a name occurs twice the later row wins.
typeLookup = dict(zip(areaType['Name'], areaType['Group name']))

In [49]:
# Load the JSON record of every place code listed in `options`.
# Codes that have no JSON file are printed and skipped.
areas = []
for code in options["code"]:
    path = f'/Users/theojolliffe/Documents/Census Data/census-data-main/json/place/{code}.json'
    try:
        # The context manager closes each file; a bare open() left every handle open.
        with open(path, 'rb') as handle:
            areas.append(json.load(handle))
    except FileNotFoundError:
        print(code)

E05009289
E05009290
E05009293
E05009294
E05009295
E05009297
E05009298
E05009299
E05009300
E05009301
E05009303
E05009306
E05009307
E05009312
E05011090


In [50]:
# Separate the areas by area type
regions = [area for area in areas if area['type'] == 'rgn']

lads = [area for area in areas if area['type'] == 'lad']

# Both 'ew' and 'ctry' records go into the countries list
countries = [area for area in areas if area['type'] in ('ew', 'ctry')]

In [51]:
# Find each LAD's local and national ranks and put them in priority order.
#
# For every variable / period / category of every LAD this cell:
#   * ranks the LAD among the LADs that share its first parent (largest value = rank 1),
#   * stores that rank under '<measure>_rank_local',
#   * re-expresses bottom-half ranks as negative numbers (-1 = last place),
#   * collects everything into thisLad['Priorities'], most extreme local ranks first.

# National ranks above the midpoint are counted back from the bottom of the table.
NATIONAL_RANK_COUNT = 336
NATIONAL_RANK_MIDPOINT = 168


def _is_number(x):
    """Return True for a plain int or float that is not NaN."""
    return type(x) in (int, float) and x == x


def _priority_key(entry):
    """Sort key: closest to the top/bottom of the local table first, then largest value.

    Entries whose value is not a number sort last within their local rank.
    """
    magnitude = -abs(entry['value']) if _is_number(entry['value']) else float('inf')
    return (abs(entry['locRank']), magnitude)


for thisLad in lads:

    # Will be populated with one entry per variable / period / category
    ranks = []

    # Sister areas share this LAD's first parent (the LAD itself is included)
    parent_code = thisLad['parents'][0]['code']
    sister_lads = [lad for lad in lads if lad['parents'][0]['code'] == parent_code]

    # Loop through the various data variables
    for a in thisLad['data']:
        b = 'value' if a in ['population', 'density', 'agemed'] else 'perc'

        # Create nested object with localised rank
        thisLad['data'][a][b+"_rank_local"] = {}
        for c in ['2011', 'change']:
            thisLad['data'][a][b+"_rank_local"][c] = {}
            for d in thisLad['data'][a][b][c]:
                vari = thisLad['data'][a][b][c][d]

                # Values from the sister areas. Only real numbers are sorted: a NaN inside
                # a list being sorted makes the resulting order unreliable.
                raw_values = [lad['data'][a][b][c][d] for lad in sister_lads]
                valid_values = sorted((x for x in raw_values if _is_number(x)), reverse=True)
                group_size = len(raw_values)

                # Position of the area of interest; a missing value is placed directly
                # below every valid value.
                if _is_number(vari):
                    varRank = valid_values.index(vari) + 1
                else:
                    varRank = len(valid_values) + 1

                # Convert bottom half rankings into negative values
                if varRank > group_size/2:
                    varRank = varRank - group_size - 1

                natRank = thisLad['data'][a][b+"_rank"][c][d]
                if natRank > NATIONAL_RANK_MIDPOINT:
                    natRank = natRank - NATIONAL_RANK_COUNT - 1

                thisLad['data'][a][b+"_rank_local"][c][d] = varRank

                # Append ranking data to the priority list
                ranks.append({
                    'label': a+'_'+b+'_'+c+'_'+d,
                    'locRank': varRank,
                    'natRank': natRank,
                    'value': vari})

    # Sort in rank order
    thisLad["Priorities"] = sorted(ranks, key=_priority_key)

In [52]:
# Variables that get a league-table row: key 1 = topic, 2 = measure, 3 = category.
topics = [{1: 'population', 2: 'value', 3: 'all'}, 
          {1: 'health', 2: 'perc', 3: 'good'},
          {1: 'travel', 2: 'perc', 3: 'car_van'},
          {1: 'agemed', 2: 'value', 3: 'all'},
          {1: 'economic', 2: 'perc', 3: 'employee'},
          {1: 'health', 2: 'perc', 3: 'bad'},
          {1: 'ethnicity', 2: 'perc', 3: 'black'},
          {1: 'economic', 2: 'perc', 3: 'student'},
          {1: 'economic', 2: 'perc', 3: 'self-employed'}]

ladCodes = [lad['code'] for lad in lads]

# National ranks above the midpoint are counted back from the bottom of the table.
NATIONAL_RANK_COUNT = 336
NATIONAL_RANK_MIDPOINT = 168

LEAGUE_COLUMNS = ['lad', '2001', '2011', 'change', 'natRank', 'localRank', 'topic', 'parent']

# One row per (topic, LAD). The rows are collected first and the frame is built once:
# assigning through `frame[col].iloc[i] = ...` is chained assignment, which pandas does
# not guarantee to write back to the frame (it is a no-op under copy-on-write).
league_rows = []
for topic in topics:
    topic_name, measure, category = topic[1], topic[2], topic[3]
    for lad in lads:
        variable = lad['data'][topic_name]
        league_rows.append({
            'lad': lad['name'],
            '2001': variable[measure]['2001'][category],
            '2011': variable[measure]['2011'][category],
            'change': variable[measure]['change'][category],
            'natRank': variable[measure+"_rank"]['change'][category],
            'localRank': variable[measure+"_rank_local"]['change'][category],
            'topic': topic_name+"_"+category,
            'parent': lad['parents'][0]['name'],
        })

# Indexed by LAD code (repeated once per topic); object dtype keeps the plain Python
# values that were read from the JSON records.
df = pd.DataFrame(league_rows, columns=LEAGUE_COLUMNS,
                  index=ladCodes*len(topics), dtype=object)
df['natRank'] = np.where(df['natRank'] > NATIONAL_RANK_MIDPOINT,
                         df['natRank']-(NATIONAL_RANK_COUNT+1),
                         df['natRank'])



def _neighbour(group, position):
    """Return {'name', 'value'} for the row at `position` in `group`, or 'NaN'.

    `group` is a slice of `df` already sorted by its '2011' column. The string 'NaN'
    is the placeholder stored when there is no row at that position.
    """
    if 0 <= position < len(group):
        row = group.iloc[position]
        return {'name': row['lad'], 'value': row['2011']}
    return 'NaN'


# For every topic, attach three comparisons to each LAD record:
#   '<measure>_rank'['overtake'][category]          - names of areas it has overtaken
#   '<measure>_rank_local'['above_below'][category] - areas next to it in its parent's table
#   '<measure>_rank'['top_bottom'][category]        - the three biggest movers at each end
for topic in topics:
    topic_name, measure, category = topic[1], topic[2], topic[3]
    rank_key = measure+"_rank"
    local_key = measure+'_rank_local'

    # Rows of this topic only; the same slice is reused for every LAD.
    topic_df = df[df['topic'] == topic_name+"_"+category]

    # Top and bottom three movers (|natRank| < 4); identical for every LAD.
    movers = {}
    for index, row in topic_df[abs(topic_df['natRank']) < 4].iterrows():
        movers[row['natRank']] = {row['lad']: row['change']}

    # One sorted table per parent, built the first time that parent is met.
    region_tables = {}

    for lad in lads:
        variable = lad['data'][topic_name]

        # Find the areas that this area has overtaken: higher in 2001, lower in 2011.
        v2001 = variable[measure]['2001'][category]
        v2011 = variable[measure]['2011'][category]
        overtaken = topic_df[(topic_df['2001'] > v2001) & (topic_df['2011'] < v2011)]
        # setdefault keeps the categories written by earlier topics that share this
        # topic/measure; assigning a fresh {} here wiped them.
        variable[rank_key].setdefault('overtake', {})[category] = overtaken['lad'].tolist()

        # Find the area immediately above and below in the parent's 2011 table.
        region_name = lad['parents'][0]['name']
        if region_name not in region_tables:
            region_tables[region_name] = (
                topic_df[topic_df['parent'] == region_name]
                .sort_values('2011', ascending=False))
        group = region_tables[region_name]

        # Local ranks are 1-based from the top, or negative from the bottom (-1 = last).
        # Converting to a 0-based row position handles both halves of the table alike
        # and stops the first and last rows from wrapping round to the other end.
        local_rank = variable[local_key]['2011'][category]
        position = local_rank - 1 if local_rank > 0 else len(group) + local_rank

        neighbours = variable[local_key].setdefault('above_below', {}).setdefault(category, {})
        neighbours["above"] = _neighbour(group, position - 1)
        neighbours["below"] = _neighbour(group, position + 1)

        # Add top and bottom three biggest movers for each subject to every area.
        # Each LAD gets its own copy so that records never share one dict.
        variable[rank_key].setdefault('top_bottom', {})[category] = {
            rank: dict(entry) for rank, entry in movers.items()}

In [57]:
# Build a "near" triple for every ordered pair of distinct LADs whose bounds almost match.
# The two bounds entries are joined into one list of four numbers; the score is the sum
# of the three smallest absolute differences between the two lists.
geogTriples = []
for lad in lads:
    box = lad['bounds'][0] + lad['bounds'][1]
    for other in lads:
        other_box = other['bounds'][0] + other['bounds'][1]
        gaps = sorted(abs(box[k] - other_box[k]) for k in range(4))
        score = sum(gaps[:3])
        if score < 0.3 and lad != other:
            geogTriples.append([lad['name'], other['name'], ("near", round(score, 2))])
            
# Find nearby areas with the same area type as Amber Valley.
# Every match is printed; `nearSimilar` is left holding the last one.
# NOTE(review): `nearSimilar` is not assigned at all when nothing matches.
target_type = typeLookup.get("Amber Valley")
for first, second, relation in geogTriples:
    # Names missing from the lookup are skipped explicitly; the bare `except: pass`
    # used before would also have hidden any unrelated error.
    if (first == "Amber Valley"
            and target_type is not None
            and typeLookup.get(second) == target_type):
        nearSimilar = second
        print(second)

Erewash


In [11]:
# Filter each LAD's priority list by subject, storing the result under lad['pri'].
# Only 'change' entries are kept, and the topics/categories below are left out.
topicList = ["density", "age10yr"]
subjectList = ["fair", "rent_free", "shared_ownership", "bicycle", "taxi", "moto", "bus", "other", "female", "male"]

for lad in lads:
    priorities = []
    for rank in lad['Priorities']:
        # Labels are '<topic>_<measure>_<period>_<category>'. Limiting the split to three
        # cuts keeps a category in one piece however many underscores it contains.
        topic_name, measure, period, category = rank['label'].split("_", 3)
        if period == "change" and topic_name not in topicList and category not in subjectList:
            priorities.append(rank)
    lad['pri'] = priorities

In [12]:
# Take the first LAD record named "Amber Valley" and show its filtered priorities.
amber_matches = list(filter(lambda lad: lad['name'] == "Amber Valley", lads))
ambervalley = amber_matches[0]
ambervalley['pri']

[{'label': 'economic_perc_change_self-employed',
  'locRank': 7,
  'natRank': 149,
  'value': 1.3},
 {'label': 'health_perc_change_bad',
  'locRank': -8,
  'natRank': -79,
  'value': -4.05},
 {'label': 'ethnicity_perc_change_white',
  'locRank': 8,
  'natRank': 57,
  'value': -0.95},
 {'label': 'ethnicity_perc_change_asian',
  'locRank': -10,
  'natRank': -46,
  'value': 0.38},
 {'label': 'tenure_perc_change_rented_private',
  'locRank': -12,
  'natRank': -112,
  'value': 4.99},
 {'label': 'travel_perc_change_home',
  'locRank': 12,
  'natRank': 146,
  'value': -4.58},
 {'label': 'health_perc_change_good',
  'locRank': 13,
  'natRank': 83,
  'value': 12.97},
 {'label': 'ethnicity_perc_change_mixed',
  'locRank': -13,
  'natRank': -86,
  'value': 0.39},
 {'label': 'ethnicity_perc_change_black',
  'locRank': -13,
  'natRank': -99,
  'value': 0.13},
 {'label': 'population_value_change_all',
  'locRank': -14,
  'natRank': -103,
  'value': 5.01},
 {'label': 'tenure_perc_change_owned',
  'lo

In [27]:
# Amber Valley's 2011 values for the 'density' variable.
ambervalley['data']['density']['value']['2011']

{'all': 4.61}

In [20]:
# Amber Valley's 2011 total-population rank among the LADs that share its parent.
ambervalley['data']['population']['value_rank_local']['2011']['all']

8

In [62]:
# 2011 population values for the nearby same-type area.
# NOTE(review): `nearSimilarData` is assigned in a cell that appears further down this
# notebook, so this cell fails on a fresh top-to-bottom run.
nearSimilarData['data']['population']['value']['2011']

{'all': 112081, 'female': 57147, 'male': 54934}

In [60]:
# 2001 population values for the fourth entry of `regions`.
# NOTE(review): presumably Amber Valley's parent region; the index is hard-coded - confirm.
regions[3]['data']['population']['value']['2001']

{'all': 4172174.0, 'female': 2123316.0, 'male': 2048858.0}

In [36]:
# Population change values for the second entry of `countries`.
# NOTE(review): the index is hard-coded - confirm which record this is.
countries[1]['data']['population']['value']['change']

{'all': 7.88, 'female': 6.85, 'male': 8.98}

In [360]:
# Candidate stories for Amber Valley; the cells that follow add entries to this list.
stories = []

In [361]:
# Add the priorities that sit in the local top or bottom five.
# Entries already in `stories` are skipped, so re-running the cell cannot duplicate them.
stories = stories + [i for i in ambervalley['pri']
                     if abs(i['locRank']) <= 5 and i not in stories]

In [362]:
# Add the priorities that sit in the national top or bottom ten.
# Entries already in `stories` are skipped: a priority that also passed the local-rank
# test (or a re-run of this cell) would otherwise be listed twice.
stories = stories + [i for i in ambervalley['pri']
                     if abs(i['natRank']) <= 10 and i not in stories]

In [363]:
# Show the stories picked by the rank-based filters.
stories

[]

In [364]:
def _magnitude(entry):
    """Absolute size of a priority entry's value."""
    return abs(entry['value'])


# Amber Valley's filtered priorities, largest absolute value first.
change = sorted(ambervalley['pri'], key=_magnitude, reverse=True)
# Entries whose absolute value is above 4.5.
bigchange = [entry for entry in change if _magnitude(entry) > 4.5]
bigchange

[{'label': 'health_perc_change_good',
  'locRank': 13,
  'natRank': 83,
  'value': 12.97},
 {'label': 'travel_perc_change_car_van',
  'locRank': -16,
  'natRank': 89,
  'value': 6.23},
 {'label': 'population_value_change_all',
  'locRank': -14,
  'natRank': -103,
  'value': 5.01},
 {'label': 'tenure_perc_change_rented_private',
  'locRank': -12,
  'natRank': -112,
  'value': 4.99},
 {'label': 'travel_perc_change_home',
  'locRank': 12,
  'natRank': 146,
  'value': -4.58}]

In [366]:
# Top `stories` up to four entries with the biggest changes, largest first.
# Entries that are already in `stories` are skipped so nothing is listed twice.
for candidate in bigchange:
    if len(stories) >= 4:
        break
    if candidate not in stories:
        stories = stories + [candidate]

In [367]:
# Show the stories after topping up with the biggest changes.
stories

[{'label': 'health_perc_change_good',
  'locRank': 13,
  'natRank': 83,
  'value': 12.97},
 {'label': 'travel_perc_change_car_van',
  'locRank': -16,
  'natRank': 89,
  'value': 6.23},
 {'label': 'population_value_change_all',
  'locRank': -14,
  'natRank': -103,
  'value': 5.01},
 {'label': 'tenure_perc_change_rented_private',
  'locRank': -12,
  'natRank': -112,
  'value': 4.99}]

In [368]:
# Entries whose absolute value is below 1, smallest first.
smallchange = [entry for entry in change if abs(entry['value']) < 1]
smallchange.sort(key=lambda entry: abs(entry['value']))
smallchange

[{'label': 'ethnicity_perc_change_black',
  'locRank': -13,
  'natRank': -99,
  'value': 0.13},
 {'label': 'ethnicity_perc_change_asian',
  'locRank': -10,
  'natRank': -46,
  'value': 0.38},
 {'label': 'ethnicity_perc_change_mixed',
  'locRank': -13,
  'natRank': -86,
  'value': 0.39},
 {'label': 'economic_perc_change_student',
  'locRank': 20,
  'natRank': -147,
  'value': 0.5},
 {'label': 'economic_perc_change_employee',
  'locRank': -19,
  'natRank': -127,
  'value': -0.81},
 {'label': 'economic_perc_change_unemployed',
  'locRank': 17,
  'natRank': -136,
  'value': 0.83},
 {'label': 'tenure_perc_change_rented_social',
  'locRank': 16,
  'natRank': -152,
  'value': -0.93},
 {'label': 'ethnicity_perc_change_white',
  'locRank': 8,
  'natRank': 57,
  'value': -0.95}]

In [369]:
# Top `stories` up to five entries with the smallest changes, smallest first.
# Entries that are already in `stories` are skipped so nothing is listed twice.
for candidate in smallchange:
    if len(stories) >= 5:
        break
    if candidate not in stories:
        stories = stories + [candidate]

In [370]:
# Show the final list of stories.
stories

[{'label': 'health_perc_change_good',
  'locRank': 13,
  'natRank': 83,
  'value': 12.97},
 {'label': 'travel_perc_change_car_van',
  'locRank': -16,
  'natRank': 89,
  'value': 6.23},
 {'label': 'population_value_change_all',
  'locRank': -14,
  'natRank': -103,
  'value': 5.01},
 {'label': 'tenure_perc_change_rented_private',
  'locRank': -12,
  'natRank': -112,
  'value': 4.99},
 {'label': 'ethnicity_perc_change_black',
  'locRank': -13,
  'natRank': -99,
  'value': 0.13}]

In [371]:
def reg(i):
    """Return the value in `regions[3]` that matches a priority entry.

    Parameters
    ----------
    i : dict
        Priority entry whose 'label' has the form
        '<topic>_<measure>_<period>_<category>'.

    Returns
    -------
    The value stored at regions[3]['data'][topic][measure][period][category].

    Raises
    ------
    ValueError
        If the label has fewer than four '_'-separated parts.
    KeyError
        If the region record has no value at that path.
    """
    # Only the first three underscores separate fields; everything after them is the
    # category, so categories such as 'car_van' (or longer) stay in one piece.
    topic, measure, period, category = i['label'].split("_", 3)
    return regions[3]['data'][topic][measure][period][category]
# Gap between each Amber Valley entry and the matching regional value, widest gap first.
regDiff = [(entry['label'], entry['value'] - reg(entry)) for entry in change]
sorted(regDiff, key=lambda pair: abs(pair[1]), reverse=True)

[('population_value_change_all', -3.6400000000000006),
 ('ethnicity_perc_change_white', 3.2800000000000002),
 ('travel_perc_change_foot', -1.7999999999999998),
 ('ethnicity_perc_change_asian', -1.73),
 ('tenure_perc_change_rented_private', -1.6399999999999997),
 ('tenure_perc_change_owned', 1.5099999999999998),
 ('travel_perc_change_car_van', 1.1600000000000001),
 ('ethnicity_perc_change_black', -0.73),
 ('tenure_perc_change_rented_social', 0.63),
 ('ethnicity_perc_change_mixed', -0.47),
 ('health_perc_change_bad', -0.46999999999999975),
 ('economic_perc_change_inactive', 0.3799999999999999),
 ('economic_perc_change_self-employed', 0.37),
 ('economic_perc_change_employee', -0.36000000000000004),
 ('economic_perc_change_student', -0.31999999999999995),
 ('travel_perc_change_home', -0.16999999999999993),
 ('health_perc_change_good', 0.11000000000000121),
 ('travel_perc_change_train_metro', 0.07000000000000006),
 ('economic_perc_change_unemployed', -0.07000000000000006),
 ('agemed_value_c

In [280]:
def cou(i):
    """Return the value in `countries[1]` that matches a priority entry.

    Parameters
    ----------
    i : dict
        Priority entry whose 'label' has the form
        '<topic>_<measure>_<period>_<category>'.

    Returns
    -------
    The value stored at countries[1]['data'][topic][measure][period][category].

    Raises
    ------
    ValueError
        If the label has fewer than four '_'-separated parts.
    KeyError
        If the country record has no value at that path.
    """
    # Only the first three underscores separate fields; everything after them is the
    # category, so categories such as 'car_van' (or longer) stay in one piece.
    topic, measure, period, category = i['label'].split("_", 3)
    return countries[1]['data'][topic][measure][period][category]
# Gap between each Amber Valley entry and the matching country value, widest gap first.
couDiff = [(entry['label'], entry['value'] - cou(entry)) for entry in change]
sorted(couDiff, key=lambda pair: abs(pair[1]), reverse=True)

[('travel_perc_change_car_van', 5.3100000000000005),
 ('ethnicity_perc_change_white', 4.56),
 ('population_value_change_all', -2.87),
 ('travel_perc_change_foot', -2.44),
 ('ethnicity_perc_change_asian', -2.42),
 ('tenure_perc_change_rented_private', -1.96),
 ('tenure_perc_change_owned', 1.8899999999999997),
 ('economic_perc_change_inactive', 1.2199999999999998),
 ('ethnicity_perc_change_black', -1.0499999999999998),
 ('travel_perc_change_train_metro', -0.95),
 ('travel_perc_change_home', -0.7600000000000002),
 ('ethnicity_perc_change_mixed', -0.5499999999999999),
 ('economic_perc_change_employee', -0.53),
 ('tenure_perc_change_rented_social', 0.5299999999999999),
 ('health_perc_change_bad', -0.5099999999999998),
 ('economic_perc_change_student', -0.36),
 ('health_perc_change_good', 0.33999999999999986),
 ('economic_perc_change_unemployed', -0.20000000000000007),
 ('agemed_value_change_all', -0.14000000000000012),
 ('economic_perc_change_self-employed', -0.1399999999999999)]

In [58]:
# Compare Amber Valley's 2011 total population with that of the nearby same-type area.
nearSimilarData = [lad for lad in lads if lad['name'] == nearSimilar][0]

amber_population = ambervalley['data']['population']['value']['2011']['all']
print(ambervalley['name'], " population: ", amber_population)

near_population = nearSimilarData['data']['population']['value']['2011']['all']
print(nearSimilarData['name'], " population: ", near_population)

# Absolute gap, then the gap as a whole-number percentage of Amber Valley's population.
difference = amber_population - near_population
print("Difference: ", difference, ", ", round(100*difference/amber_population), "%")

Amber Valley  population:  122309
Erewash  population:  112081
Difference:  10228 ,  8 %


In [373]:
def near(i):
    """Return the value in `nearSimilarData` that matches a priority entry.

    Parameters
    ----------
    i : dict
        Priority entry whose 'label' has the form
        '<topic>_<measure>_<period>_<category>'.

    Returns
    -------
    The value stored at nearSimilarData['data'][topic][measure][period][category].

    Raises
    ------
    ValueError
        If the label has fewer than four '_'-separated parts.
    KeyError
        If the record has no value at that path.
    """
    # Only the first three underscores separate fields; everything after them is the
    # category, so categories such as 'car_van' (or longer) stay in one piece.
    topic, measure, period, category = i['label'].split("_", 3)
    return nearSimilarData['data'][topic][measure][period][category]
# Gap between each Amber Valley entry and the nearby area's value, widest gap first.
nearDiff = [(entry['label'], entry['value'] - near(entry)) for entry in change]
sorted(nearDiff, key=lambda pair: abs(pair[1]), reverse=True)

[('population_value_change_all', 3.21),
 ('travel_perc_change_car_van', -2.889999999999999),
 ('tenure_perc_change_owned', 2.3099999999999996),
 ('travel_perc_change_foot', 1.6),
 ('travel_perc_change_home', 1.3899999999999997),
 ('tenure_perc_change_rented_private', -1.2999999999999998),
 ('health_perc_change_bad', -0.52),
 ('economic_perc_change_unemployed', -0.43000000000000005),
 ('economic_perc_change_employee', 0.4099999999999999),
 ('health_perc_change_good', 0.25),
 ('travel_perc_change_train_metro', -0.16999999999999993),
 ('economic_perc_change_self-employed', 0.08000000000000007),
 ('ethnicity_perc_change_mixed', -0.07999999999999996),
 ('economic_perc_change_inactive', -0.07000000000000006),
 ('agemed_value_change_all', -0.06999999999999984),
 ('ethnicity_perc_change_white', 0.06000000000000005),
 ('tenure_perc_change_rented_social', -0.06000000000000005),
 ('ethnicity_perc_change_black', 0.03),
 ('economic_perc_change_student', 0.0),
 ('ethnicity_perc_change_asian', 0.0)]

In [374]:
# Every "near" triple whose first area is Amber Valley.
list(filter(lambda triple: triple[0] == "Amber Valley", geogTriples))

[['Amber Valley', 'Derby', ('near', 0.19)],
 ['Amber Valley', 'Chesterfield', ('near', 0.3)],
 ['Amber Valley', 'Erewash', ('near', 0.25)],
 ['Amber Valley', 'North East Derbyshire', ('near', 0.22)],
 ['Amber Valley', 'Ashfield', ('near', 0.26)],
 ['Amber Valley', 'Broxtowe', ('near', 0.24)]]

In [375]:
# The first priority of each area near Amber Valley, most extreme local rank first.
neighbour_names = [triple[1] for triple in geogTriples if triple[0] == "Amber Valley"]
neighbour_priorities = [(lad['name'], lad['pri'][0]) for lad in lads if lad['name'] in neighbour_names]
sorted(neighbour_priorities, key=lambda pair: abs(pair[1]['locRank']))


[('North East Derbyshire',
  {'label': 'travel_perc_change_home',
   'locRank': -1,
   'natRank': -3,
   'value': -9.76}),
 ('Derby',
  {'label': 'agemed_value_change_all',
   'locRank': -2,
   'natRank': -31,
   'value': -2.7}),
 ('Chesterfield',
  {'label': 'economic_perc_change_unemployed',
   'locRank': -2,
   'natRank': -26,
   'value': 0.12}),
 ('Ashfield',
  {'label': 'travel_perc_change_train_metro',
   'locRank': 2,
   'natRank': 125,
   'value': 2.69}),
 ('Broxtowe',
  {'label': 'agemed_value_change_all',
   'locRank': 4,
   'natRank': 31,
   'value': 5.26}),
 ('Erewash',
  {'label': 'population_value_change_all',
   'locRank': -5,
   'natRank': -33,
   'value': 1.8})]