In [3]:
# Importing libraries
import pandas as pd
import numpy as np

In [4]:
# User-tunable thresholds: the minimum and target populations, and the
# minimum and target proportions (expressed as fractions, not percentages).
min_population, target_population = 3500, 25000
min_proportion, target_proportion = 2 / 3, 3 / 4

In [5]:
# Load the raw input matrix from disk; the first CSV column is used as the
# row index and the header row supplies the column labels.
raw_input_matrix = pd.read_csv(
    'input_matrix.csv',
    index_col=0,
)

In [6]:
# Working copy of the raw matrix whose row index is named "zones".
# rename_axis returns a new frame, so raw_input_matrix itself is not renamed.
input_matrix = raw_input_matrix.rename_axis(index='zones')

In [13]:
# Submatrix for testing: rows and columns at positions 100-199 (up to 100 by 100).
# The slice is taken from raw_input_matrix rather than from input_matrix itself,
# so re-running this cell always produces the same block. Slicing input_matrix
# in terms of itself would, on a second run, slice the already-reduced frame
# and leave it empty.
input_matrix = raw_input_matrix.rename_axis('zones').iloc[100:200, 100:200]

In [10]:
# resident_proportion_matrix[A, B] = commutes from A to B, as a share of everyone
# who lives in A. Each row of the input matrix is divided by that row's total
# (the number of people living in A), answering: of all the people who live in A,
# what proportion of them commute to B?
resident_proportion_matrix = input_matrix.div(
    input_matrix.sum(axis='columns'),  # one total per row (summing across the columns)
    axis='index',                      # line the totals up with the rows
)

In [11]:
# worker_proportion_matrix[A, B] = commutes from A to B, as a share of everyone
# who works in B. Each column of the input matrix is divided by that column's
# total (the number of people working in B), answering: of all the people who
# work in B, what proportion of them commute from A?
worker_proportion_matrix = input_matrix.div(
    input_matrix.sum(axis='index'),  # one total per column (summing down the rows)
    axis='columns',                  # line the totals up with the columns
)

In [12]:
# "Closeness" score between areas A and B. The one-way term for A -> B is
# (share of A's residents commuting to B) * (share of B's workers coming from A);
# the score adds the matching term for B -> A, which is the transpose of the
# one-way matrix. The result is also written to disk.
one_way_score = resident_proportion_matrix.mul(worker_proportion_matrix)
score_matrix = one_way_score + one_way_score.T
score_matrix.to_csv('score_matrix.csv')

In [13]:
# Lookup from zone (the index, taken from the input matrix) to area id.
# Every zone starts in its own area: ids are the unique integers 0, 1, 2, ...
# in the order the zones appear. Used to map score-matrix labels to area ids.
zone_area_lookup_v1 = pd.DataFrame(
    {'area': range(len(input_matrix.index))},
    index=input_matrix.index,
)

# Working copy for the next section; v1 is kept unchanged as the fundamental lookup.
zone_area_lookup = zone_area_lookup_v1.copy()

In [14]:
# Straight line y = m*x + c separating valid from invalid areas between the
# minimum and target populations. It passes through
# (min_population, target_proportion) and (target_population, min_proportion).
population_span = target_population - min_population
m = (min_proportion - target_proportion) / population_span
c = target_proportion - m * min_population

In [15]:
# Per-zone summary table, indexed by the same zones as the input matrix.
summary_matrix = pd.DataFrame(index=input_matrix.index)

# Area id currently assigned to each zone, looked up by zone label.
summary_matrix['area'] = summary_matrix.index.map(zone_area_lookup['area'])

# Row totals (residents) and column totals (workers), plus the diagonal of each
# proportion matrix (the share of a zone's residents / workers that stay in it).
# NOTE(review): the two totals are aligned to the zone index by label, while
# np.diag is positional - this assumes a square matrix whose row and column
# labels are identical and in the same order. Confirm against the input CSV.
summary_matrix['total_residents'] = input_matrix.sum(axis=1)
summary_matrix['total_workers'] = input_matrix.sum(axis=0)
summary_matrix['proportion_residents_who_work_in_area'] = np.diag(resident_proportion_matrix)
summary_matrix['proportion_workers_who_reside_in_area'] = np.diag(worker_proportion_matrix)

# The smaller of the two self-containment proportions is the one tested below.
self_containment = summary_matrix[
    ['proportion_residents_who_work_in_area', 'proportion_workers_who_reside_in_area']
].min(axis=1)

# An area is valid (inside the ONS-defined 'feasible region', given the
# user-defined parameters) when all three conditions hold:
#   1. its resident total exceeds min_population,
#   2. its self-containment exceeds min_proportion,
#   3. its self-containment lies above the line y = m*x + c at its resident total.
# Each comparison is parenthesised because `&` binds more tightly than `>`;
# without the parentheses the last comparison is applied to the result of the
# whole `&` chain rather than to the self-containment value.
summary_matrix['valid_area'] = (
    (summary_matrix['total_residents'] > min_population)
    & (self_containment > min_proportion)
    & (self_containment > (c + m * summary_matrix['total_residents']))
)

# Highest score in each row, and the column label (zone) where it occurs.
# NOTE(review): the row maximum includes the zone's score with itself (the
# diagonal), so a zone can be its own best match - confirm this is intended.
summary_matrix['max_score'] = score_matrix.max(axis=1)
summary_matrix['max_score_index'] = score_matrix.idxmax(axis=1)

# Area id of the best-scoring zone, via the same zone -> area lookup.
summary_matrix['new_area'] = summary_matrix['max_score_index'].map(zone_area_lookup['area'])

# True where the best-scoring zone belongs to a different area than the zone itself.
summary_matrix['changed_area'] = summary_matrix['new_area'] != summary_matrix['area']

In [16]:
# Order by score, highest first. The sorted frame is rebound to the same name
# instead of sorting in place, so the frame built by the earlier cell is not
# mutated behind the reader's back.
summary_matrix = summary_matrix.sort_values(by='max_score', ascending=False)