In [None]:
import requests
import pandas as pd
from zipfile import ZipFile 
import io
import json

# Download the EPC Schema

The EPC schema is only included in `.zip` responses, so we request zero records (`size=0`) from the API in zip format, then extract the schema file from the archive and save it.

In [None]:
# Read the API token from a local file so the credential is not written into the notebook.
# Surrounding whitespace is stripped: a trailing newline in the file would otherwise
# become part of the Authorization header value, which is not a valid header.
with open('epc_token.txt', 'r') as file:
    epc_token = file.read().strip()

# Ask for a zip response, because the schema file is only shipped inside zip downloads.
schema_headers = {
    'Accept': 'application/zip',
    'Authorization': f'Basic {epc_token}'
}

# No records are needed here, only the schema bundled with the archive.
schema_params = {
    'size': 0,
}

url = 'https://epc.opendatacommunities.org/api/v1/domestic/search'

In [None]:
# Fetch the (empty) search result as a zip archive.
# The timeout stops a stalled connection from hanging the kernel indefinitely, and
# raise_for_status() reports an HTTP error here instead of as a zip failure later on.
response = requests.get(url, headers=schema_headers, params=schema_params, timeout=60)
response.raise_for_status()
response.status_code

In [None]:
# Open the downloaded archive straight from memory and extract only schema.json into EPC/.
# The context manager closes the archive when done, and the variable name no longer
# shadows the built-in zip() for the rest of the notebook.
with ZipFile(io.BytesIO(response.content)) as schema_archive:
    schema_path = schema_archive.extract(member="schema.json", path="EPC")

In [None]:
# Load the extracted schema into a Python object.
# JSON text is UTF-8 by specification, so name the encoding explicitly rather than
# relying on the platform default (which differs between operating systems).
with open(schema_path, 'r', encoding='utf-8') as file:
    schema = json.load(file)

# The first API request

Make a small API request (up to 100 records for the postcode `M1`) in JSON format.

In [None]:
# Read the API token from a local file so the credential is not written into the notebook.
# Surrounding whitespace is stripped: a trailing newline in the file would otherwise
# become part of the Authorization header value, which is not a valid header.
with open('epc_token.txt', 'r') as file:
    epc_token = file.read().strip()

# This time the response body is requested as JSON.
headers = {
    'Accept': 'application/json',
    'Authorization': f'Basic {epc_token}'
}

# A deliberately small query: at most 100 records, filtered by postcode.
params = {
    'size': 100,
    'postcode': 'M1'
}

url = 'https://epc.opendatacommunities.org/api/v1/domestic/search'

In [None]:
# Send the query and decode the JSON body.
# A bare `response.status_code` in the middle of a cell is neither displayed nor checked,
# so raise_for_status() is used to fail loudly on an HTTP error before parsing.
# The timeout stops a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, headers=headers, params=params, timeout=60)
response.raise_for_status()
data = response.json()

# Download all data using pagination

In [None]:
# Read the API token from a local file so the credential is not written into the notebook.
# Surrounding whitespace is stripped: a trailing newline in the file would otherwise
# become part of the Authorization header value, which is not a valid header.
with open('epc_token.txt', 'r') as file:
    epc_token = file.read().strip()

headers = {
    'Accept': 'application/json',
    'Authorization': f'Basic {epc_token}'
}

# Same postcode filter, but with a much larger `size` for each request.
params = {
    'postcode': 'M1',
    'size': 5000
}

url = 'https://epc.opendatacommunities.org/api/v1/domestic/search'

In [None]:
# Fetch the first page of results; `data` accumulates the rows of every later page.
response = requests.get(url, headers=headers, params=params, timeout=60)
response.raise_for_status()
data = response.json()
# The pagination loop treats a missing X-Next-Search-After header as "no more pages".
# Using .get() applies the same rule here, so a result that fits in a single page
# yields None instead of raising KeyError.
search_after = response.headers.get('X-Next-Search-After')

In [None]:
# Keep requesting pages until the API stops returning a continuation cursor.
while search_after is not None:
    # Pass the cursor from the previous response to continue where it left off.
    params["search-after"] = search_after
    response = requests.get(url, headers=headers, params=params, timeout=60)
    # Stop on an HTTP error instead of trying to parse an error body as a page of rows.
    response.raise_for_status()
    page_data = response.json()
    # extend() appends in place, avoiding a fresh copy of the whole accumulated list per page.
    data['rows'].extend(page_data['rows'])
    # A missing header means this was the last page; .get() returns None in that case.
    search_after = response.headers.get('X-Next-Search-After')

In [None]:
# Build the DataFrame from the accumulated rows, taking the column list from the API response.
epc_df = pd.DataFrame(data['rows'], columns=data['column-names'])

In [None]:
# Show the assembled DataFrame as this cell's output.
epc_df

# Using the data

### Our question: for each property type and each tenure, what is the average potential increase in energy efficiency, expressed as a proportion of the current energy efficiency?

## Cleaning the data

### Convert datatypes

In [None]:
# Parse the inspection date as a real datetime so it can be sorted chronologically.
epc_df['inspection-date'] = pd.to_datetime(epc_df['inspection-date'], format="%Y-%m-%d")

# Both efficiency ratings are cast to integers so they can be used in arithmetic.
for efficiency_column in ('current-energy-efficiency', 'potential-energy-efficiency'):
    epc_df[efficiency_column] = epc_df[efficiency_column].astype(int)

### Convert categorical variables

In [None]:
# List the distinct tenure labels and how often each occurs, before they are normalised.
epc_df['tenure'].value_counts()

In [None]:
# Collapse the alternative spellings of each tenure onto one canonical label.
# Series.map() turns every label that is not a key of the mapping into NaN.
epc_df['tenure'] = epc_df['tenure'].map({
    # Canonical labels map to themselves.
    **{label: label for label in ("rental (private)", "rental (social)", "owner-occupied")},
    # Alternative spellings map onto the canonical labels.
    "Rented (private)": "rental (private)",
    "Rented (social)": "rental (social)",
    "Owner-occupied": "owner-occupied",
})

In [None]:
# Re-check the labels after normalisation.
# dropna=False also counts the NaN rows, i.e. the records whose original label was not
# in the mapping; the default would hide them silently.
epc_df['tenure'].value_counts(dropna=False)

### Remove duplicates

We sort by inspection date (newest first) so that, when duplicates are dropped, the most recent record for each building is the one that is kept.

In [None]:
# Order the records newest-first by inspection date.
epc_df = epc_df.sort_values('inspection-date', ascending=False)

In [None]:
# Keep only the first row seen for each uprn value and discard the later repeats.
# NOTE(review): rows that share a blank or missing uprn would also collapse into a
# single row here — confirm whether such records exist in the download.
epc_df = epc_df[~epc_df.duplicated(subset=["uprn"], keep='first')]

## Insights from data

In [None]:
# Count how many records fall under each property type.
epc_df["property-type"].value_counts()

In [None]:
# Relative headroom for improvement: (potential - current) / current.
# A current rating of 0 would make the ratio infinite (or NaN for 0/0). dropna() does
# not remove infinities, so a single such row would turn its group's mean into inf.
# Zero denominators are therefore masked to NaN, letting those rows drop out with the
# other missing values.
current_rating = epc_df['current-energy-efficiency']
potential_rating = epc_df['potential-energy-efficiency']
epc_df["potential-efficiency-increase"] = (
    (potential_rating - current_rating) / current_rating.where(current_rating != 0)
)

In [None]:
# Narrow the frame to the three columns the grouped averages need, then discard any
# row that has a missing value in one of them.
epc_df = epc_df[["potential-efficiency-increase", "tenure", "property-type"]].dropna()

In [None]:
# Preview the first ten rows. Leaving the frame as the cell's last expression lets the
# notebook render it as a formatted table instead of plain printed text.
epc_df.head(10)

In [None]:
# Group the records by every (tenure, property type) combination.
epc_df_group = epc_df.groupby(by=["tenure", "property-type"])

In [None]:
# Average the relative increase within each group, then pivot the property-type level
# into columns so the table has one row per tenure and one column per property type.
efficiency_increase_averages = (
    epc_df_group["potential-efficiency-increase"]
    .mean()
    .unstack(level="property-type")
)

In [None]:
# Write the table of averages to averages.csv in the working directory.
efficiency_increase_averages.to_csv(path_or_buf="averages.csv")