In [None]:
import json
import datetime
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import pymongo
from bs4 import BeautifulSoup
import requests
import re
import pathlib
# Widen column display so long report paths/urls are not cut off. The full
# option name is spelled out because abbreviated names only work while they
# match exactly one pandas option.
pd.set_option("display.max_colwidth", 200)

In [None]:
# Lookup tables between point number and slugified title for the 18-point
# list labelled `july_2019` in this notebook.
# Caveat: of limited use, as some reports reference 'point 26', which as far
# as I know does not exist.

july_2019_id_to_crit = {
    point_id: '-'.join(title.lower().split(' '))
    for point_id, title in enumerate(
        (
            'understand user needs',
            'do ongoing user research',
            'have a multidisciplinary team',
            "use agile methods",
            "Iterate and improve frequently",
            "Evaluate tools and systems",
            "Understand security and privacy issues",
            "Make all new source code open",
            "Use open standards and common platforms",
            "Test the end-to-end service",
            "Make a plan for being offline",
            "Make sure users succeed first time",
            "Make the user experience consistent with GOV.UK",
            "Encourage everyone to use the digital service",
            "Collect performance data",
            "Identify performance indicators",
            "Report performance data on the Performance Platform",
            "Test with the minister",
        ),
        start=1,
    )
}
# Reverse lookup: slug -> point number
july_2019_crit_to_id = dict(
    (slug, point_id) for point_id, slug in july_2019_id_to_crit.items()
)

In [None]:
# Titles of the 14-point list used as the "current" standard in this
# notebook, keyed by point number.
current_crit = dict(enumerate(
    (
        "Understand users and their needs",
        "Solve a whole problem for users",
        "Provide a joined up experience across all channels",
        "Make the service simple to use",
        "Make sure everyone can use the service",
        "Have a multidisciplinary team",
        "Use agile ways of working",
        "Iterate and improve frequently",
        "Create a secure service which protects users’ privacy",
        "Define what success looks like and publish performance data",
        "Choose the right tools and technology",
        "Make new source code open",
        "Use and contribute to open standards, common components and patterns",
        "Operate a reliable service",
    ),
    start=1,
))
# Point number -> slug (lower-cased title with spaces turned into hyphens)
id_to_crit = {
    point_id: '-'.join(title.lower().split(' '))
    for point_id, title in current_crit.items()
}
# Reverse lookup: slug -> point number
crit_to_id = dict((slug, point_id) for point_id, slug in id_to_crit.items())

In [None]:
# Listing pages 1-5 of the published service assessment reports
urls = [
    'https://www.gov.uk/service-standard-reports?page=' + str(page_number)
    for page_number in range(1, 6)
]

In [None]:
def get_assessment_path_and_metadata(urllist):
    """
    Scrape report links and listing metadata from each page of
    https://www.gov.uk/service-standard-reports

    Args:
        urllist (list of str): listing page urls to scrape

    Returns:
        pandas.DataFrame: one row per report link, with columns
        path, assessment_outcome, stage, assessment_date, url, api_path.
        The three metadata columns are named by position, so they rely on
        the listing attributes appearing in that order.
    """
    assessments_dict = {}
    for url in urllist:
        page = requests.get(url)
        html = BeautifulSoup(page.content, 'html.parser')
        for item in html.find_all("li", class_="gem-c-document-list__item"):
            link = item.find('a').get('href')
            record = {}
            for attribute in item.find_all('li', class_="gem-c-document-list__attribute"):
                # Drop punctuation, then split "label value" on the first space
                cleaned = re.sub(r'[^\w\s]', '', attribute.text).lower().strip('\n').strip(' ')
                parts = cleaned.split(" ", 1)
                if len(parts) < 2:
                    # A label without a value has nothing to record; skipping
                    # it avoids an IndexError that would abort the scrape
                    continue
                record[parts[0]] = parts[1]
            # Keyed by link, so a report listed twice is kept once
            assessments_dict[link] = record

    df = pd.DataFrame.from_dict(assessments_dict, 'index').reset_index()
    df.columns = ['path', 'assessment_outcome', 'stage', 'assessment_date']
    df['url'] = 'https://www.gov.uk' + df['path']
    df['api_path'] = 'https://www.gov.uk/api/content' + df['path']
    return df

In [None]:
# Scrape every listing page in `urls` into a single metadata frame
# (one HTTP request per listing page)
df = get_assessment_path_and_metadata(urls)

In [None]:
# Sanity check: the listing pages are expected to yield exactly 224 reports
assert len(df) == 224

In [None]:
# Fix just one missing assessment date
print(df[df['assessment_date'].isna()])
# Assign through a single indexer. The chained form
# df.iloc[75]['assessment_date'] = ... writes to a temporary row object and
# can leave df itself unchanged.
# NOTE(review): row 75 is a hard-coded position; confirm it is still the row
# printed above whenever the listing changes.
df.iloc[75, df.columns.get_loc('assessment_date')] = '12 September 2019'
# Parse the "day month year" strings into datetime.date objects
df['assessment_date'] = df['assessment_date'].map(
    lambda x: datetime.datetime.strptime(x, "%d %B %Y").date())

In [None]:
# Spot-check the scraped listing metadata against two known reports

assert df.loc[
    df['path'] == '/service-standard-reports/file-your-confirmation-statement-alpha-assessment-report',
    ['assessment_outcome', 'stage', 'assessment_date']
].iloc[0].tolist() == ['met', 'alpha', datetime.date(2021, 3, 23)]

assert df.loc[
    df['path'] == '/service-standard-reports/nhs-111',
    ['assessment_outcome', 'stage', 'assessment_date']
].iloc[0].tolist() == ['met', 'alpha', datetime.date(2016, 4, 28)]


In [None]:
def scrape_assessment_outcome_from_para(url):
    """
    Scrapes outcome from paragraph as per
    https://www.gov.uk/service-standard-reports/check-your-state-pension-beta-assessment

    Looks at the parent text of every <strong> element, keeps those that
    mention 'result', and reads the outcome from the second line of that text.

    Args:
        url (string): url for scraping

    Returns:
        string: a string (hopefully) indicating pass/fail, or None when no
        matching text is found or any matching text has no second line
    """
    request = requests.get(url)
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed
    page = BeautifulSoup(request.content, 'html.parser')
    parent_texts = [strong.parent.text.lower() for strong in page.find_all("strong")]
    matching = [text for text in parent_texts if re.search('result', text)]
    try:
        outcomes = [text.split('\n')[1] for text in matching]
    except IndexError:
        # One malformed match invalidates the whole page, as before
        return None
    # Occasionally we get two results, for prior assessments; we just want
    # the most recent one, which is the last on the page
    return outcomes[-1] if outcomes else None

def scrape_summary_table(url):
    """
    Scrapes meta from table on page as per
    https://www.gov.uk/service-standard-reports/apply-for-a-blue-badge-beta-assessment

    Each <tr> has punctuation removed, is lower-cased and split on newlines;
    the first piece becomes the key and the second the value.

    Args:
        url (string): url for scraping

    Returns:
        summary_table (dict): dictionary with table column1 as keys and table column2 as values.
        Rows that do not split into at least two pieces are skipped; when a
        key repeats, the last row wins.
    """
    request = requests.get(url)
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed
    page = BeautifulSoup(request.content, 'html.parser')
    summary_table = {}
    for tr in page.find_all("tr"):
        cells = re.sub(r'[^\w\s]', '', tr.text).lower().strip('\n').split('\n')
        if len(cells) < 2:
            # A single-piece row has no value column; skipping it avoids an
            # IndexError that would abort the scrape for the whole page
            continue
        summary_table[cells[0]] = cells[1]

    return summary_table


In [None]:
# Get missing pass/fail outcomes by scraping the paragraphs of each report
# page (one HTTP request per row)
df['scraped_result'] = df['url'].map(scrape_assessment_outcome_from_para)


In [None]:
# Spot-check the paragraph-scraped outcome on three known reports
assert df.loc[
    df['path'] == '/service-standard-reports/check-your-state-pension-beta-assessment',
    ['scraped_result']
].iloc[0].tolist() == ['pass']

assert df.loc[
    df['path'] == '/service-standard-reports/redundancy-payments-alpha-assessment',
    ['scraped_result']
].iloc[0].tolist() == ['met']

assert df.loc[
    df['path'] == '/service-standard-reports/civil-service-learning-course-booking-alpha-assessment',
    ['scraped_result']
].iloc[0].tolist() == ['not pass']



In [None]:
# Display the frame to inspect the scraped results so far
df

In [None]:
# Get missing metadata from the summary table on each report page
# (one HTTP request per row), then pull out the 'result' and 'stage' entries;
# dict.get leaves None where a page has no such row
df['summary_table'] = df['url'].map(scrape_summary_table)
df['st_result'] = df['summary_table'].map(lambda x: x.get('result'))
df['st_stage'] = df['summary_table'].map(lambda x: x.get('stage'))


In [None]:
# Spot-check the table-scraped outcome and stage on three known reports
assert df.loc[
    df['path'] == '/service-standard-reports/apply-for-a-blue-badge-beta-assessment',
    ['st_result', 'st_stage']
].iloc[0].tolist() == ['met', 'beta']

assert df.loc[
    df['path'] == '/service-standard-reports/electronic-data-collection-alpha-assessment',
    ['st_result', 'st_stage']
].iloc[0].tolist() == ['met', 'alpha']

assert df.loc[
    df['path'] == '/service-standard-reports/driving-theory-test-booking-alpha',
    ['st_result', 'st_stage']
].iloc[0].tolist() == ['not met', 'alpha']


In [None]:
# Coalesce the listing ("API"), scraped paragraph and scraped table
# outcomes/stages.
# Priority is API > Summary table > p
df['stage'] = df['stage'].combine_first(df['st_stage'])
# Fill gaps from the summary table first and only then from the paragraph
# scrape, so the order applied matches the priority stated above
df['assessment_outcome'] = df['assessment_outcome'].combine_first(df['st_result'])
df['assessment_outcome'] = df['assessment_outcome'].combine_first(df['scraped_result'])

In [None]:
# Count rows per raw outcome string (missing values included) to see which
# spellings need normalising
df.groupby('assessment_outcome',dropna=False).size()

In [None]:
# Lookup tables from the scraped spellings to canonical labels. Each is
# written as canonical label -> accepted spellings and flattened into
# spelling -> canonical label.
stage_map = {
    spelling: canonical
    for canonical, spellings in {
        'alpha': ('alpha', 'alpha2'),
        'alpha-reassessment': ('alpha  reassessment', 'alpha reassessment', 'alphareassessment'),
        'beta': ('beta', 'beta2'),
        'beta-reassessment': ('beta reassessment', 'betareassessment'),
        'live': ('live', 'live2'),
    }.items()
    for spelling in spellings
}

outcome_map = {
    spelling: canonical
    for canonical, spellings in {
        'met': ('met', 'pass', 'pass with conditions', 'passed'),
        'not-met': ('not met', 'not pass', 'notmet', 'not-met'),
    }.items()
    for spelling in spellings
}

In [None]:
# Normalise the variously phrased stages and outcomes; any value without an
# entry in the lookup tables becomes None
df['stage'] = df['stage'].map(stage_map.get)
df['assessment_outcome'] = df['assessment_outcome'].map(outcome_map.get)

In [None]:
# Count rows per normalised outcome, keeping missing values visible
df.groupby('assessment_outcome',dropna=False).size()

In [None]:
# Count rows per normalised stage, keeping missing values visible
df.groupby('stage',dropna=False).size()

In [None]:
def find_met_standard_id_in_p(url):
    """
    Extract line containing standard id and status from <p> elements

    Args:
        url (string): url for scraping

    Returns:
        list of str: the text of every <p> whose markup contains
        'point <digits>', in page order
    """
    request = requests.get(url)
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed
    page = BeautifulSoup(request.content, 'html.parser')
    # The pattern is matched against the paragraph's markup (str(para)),
    # while the value kept is its plain text
    return [
        para.text
        for para in page.find_all("p")
        if re.search(r'point \d+', str(para))
    ]

In [None]:
# Collect, per report, the paragraph texts that mention 'point <n>'
# (one HTTP request per row)
df['criteria-id-from-p'] = df['url'].map(find_met_standard_id_in_p)

In [None]:
def extract_id_and_status(lines):
    """
    Extract id and status from raw text string

    Args:
        lines (list of str): paragraph texts to classify

    Returns:
        dict: {'met': [...], 'not-met': [...]} holding, in input order, the
        first integer found in each line. A line counts as 'not-met' when it
        contains the substring 'not' anywhere; lines without digits are skipped.
    """
    data = {'met': [], 'not-met': []}
    for line in lines:
        number = re.search(r'\d+', line)
        if number is None:
            # No criterion id to record; skipping avoids an AttributeError
            continue
        # NOTE(review): this is a plain substring test, so words such as
        # 'another' also count as 'not' - confirm that is acceptable
        status = 'not-met' if re.search('not', line) else 'met'
        data[status].append(int(number.group()))
    return data

In [None]:
# Turn each report's matched paragraph texts into {'met': [...], 'not-met': [...]}
df['criteria'] = df['criteria-id-from-p'].map(extract_id_and_status)

In [None]:
# Unnest the criteria dict into one list column per status
df['met-criteria-from-p'] = df['criteria'].map(lambda x: x.get('met'))
df['unmet-criteria-from-p'] = df['criteria'].map(lambda x: x.get('not-met'))

In [None]:
# Spot-check the paragraph-derived criteria lists on three known reports
assert df.loc[
    df['url'] == 'https://www.gov.uk/service-standard-reports/file-your-confirmation-statement-alpha-assessment-report',
    ['met-criteria-from-p', 'unmet-criteria-from-p']
].iloc[0].tolist() == [[1,2,3,4,5,6,7,8,9,10,11,12,13,14], []]

assert df.loc[
    df['url'] == 'https://www.gov.uk/service-standard-reports/apply-for-a-deceased-persons-military-record-request-a-service-record-beta-reassessment-report',
    ['met-criteria-from-p', 'unmet-criteria-from-p']
].iloc[0].tolist() == [[10,13], []]

assert df.loc[
    df['url'] == 'https://www.gov.uk/service-standard-reports/request-a-service-record-beta-reassessment-report',
    ['met-criteria-from-p', 'unmet-criteria-from-p']
].iloc[0].tolist() == [[3,7,11,14,15,16], [10,13]]

In [None]:
def find_met_criteria_in_table(url):
    """
    Extracts individual criteria and outcome from two types of table:
    3 column table as per https://www.gov.uk/service-standard-reports/new-secure-access-alpha-reassessment
    4 column table as per https://www.gov.uk/service-standard-reports/claim-for-crown-court-defence

    Args:
        url (string): url for scraping

    Returns:
        dict: {'met': [...], 'unmet': [...]} of integer point ids in page
        order. Rows with any other number of cells, ids above 18, and ids
        whose status cell is missing or has no single string are ignored.
    """
    request = requests.get(url)
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed
    page = BeautifulSoup(request.content, 'html.parser')
    # Keyed by cells per row: (offset from the id cell to the status cell,
    # status text meaning met, status text meaning not met)
    layouts = {
        3: (2, 'met', 'not met'),
        4: (1, 'yes', 'no'),
    }
    met_criteria_from_table = []
    unmet_criteria_from_table = []
    for tr in page.find_all("tr"):
        tds = tr.find_all('td')
        layout = layouts.get(len(tds))
        if layout is None:
            continue
        offset, met_label, unmet_label = layout
        for position, td in enumerate(tds):
            if not td.string:
                continue
            try:
                point_id = int(td.string)
            except ValueError:
                continue
            # ignore points we have no data for
            if point_id > 18:
                continue
            status_position = position + offset
            if status_position >= len(tds):
                # A number in a later column has no status cell to its right
                continue
            status_text = tds[status_position].string
            if status_text is None:
                # The status cell holds nested markup or is empty
                continue
            status = status_text.lower()
            if status == met_label:
                met_criteria_from_table.append(point_id)
            elif status == unmet_label:
                unmet_criteria_from_table.append(point_id)
    return {'met': met_criteria_from_table, 'unmet': unmet_criteria_from_table}

In [None]:
# Scrape per-criterion outcomes from the tables of each report page
# (one HTTP request per row)
df['criteria_from_table'] = df['url'].map(find_met_criteria_in_table)


In [None]:
# Unnest the table criteria dicts into one list column per status
df['met-criteria-from-table'] = df['criteria_from_table'].map(lambda x: x.get('met'))
df['unmet-criteria-from-table'] = df['criteria_from_table'].map(lambda x: x.get('unmet'))

In [None]:
# Coalesce data scraped from table and from p
# Table is more reliable than p
# Empty table lists are turned into NaN so that combine_first falls back to
# the paragraph-derived list for that report.
df['met-criteria-from-table'] = df['met-criteria-from-table'].map(lambda x: np.nan if len(x)==0 else x)
df['unmet-criteria-from-table'] = df['unmet-criteria-from-table'].map(lambda x: np.nan if len(x)==0 else x)
df['all-met-criteria'] = df['met-criteria-from-table'].combine_first(df['met-criteria-from-p'])
# De-duplicate ids through a set; the resulting list order is whatever the
# set yields, not necessarily ascending
df['met'] = df['all-met-criteria'].map(lambda x: list(set(map(int,x))))
df['all-unmet-criteria'] = df['unmet-criteria-from-table'].combine_first(df['unmet-criteria-from-p'])
df['not-met'] = df['all-unmet-criteria'].map(lambda x: list(set(map(int,x))))

In [None]:
# Spot-check the coalesced met / not-met criteria lists on three known reports
assert df.loc[
    df['url'] == 'https://www.gov.uk/service-standard-reports/apprenticeship-applications',
    ['met', 'not-met']
].iloc[0].tolist() == [[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17], []]


assert df.loc[
    df['url'] == 'https://www.gov.uk/service-standard-reports/close-a-company-beta-service-assessment-report',
    ['met', 'not-met']
].iloc[0].tolist() == [[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18], [16]]

# The expected order [11,13,7] is the order produced by list(set(...))
assert df.loc[
    df['url'] == 'https://www.gov.uk/service-standard-reports/express-an-interest-in-a-repatriation-flight-alpha-assessment-report',
    ['met', 'not-met']
].iloc[0].tolist() == [[11,13,7], [1,2,3,4,5,6,8,9,10,12,14]]





In [None]:
# Map IDs to service standard as best we can: a report is flagged as using
# the current standard when it was assessed after the cutoff date and its
# highest referenced point id is below 15 (1 is assumed when it lists none)
cutoff = datetime.date(2019, 6, 30)
df['current_service_standard'] = df.apply(
    lambda row: all((
        row['assessment_date'] > cutoff,
        max(set(row['all-met-criteria']) | set(row['all-unmet-criteria']), default=1) < 15,
    )),
    axis=1,
)

In [None]:
# Display (rows, columns) of the fully enriched frame
df.shape

In [None]:
# Tidy for export: keep the report identifiers, the normalised metadata and
# the two de-duplicated criteria list columns
export = df[['path',
             'assessment_outcome',
             'stage',
             'assessment_date',
             'current_service_standard',
             'met',
             'not-met']]

In [None]:
# Reshape to long form: the 'met' / 'not-met' column names become values of
# 'status' and their lists move into 'assessment_criteria_id'
export = export.melt(id_vars=['path','assessment_outcome','stage','assessment_date','current_service_standard'],
           var_name='status',value_name='assessment_criteria_id')

In [None]:
# One row per criterion id: expand each list in 'assessment_criteria_id'
export = export.explode('assessment_criteria_id')

In [None]:
# Display the long-form frame to inspect the reshape
export

In [None]:
def criteria_to_id_mapper(criteria, current_service_standard):
    """
    Look up the slug for a criterion id in the matching standard's table.

    Args:
        criteria: criterion id used as the lookup key
        current_service_standard: flag selecting the table; id_to_crit is
            used only when this compares equal to True, otherwise
            july_2019_id_to_crit

    Returns:
        The slug for the id, or None when the chosen table has no such id
    """
    # Equality with True (rather than truthiness) is kept on purpose, so that
    # truthy non-boolean values still select the July 2019 table
    lookup = id_to_crit if current_service_standard == True else july_2019_id_to_crit
    return lookup.get(criteria)

# Pass the boolean standard flag, not the assessment date: a date never
# compares equal to True, so every row used to be labelled from the July 2019
# table regardless of which standard the report was assessed against
export['criteria-desc'] = export.apply(
    lambda x: criteria_to_id_mapper(x['assessment_criteria_id'], x['current_service_standard']), axis=1)


In [None]:
# Keep only the final path segment (the report slug). Taking the last piece
# gives the same result as before and keeps the cell safe to re-run once the
# prefix has already been stripped.
export['path'] = export['path'].map(lambda x: x.rsplit('/', 1)[-1])

In [None]:
# Display the final long-form frame before writing it out
export

In [None]:
# Write the per-criterion breakdown to the working directory
# (the frame's index is written as the first column)
export.to_csv('assessment_crit_breakdown.csv')

In [None]:
# Collapse the long-form frame back to distinct report-level metadata rows
summary = export[['path','assessment_outcome','stage','assessment_date']].drop_duplicates()

In [None]:
# Displays whether the summary has 224 rows, the count asserted for df
# earlier; this is a displayed comparison, not an enforced assert
summary.shape[0] == 224

In [None]:
# Write the report-level summary to the working directory
# (the frame's index is written as the first column)
summary.to_csv('service-assessment-summary.csv')