In [None]:
import re
from string import punctuation
import pandas as pd
import numpy as np
from tqdm import tqdm
#import spacy
#import nltk
#from nltk import word_tokenize, pos_tag
# from flair.data import Sentence
# from flair.models import SequenceTagger
# from segtok.segmenter import split_single
from datetime import datetime
import uuid

In [2]:
egdp = pd.read_csv("data/01_raw/ODS - Dental Practices and Practitioners/egdp/egdp.csv")


In [3]:
egdpprac = pd.read_csv("data/01_raw/ODS - Dental Practices and Practitioners/egdpprac/egdpprac.csv")

In [4]:
dental_practices_by_constituency = pd.read_csv("data/01_raw/NHS Dental Practices by Constituency/dental_practices_by_constituency.csv")

In [5]:
egdppracmem = pd.read_csv("data/01_raw/ODS - Dental Practices and Practitioners/egdppracmem/egdppracmem.csv")

In [6]:
def construct_dentist_list(egdp: pd.DataFrame):
    """
    Input, clean and preprocess the NHS ODS list of dentists from the 2014-16 period.
    :param egdp: DataFrame containing raw dentist data.
    :return prep_dentists: DataFrame containing cleaned and preprocessed dentist list data.
    """
    prep_dentists = egdp.copy(deep=True)

    # Column names
    cols_list = [
        "dentist_code",
        "dentist_name",
        "Unnamed: 2",
        "Unnamed: 3",
        "address_1",
        "address_2",
        "address_3",
        "address_4",
        "address_5",
        "postcode",
        "open_date",
        "close_date",
        "Unnamed: 12",
        "organisation_sub_type_code",
        "Unnamed: 14",
        "Unnamed: 15",
        "Unnamed: 16",
        "Unnamed: 17",
        "Unnamed: 18",
        "Unnamed: 19",
        "Unnamed: 20",
        "amended_record_indicator",
        "Unnamed: 22",
        "Unnamed: 23",
        "gdc_code",
        "Unnamed: 25",
        "Unnamed: 26",
    ]

    # Set to new headers
    prep_dentists.loc[-1] = prep_dentists.columns
    prep_dentists.index = prep_dentists.index + 1
    prep_dentists = prep_dentists.sort_index()
    prep_dentists.columns = cols_list

    # Remove unused columns
    for col in prep_dentists.columns:
        if "Unnamed" in prep_dentists.loc[0, col]:
            prep_dentists.loc[0, col] = np.nan
    prep_dentists = prep_dentists.dropna(axis=1, how="all")

    # Set up empty columns to hold namestrings
    for col in ["first_initial", "middle_initial", "last_name", "title"]:
        prep_dentists[col] = np.nan

    # Split dentist names into first initial, last name and title
    for index, row in tqdm(prep_dentists.iterrows(), total=prep_dentists.shape[0]):
        parts_list = row["dentist_name"].split(" ")
        row["title"] = parts_list.pop(-1)
        initials = parts_list.pop(-1)
        if len(initials) == 1:
            row["first_initial"] = initials
        elif len(initials) > 1:
            row["first_initial"] = initials[0]
            row["middle_initial"] = initials[-1]
        row["last_name"] = " ".join(parts_list)
        prep_dentists.loc[
            index, ["first_initial", "middle_initial", "last_name", "title"]
        ] = row[["first_initial", "middle_initial", "last_name", "title"]]

    return prep_dentists


In [7]:
prep_dentists = construct_dentist_list(egdp)

  prep_dentists.loc[
  prep_dentists.loc[
  prep_dentists.loc[
  prep_dentists.loc[
100%|██████████| 36455/36455 [00:28<00:00, 1258.90it/s]


In [8]:
def construct_dental_practices_list(egdpprac: pd.DataFrame): # Dental practices list
    """
    Input, clean and preprocess the NHS ODS list of dental practices from the 2014-16 period.
    :param egdpprac: DataFrame containing raw dental practice data.
    :return prep_practices: DataFrame containing cleaned and preprocessed dental practice list data.
    """
    prep_practices = egdpprac.copy(deep=True)

    # Column names
    cols_list = [
        "practice_code",
        "practice_name",
        "national_grouping",
        "high_level_health_geography",
        "address_1",
        "address_2",
        "address_3",
        "address_4",
        "address_5",
        "postcode",
        "open_date",
        "close_date",
        "status_code",
        "organisation_sub_type_code",
        "parent_organisation_code",
        "join_parent_date",
        "left_parent_date",
        "contact_telephone_number",
        "Unnamed: 18",
        "Unnamed: 19",
        "Unnamed: 20",
        "amended_report_indicator",
    ]

    # Set to new headers
    prep_practices.loc[-1] = prep_practices.columns
    prep_practices.index = prep_practices.index + 1
    prep_practices = prep_practices.sort_index()
    prep_practices.columns = cols_list

    # Remove unused columns
    for col in prep_practices.columns:
        if "Unnamed" in prep_practices.loc[0, col]:
            prep_practices.loc[0, col] = np.nan
    prep_practices = prep_practices.dropna(axis=1, how="all")

    return prep_practices


In [9]:
prep_practices = construct_dental_practices_list(egdpprac)

In [10]:
def construct_nhs_dental_practices_list(dental_practices_by_constituency): #nhs dental practices
    """
    Input, clean and preprocess the list of dental practices serving NHS customers sourced from the UK House of Commons
    Library dental practices dashboard, from November 2023. Note that this dashboard primarily contains NHS dental
    practices with only some private-only practices, so some dental practices in the labs data may not appear here and
    will require reference to prep_practices from 2014-16 outputted by construct_dental_practices_list.
    :param dental_practices_by_constituency: DataFrame containing raw NHS (and some private) dental practices data from
    November 2023.
    :return prep_nhs_practices: DataFrame containing cleaned and preprocessed NHS dental practices data from November
    2023.
    """
    prep_nhs_practices = dental_practices_by_constituency.copy(deep=True)

    # Drop duplicated index column and extraneous constituency-related columns
    prep_nhs_practices = prep_nhs_practices.drop(
        columns=["Unnamed: 0", "pcon", "con_name"]
    )

    # Rename columns
    prep_nhs_practices = prep_nhs_practices.rename(
        columns={
            "prac_code": "organisation_code",
            "prac_name": "practice_name",
            "icb": "high_level_health_geography",
            "address1": "address_1",
            "address2": "address_2",
            "address3": "address_3",
            "address4": "address_4",
            "address5": "address_5",
        }
    )

    return prep_nhs_practices


In [11]:
prep_nhs_practices = construct_nhs_dental_practices_list(dental_practices_by_constituency)


In [12]:
def construct_dentist_practice_mapping(egdppracmem: pd.DataFrame): #raw data file map dentist to practice
    """
    Input, clean and preprocess the NHS ODS list of dentists mapped to dental practices they are a part of from the
    2014-16 period.
    :param egdppracmem: DataFrame containing the raw list of dentists mapped to dental practices.
    :return prep_dent_prac_map: DataFrame containing cleaned and preprocessed dentist mapped to dental practice data.
    """
    prep_dent_prac_map = egdppracmem.copy(deep=True)

    # Column names
    cols_list = [
        "dentist_code",
        "practice_code",
        "practice_type",
        "join_practice_date",
        "left_practice_date",
        "amended_record_indicator",
    ]

    # Set to new headers
    prep_dent_prac_map.loc[-1] = prep_dent_prac_map.columns
    prep_dent_prac_map.index = prep_dent_prac_map.index + 1
    prep_dent_prac_map = prep_dent_prac_map.sort_index()
    prep_dent_prac_map.columns = cols_list

    # Remove unused columns
    for col in prep_dent_prac_map.columns:
        if "Unnamed" in prep_dent_prac_map.loc[0, col]:
            prep_dent_prac_map.loc[0, col] = np.nan
    prep_dent_prac_map = prep_dent_prac_map.dropna(axis=1, how="all")

    return prep_dent_prac_map


In [13]:
prep_dent_prac_map = construct_dentist_practice_mapping(egdppracmem)

In [110]:
als_sku_greatlab_data.columns

Index(['customer_product_cube_uuid', 'year_month', 'system_source', 'als_lab',
       'practice_code', 'practice_name', 'customer_id', 'customer_name',
       'product_code', 'product_description', 'quantity', 'net_sales',
       'nhs_or_private', 'net_unit_price'],
      dtype='object')

In [112]:
als_sku_greatlab_data

Unnamed: 0,customer_product_cube_uuid,year_month,system_source,als_lab,practice_code,practice_name,customer_id,customer_name,product_code,product_description,quantity,net_sales,nhs_or_private,net_unit_price
0,4583b128-900f-4c75-8221-02933e728519,2024-06-01,Great Lab,APlus,,Vita Dental Spa,VITA Dental Spa,Deborah Park,MO0511028G,Models,1,0.00,,0.00
1,971b40cb-047e-4297-983f-78b267d5f85e,2024-06-01,Great Lab,APlus,,Vita Dental Spa,VITA Dental Spa,Deborah Park,PR0610715P,Implant Special Tray,1,18.50,,18.50
2,42673a55-0864-4eb8-a8c5-88c730038e07,2024-06-01,Great Lab,APlus,,Vita Dental Spa,VITA Dental Spa,Deborah Park,PR0102031Z,Basic Bite Block,1,0.00,,0.00
3,7046244a-5026-4efa-8428-7f86fe6a1b9e,2024-06-01,Great Lab,APlus,,Vita Dental Spa,VITA Dental Spa,Deborah Park,MO0612701G,Resin Working Model,8,118.80,,14.85
4,fd961364-b83c-49ef-b98d-180d13034bcc,2024-06-01,Great Lab,APlus,,Vita Dental Spa,VITA Dental Spa,Deborah Park,MO0511001G,Resin Study Models,3,89.10,,29.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84932,3ed3e2fb-87f4-4dbb-8faf-67b4d4c090db,2024-12-01,Great Lab,Waterside,,Blyth Health Centre,KEdwardsBlyth,K Edwards (Blyth HC),OR0403829B,Essix Retainer,1,29.70,,29.70
84933,4508d11a-48d5-4712-a08c-8dc9e7d9165e,2024-12-01,Great Lab,Waterside,,Frederick Street Dental,MMcGuireFrederick,Mark McGuire (Frederick Street Dental),SHIP-07,Delivery Fee- Driver,2,4.00,,2.00
84934,bab017f8-a3cc-49a1-a218-231bbc9353a9,2024-12-01,Great Lab,Waterside,,Frederick Street Dental,MMcGuireFrederick,Mark McGuire (Frederick Street Dental),PR0610715B,Special Trays,4,65.40,,16.35
84935,025eba14-bf6d-4735-899e-be5412696ff8,2024-12-01,Great Lab,Waterside,,Frederick Street Dental,MMcGuireFrederick,Mark McGuire (Frederick Street Dental),PR0504402B,Full/Full,1,115.45,,115.45


In [119]:
# Get Ashford customers list
als_sku_greatlab_data["customer_id"].fillna("Unknown", inplace=True)

als_sku_greatlab_data.groupby(["customer_id","customer_name", "practice_name","als_lab","system_source"]
).size().reset_index().drop(columns=0)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  als_sku_greatlab_data["customer_id"].fillna("Unknown", inplace=True)


Unnamed: 0,customer_id,customer_name,practice_name,als_lab,system_source
0,ONYEL,Dr Ejike Onyelobi,Barton Dental DO NOT USE,Central Dental Laboratory,Great Lab
1,10.0,Mr Peter Reece,Treetops Dental Surgery,Dental Excellence,Great Lab
2,100.0,Vis Pather,Treetops Dental Surgery,Dental Excellence,Great Lab
3,101.0,Mr James Russell,New Park House Dental,Dental Excellence,Great Lab
4,107.0,Mr Karun Khanna,Church Stretton Dental Practice,Dental Excellence,Great Lab
...,...,...,...,...,...
3991,lMurphyWallsend652,E Robb (My Dentist Wallsend),My Dentist Wallsend,Waterside,Great Lab
3992,nOODHIT,Dr Noodeh,Hitchin Dental Practice,Central Dental Laboratory,Great Lab
3993,obex,Emma Grey,Obex Dental,Central Dental Laboratory,Great Lab
3994,rai001,Dr R Rai,Damira Hillcrest Dental Practice,Casterbridge,Great Lab


In [118]:
print(als_sku_greatlab_data.isnull().sum())


customer_product_cube_uuid        0
year_month                        0
system_source                     0
als_lab                           0
practice_code                 84937
practice_name                     0
customer_id                    1095
customer_name                     0
product_code                      0
product_description               0
quantity                          0
net_sales                         0
nhs_or_private                84937
net_unit_price                    0
dtype: int64


In [109]:
als_sku_transactor_datacube = pd.read_csv("data/processed_product_mapping/als_sku_transactor_datacube.csv")
als_sku_evident_datacube = pd.read_csv("data/processed_product_mapping/als_sku_evident_datacube.csv")
als_sku_labtrac_orders_data = pd.read_csv("data/processed_product_mapping/als_sku_labtrac_orders_data.csv")
als_sku_leca_orders_data = pd.read_csv("data/processed_product_mapping/als_sku_leca_orders_data.csv")
als_sku_ashford_orders_data = pd.read_csv("data/processed_product_mapping/als_sku_ashford_orders_data.csv")
als_sku_greatlab_data = pd.read_csv("data/pre_processed_combined/sales/combined_great_lab.csv")

  als_sku_evident_datacube = pd.read_csv("data/processed_product_mapping/als_sku_evident_datacube.csv")
  als_sku_labtrac_orders_data = pd.read_csv("data/processed_product_mapping/als_sku_labtrac_orders_data.csv")
  als_sku_leca_orders_data = pd.read_csv("data/processed_product_mapping/als_sku_leca_orders_data.csv")
  als_sku_ashford_orders_data = pd.read_csv("data/processed_product_mapping/als_sku_ashford_orders_data.csv")


In [17]:
def _is_org(row: pd.Series, parts_list: list = None):
    """

    :param row:
    :param parts_list:
    :return:
    """
    return False
    # org_flag = False
    #
    # # Break each name into constituent parts, splitting on spaces
    # parts_list = row["customer_name"].split(" ")
    #
    # # All names containing "ltd" are likely to be orgs
    # if "LTD" in parts_list:
    #     org_flag = True
    #
    #
    #
    # if org_flag:
    #     row["person_or_org"] = "Org"
    #     row["org_name"] = row["customer_name"]
    #
    # return org_flag, row


In [19]:
def _extract_person_name(row: pd.Series):
    """
    Extract the name of a person from a given names table row that is known the contain a person name.
    :param row: Single row of a names table, in the form of a pd.Series object, with index "first_name", "middle_name",
    "last_name", "title", "person_or_org", "org_name".
    :return row: Single row of a names table, in the form of a pd.Series object, with index "first_name", "middle_name",
    "last_name", "title", "person_or_org", "org_name", with the name extracted and entered in the correct locations.
    """
    # Break each name into constituent parts, splitting on spaces
    parts_list = row["customer_name"].split(" ")

    # Remove any parts that come after "-" or "HOSPITAL"
    for word in ["-", "HOSPITAL"]:
        if word in parts_list:
            index = parts_list.index(word)
            if type(index) == int:
                parts_list = parts_list[:index]
            elif type(index) == list:
                parts_list = parts_list[: index[0]]

    # Remove any parts that contain numbers or are empty
    parts_list = [
        part
        for part in parts_list
        if re.search("\d", part) is None  # Check there are no numbers
        and part != ""  # Check the part is not empty
        and not any(
            char in punctuation for char in part
        )  # Check if the part contains punctuation
    ]

    # Remove everything after a part that contains punctuation
    punc_parts = [
        part for part in parts_list if any(char in punctuation for char in part)
    ]
    if len(punc_parts) > 0:
        index = parts_list.index(punc_parts[0])
        if type(index) == int:
            parts_list = parts_list[:index]
        elif type(index) == list:
            parts_list = parts_list[: index[0]]

    # Person flow: Log title, then take forename (first non-title part) and surname (last non-title part), and then
    # extract first letter of firstname
    parts_list = [part.upper() for part in parts_list]

    if len(parts_list) > 0:
        if parts_list[0] in ["DR", "MR", "MISS", "MS", "MRS"]:
            row["title"] = parts_list[0]
            row["first_name"] = parts_list[1] if len(parts_list) > 1 else np.nan
            row["last_name"] = parts_list[-1] if len(parts_list) > 2 else np.nan
            row["middle_name"] = (
                " ".join(parts_list[2:-1]) if len(parts_list) > 3 else np.nan
            )
        else:
            row["first_name"] = parts_list[0] if len(parts_list) > 0 else np.nan
            row["last_name"] = parts_list[-1] if len(parts_list) > 1 else np.nan
            row["middle_name"] = (
                " ".join(parts_list[1:-1]) if len(parts_list) > 2 else np.nan
            )

        row["person_or_org"] = "Person"

    # Correct a potential mistake for certain Dent8 customers
    if row["last_name"] == "IDH":
        row["last_name"] = row["middle_name"]
        row["middle_name"] = np.nan

    return row


In [120]:
def extract_labs_customers_list(
    als_sku_transactor_datacube,
    als_sku_evident_datacube,
    als_sku_labtrac_orders_data,
    als_sku_leca_orders_data,
    als_sku_ashford_orders_data,
    als_sku_greatlab_data
):
    """
    Extract all unique lab-level customer-practice combinations from all labs across Transactor, Evident and Labtrac,
    and store in a single table.
    :param als_sku_transactor_datacube: DataFrame containing Transactor data.
    :param als_sku_evident_datacube: DataFrame containing Evident data.
    :param als_sku_labtrac_orders_data: DataFrame containing Labtrac data.
    :param als_sku_leca_orders_data: DataFrame containing Leca data.
    :param als_sku_ashford_orders_data: DataFrame containing Leca data.
    :return all_labs_customers_list: DataFrame containing lab-level customer-practice combinations across all labs.
    """
    # Get Transactor customers list
    prep_transactor_customer_list = (
        als_sku_transactor_datacube.groupby(
            [
                "customer_id",
                "customer_name",
                "ship_name",
                "ship_address",
                "als_lab",
                "system_source",
            ]
        )
        .size()
        .reset_index()
        .drop(columns=0)
    )
    prep_transactor_customer_list = prep_transactor_customer_list.merge(
        als_sku_transactor_datacube[["ship_address", "ship_postcode"]],
        on="ship_address",
        how="left",
    )
    prep_transactor_customer_list = prep_transactor_customer_list.rename(
        columns={
            "ship_name": "practice_name",
            "ship_postcode": "practice_address_postcode",
        }
    )
    prep_transactor_customer_list = prep_transactor_customer_list.drop(
        columns=["ship_address"]
    )
    prep_transactor_customer_list = prep_transactor_customer_list.drop_duplicates(
        subset=["customer_id", "customer_name"]
    ).reset_index(drop=True)

    # Get Evident customers list
    prep_evident_customer_list = (
        als_sku_evident_datacube.groupby(
            [
                "customer_name",
                "practice_name",
                "als_lab",
                "system_source",
            ]
        )
        .size()
        .reset_index()
        .drop(columns=0)
    )
    prep_evident_customer_list["practice_address_postcode"] = "Unknown"
    prep_evident_customer_list["customer_id"] = "Unknown"
    prep_evident_customer_list = prep_evident_customer_list[
        [
            "customer_id",
            "customer_name",
            "practice_name",
            "practice_address_postcode",
            "als_lab",
            "system_source",
        ]
    ]

    # Get Labtrac customers list
    prep_labtrac_customer_list = (
        als_sku_labtrac_orders_data.groupby(
            [
                "customer_id",
                "customer_name",
                "als_lab",
                "system_source",
            ]
        )
        .size()
        .reset_index()
        .drop(columns=0)
    )
    prep_labtrac_customer_list = prep_labtrac_customer_list.merge(
        als_sku_labtrac_orders_data[
            [
                "customer_id",
                "customer_name",
                "practice_name",
                "practice_address_postcode",
            ]
        ],
        on=["customer_id", "customer_name"],
        how="left",
    )
    prep_labtrac_customer_list = prep_labtrac_customer_list.drop_duplicates(
        subset=["customer_id", "customer_name"]
    ).reset_index(drop=True)

    # Get Leca customers list
    als_sku_leca_orders_data["practice_address_postcode"] = als_sku_leca_orders_data[
        "practice_code"
    ]
    prep_leca_customer_list = (
        als_sku_leca_orders_data.groupby(
            [
                "customer_name",
                "practice_name",
                "practice_code",
                "als_lab",
                "system_source",
            ]
        )
        .size()
        .reset_index()
        .drop(columns=0)
    )
    prep_leca_customer_list = prep_leca_customer_list.rename(
        columns={"practice_code": "practice_address_postcode"}
    )
    prep_leca_customer_list = prep_leca_customer_list.merge(
        als_sku_leca_orders_data[["customer_id", "customer_name"]], on="customer_name"
    )
    prep_leca_customer_list = prep_leca_customer_list.drop_duplicates(
        subset=["customer_id", "customer_name"]
    )

    # Get Ashford customers list
    prep_ashford_customer_list = (
        als_sku_ashford_orders_data.groupby(
            [
                "customer_id",
                "customer_name",
                "practice_name",
                "als_lab",
                "system_source",
            ]
        )
        .size()
        .reset_index()
        .drop(columns=0)
    )
    prep_ashford_customer_list["practice_address_postcode"] = "Unknown"

    # Get Great Lab customers list
    als_sku_greatlab_data["customer_id"].fillna("Unknown", inplace=True)

    prep_greatlab_customer_list = als_sku_greatlab_data.groupby(["customer_id","customer_name", "practice_name","als_lab","system_source"]
    ).size().reset_index().drop(columns=0)

    # Combine the customers list across Transactor, Evident and Labtrac into the same table
    all_labs_customers_list = pd.concat(
        [
            prep_transactor_customer_list,
            prep_evident_customer_list,
            prep_labtrac_customer_list,
            prep_leca_customer_list,
            prep_ashford_customer_list,
            prep_greatlab_customer_list
        ]
    )

    # Get list of only customer names and IDs for each ALS lab in one table
    names = (
        all_labs_customers_list.groupby(by=["customer_id", "customer_name", "als_lab"])
        .size()
        .reset_index()
        .rename(columns={0: "count"})
    )

    for col in [
        "first_name",
        "middle_name",
        "last_name",
        "title",
        "person_or_org",
        "org_name",
    ]:
        names[col] = np.nan

    # For loop through all the names
    for index, row in tqdm(names.iterrows(), total=names.shape[0]):
        # Check if name is an org name and extract to org name columns if so
        if _is_org(row=row):
            row["person_or_org"] = "Organisation"
            row["org_name"] = row["customer_name"]
        # If name is not an organisation name, extract the person name
        else:
            # Extract the person name
            names.loc[
                index,
                [
                    "first_name",
                    "middle_name",
                    "last_name",
                    "title",
                    "person_or_org",
                    "org_name",
                ],
            ] = _extract_person_name(row=row)[
                [
                    "first_name",
                    "middle_name",
                    "last_name",
                    "title",
                    "person_or_org",
                    "org_name",
                ]
            ]

    all_labs_customers_list = all_labs_customers_list.merge(
        names[
            [
                "customer_id",
                "customer_name",
                "als_lab",
                "first_name",
                "middle_name",
                "last_name",
                "title",
                "person_or_org",
                "org_name",
            ]
        ],
        on=["customer_id", "customer_name", "als_lab"],
        how="left",
    )

    return all_labs_customers_list


In [122]:
all_labs_customers_list = extract_labs_customers_list(
    als_sku_transactor_datacube,
    als_sku_evident_datacube,
    als_sku_labtrac_orders_data,
    als_sku_leca_orders_data,
    als_sku_ashford_orders_data,
    als_sku_greatlab_data
)

  names.loc[
  names.loc[
  names.loc[
  names.loc[
  names.loc[
100%|██████████| 24879/24879 [00:27<00:00, 903.08it/s] 


In [125]:
def match_dentists_local(
    all_labs_customers_list: pd.DataFrame,
    prep_dentists: pd.DataFrame,
    # prep_practices: pd.DataFrame,
    # prep_nhs_practices: pd.DataFrame,
    # prep_dent_prac_map: pd.DataFrame,
):
    """
    Map ALS customers recorded in the lab-level data to dentist codes recorded in the NHS ODS list of dentists from the
    2014-16 period, for customers where a matching dentist record can be found.
    :param all_labs_customers_list: DataFrame containing lab-level customer-practice combinations across all labs.
    :param prep_dentists: DataFrame containing cleaned and preprocessed dentist list data.
    :return all_labs_customers_dentists_local_matched: DataFrame containing lab-level customer-practice combinations
    across all labs.
    """
    customers = all_labs_customers_list
    for col in ["dentist_code", "gdc_code", "dentist_name"]:
        customers[col] = np.nan

    for index, row in tqdm(customers.iterrows(), total=customers.shape[0]):
        customer_title = row["title"]
        customer_last_name = row["last_name"]
        if not pd.isna(row["first_name"]):
            customer_first_initial = row["first_name"][0]
        else:
            customer_first_initial = np.nan
        if not pd.isna(row["middle_name"]):
            customer_middle_initial = row["middle_name"][0]
        else:
            customer_middle_initial = np.nan
        if customer_last_name != np.nan and customer_first_initial != np.nan:
            search_return = prep_dentists.loc[
                (prep_dentists["first_initial"] == customer_first_initial)
                & (prep_dentists["last_name"] == customer_last_name),
                :,
            ].reset_index(drop=True)
            search_return_len = search_return.shape[0]
            
            if search_return_len == 0:
                continue
            elif search_return_len == 1:
                row["dentist_code"] = search_return["dentist_code"][0]
                row["gdc_code"] = search_return["gdc_code"][0]
                row["dentist_name"] = search_return["dentist_name"][0]
            elif search_return_len > 1:
                if customer_title != np.nan:
                    search_return_title = search_return.loc[
                        search_return["title"] == customer_title
                    ].reset_index(drop=True)
                    search_return_title_len = search_return_title.shape[0]
                    if search_return_title_len == 1:
                        row["dentist_code"] = search_return_title["dentist_code"][0]
                        row["gdc_code"] = search_return_title["gdc_code"][0]
                        row["dentist_name"] = search_return_title["dentist_name"][0]
                    elif search_return_title_len > 1:
                        search_return_title_middle_initial = search_return_title.loc[
                            search_return_title["middle_initial"]
                            == customer_middle_initial
                        ].reset_index(drop=True)
                        search_return_title_middle_initial_len = (
                            search_return_title_middle_initial.shape[0]
                        )
                        if search_return_title_middle_initial_len == 1:
                            row["dentist_code"] = search_return_title_middle_initial[
                                "dentist_code"
                            ][0]
                            row["gdc_code"] = search_return_title_middle_initial[
                                "gdc_code"
                            ][0]
                            row["dentist_name"] = search_return_title_middle_initial[
                                "dentist_name"
                            ][0]
                        elif search_return_title_middle_initial_len > 1:
                            row["dentist_code"] = search_return_title_middle_initial[
                                "dentist_code"
                            ][0]
                            row["gdc_code"] = search_return_title_middle_initial[
                                "gdc_code"
                            ][0]
                            row["dentist_name"] = search_return_title_middle_initial[
                                "dentist_name"
                            ][0]
                        elif search_return_title_middle_initial_len == 0:
                            row["dentist_code"] = search_return_title["dentist_code"][0]
                            row["gdc_code"] = search_return_title["gdc_code"][0]
                            row["dentist_name"] = search_return_title["dentist_name"][0]
                else:
                    search_return_middle_initial = search_return.loc[
                        (search_return["middle_initial"] == customer_middle_initial)
                    ].reset_index(drop=True)
                    search_return_middle_initial_len = (
                        search_return_middle_initial.shape[0]
                    )
                    if search_return_middle_initial_len == 1:
                        row["dentist_code"] = search_return_middle_initial[
                            "dentist_code"
                        ][0]
                        row["gdc_code"] = search_return_middle_initial["gdc_code"][0]
                        row["dentist_name"] = search_return_middle_initial[
                            "dentist_name"
                        ][0]
                    elif search_return_middle_initial_len > 1:
                        row["dentist_code"] = search_return_middle_initial[
                            "dentist_code"
                        ][0]
                        row["gdc_code"] = search_return_middle_initial["gdc_code"][0]
                        row["dentist_name"] = search_return_middle_initial[
                            "dentist_name"
                        ][0]
                    elif search_return_middle_initial_len == 0:
                        row["dentist_code"] = search_return["dentist_code"][0]
                        row["gdc_code"] = search_return["gdc_code"][0]
                        row["dentist_name"] = search_return["dentist_name"][0]
        customers.loc[index, ["dentist_code", "gdc_code", "dentist_name"]] = row[
            ["dentist_code", "gdc_code", "dentist_name"]
        ]

    # TODO: Incorporate dental practice checking into the customer-dentist matching flow

    all_labs_customers_dentists_local_matched = customers

    return all_labs_customers_dentists_local_matched


In [124]:
all_labs_customers_dentists_local_matched = match_dentists_local(all_labs_customers_list,prep_dentists)

  customers.loc[index, ["dentist_code", "gdc_code", "dentist_name"]] = row[
  customers.loc[index, ["dentist_code", "gdc_code", "dentist_name"]] = row[
100%|██████████| 26823/26823 [01:31<00:00, 291.90it/s]


In [126]:
all_labs_customers_dentists_local_matched.to_csv("data/processed_customer_mapping/all_labs_customers_dentists_local_matched.csv",
                                                 index=False)

In [127]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

In [133]:
def match_dentists_scraped(all_labs_customers_dentists_local_matched):
    """

    :param all_labs_customers_dentists_local_matched:
    :return:
    """
    customers = all_labs_customers_dentists_local_matched

    # Load in all customer GDC numbers that were already previously scraped
    # previous_scraped_customers = pd.read_csv("data/processed_customer_mapping/scrap_data_latest.csv") or
    previous_scraped_customers = pd.read_csv("data/processed_customer_mapping/all_labs_customers_dentists_local_scraped_matched.csv")
    previous_scraped_customers = previous_scraped_customers.loc[
        previous_scraped_customers["gdc_code"].isna() == False
    ]

    # Add in already-scraped customer GDC numbers
    customers = customers.merge(
        previous_scraped_customers,
        on=[
            "customer_id",
            "customer_name",
            "practice_name",
            "als_lab",
            "system_source",
            "practice_address_postcode",
            "first_name",
            "middle_name",
            "last_name",
            "title",
            "person_or_org",
            "org_name",
        ],
        how="left",
        suffixes=("_x", "_y"),
    )

    # Combine GDC code columns
    customers["gdc_code_x"] = customers["gdc_code_x"].fillna(customers["gdc_code_y"])
    customers["dentist_name_x"] = customers["dentist_name_x"].fillna(
        customers["dentist_name_y"]
    )

    # Remove extraneous merged columns and rename GDC code column
    customers = customers.rename(
        columns={
            "gdc_code_x": "gdc_code",
            "dentist_code_x": "dentist_code",
            "dentist_name_x": "dentist_name",
        }
    )
    for col in customers.columns:
        if "_y" in col or "_x" in col:
            customers = customers.drop(columns=col)
    
    if "is_org" in customers.columns:
        customers = customers.drop(
            columns=[
                "registered_on",
                "registrant_type",
                "current_registration_period",
                "qualifications",
                "is_org",
            ]
        )
    else:
        customers = customers.drop(
            columns=[
                "registered_on",
                "registrant_type",
                "current_registration_period",
                "qualifications",
            ]
        )

    # Get all customer rows where a local match could not be found
    scraped_customers = customers.loc[
        customers["gdc_code"].isna() == True, :
    ].reset_index(drop=True)
    for col in [
        "registrant_type",
        "registered_on",
        "current_registration_period",
        "qualifications",
        "is_org",
    ]:
        scraped_customers[col] = np.nan

    # Flag certain rows as likely be orgs
    scraped_customers["is_org"] = False
    search_term = "education|denture|dental|ltd|the|shop|lab|labs|laboratories|laboratory|pct|dentist|limited|park|health|trust|orthodontics|studio|hmp|teeth|clinic|hospital|solution|solutions|dentures|dentist|studios".upper()
    for col in ["first_name", "middle_name", "last_name"]:
        scraped_customers[col] = scraped_customers[col].fillna("Unknown")
        scraped_customers.loc[
            scraped_customers[col].str.contains(search_term), "is_org"
        ] = True
        scraped_customers.loc[scraped_customers[col] == "Unknown", col] = np.nan
    scraped_customers = scraped_customers.loc[
        scraped_customers["is_org"] != True
    ].reset_index(drop=True)

    # scraped_customers = scraped_customers.iloc[:50]

    # Disable Image Loading
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--blink-settings=imagesEnabled=false")

    # Add Options to Webdriver
    driver = webdriver.Chrome(options=chrome_options)

    # Open GDC search page
    driver.get("https://olr.gdc-uk.org/SearchRegister")

    # Click the deny cookies button
    driver.find_element(By.CLASS_NAME, "cc-btn.cc-deny").click()

    # Iterate through each customer row and scrape dentist data from General Dentist Council webste
    total_index = 0
    for index, row in tqdm(
        scraped_customers.iterrows(), total=scraped_customers.shape[0]
    ):
        title = row["title"]
        first_name = row["first_name"]
        middle_name = row["middle_name"]
        last_name = row["last_name"]

        # Ensure that there is a first name and a last name to search for
        if pd.isna(first_name) and pd.isna(last_name):
            continue

        # Open GDC search page
        driver.get("https://olr.gdc-uk.org/SearchRegister")

        # # Click on the "All Registers" search option
        driver.execute_script("window.scrollTo(0, 1000)")
        driver.find_element(By.XPATH, "//*[@for='all']").click()

        # Enter first name
        if not pd.isna(first_name):
            driver.find_element(By.ID, "FirstName").send_keys(first_name)

        # Enter last name
        if not pd.isna(last_name):
            driver.find_element(By.ID, "Surname").send_keys(last_name)

        #print(f'Dentist name - [{first_name} {last_name}]')
        # Click on the "Include Erased registrants" search option                
        element = driver.find_element(By.ID, "IncludeErasedRegistrants")
        driver.execute_script("arguments[0].scrollIntoView(true);", element)
        driver.execute_script("arguments[0].click();", element)       
        # Locate the element
        # element = WebDriverWait(driver, 10).until(
        #     EC.presence_of_element_located((By.ID, "IncludeErasedRegistrants"))
        # )

        # # Scroll to the element
        # driver.execute_script("arguments[0].scrollIntoView(true);", element)

        # #driver.execute_script("document.querySelector('.card-body').style.display='none';")
        # # Click using JavaScript
        # driver.execute_script("arguments[0].click();", element)

        # Click to search
        element = driver.find_element(By.CLASS_NAME, "btn.btn-primary")
        driver.execute_script("arguments[0].click();", element)

        # Find all td tags in the table
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        results_table = soup.find_all("td")
        # first_link = soup.find_all("td")[0].contents[1].attrs["href"]

        # Return first result if there are results provided in by the search
        if len(results_table) > 0:
            scraped_first_name = results_table[1].contents[0]
            scraped_last_name = results_table[0].text.strip("\n")
            row["dentist_name"] = f"{scraped_first_name} {scraped_last_name}"
            row["gdc_code"] = results_table[2].contents[0]
            row["registrant_type"] = results_table[4].contents[0]

            scraped_customers.loc[
                index, ["dentist_name", "gdc_code", "registrant_type"]
            ] = row[["dentist_name", "gdc_code", "registrant_type"]]

        total_index += 1
        if total_index % 100 == 0:
            scraped_customers.to_csv(f"data/processed_customer_mapping/scrapped_data/{str(datetime.now()).replace(':', '')}"
                + ".csv",
                index=False,
            )

    customers = customers.merge(
        scraped_customers[
            [
                "customer_id",
                "customer_name",
                "practice_name",
                "als_lab",
                "dentist_code",
                "gdc_code",
                "dentist_name",
                "registrant_type",
                "registered_on",
                "current_registration_period",
                "qualifications",
            ]
        ],
        on=[
            "customer_id",
            "customer_name",
            "practice_name",
            "als_lab",
        ],
        how="left",
    )
    for cols in [
        ["dentist_code_x", "dentist_code_y"],
        ["gdc_code_x", "gdc_code_y"],
        ["dentist_name_x", "dentist_name_y"],
    ]:
        customers[cols[0]] = customers[cols[0]].fillna(customers[cols[1]])
    customers = customers.drop(
        columns=["dentist_code_y", "gdc_code_y", "dentist_name_y"]
    )
    customers = customers.rename(
        columns={
            "dentist_code_x": "dentist_code",
            "gdc_code_x": "gdc_code",
            "dentist_name_x": "dentist_name",
        }
    )

    all_labs_customers_dentists_local_scraped_matched = customers

    return all_labs_customers_dentists_local_scraped_matched


In [134]:
all_labs_customers_dentists_local_scraped_matched = match_dentists_scraped(all_labs_customers_dentists_local_matched)

  scraped_customers.loc[
  scraped_customers.loc[
100%|██████████| 9543/9543 [2:09:43<00:00,  1.23it/s]  


In [135]:
all_labs_customers_dentists_local_scraped_matched.to_csv("data/processed_customer_mapping/all_labs_customers_dentists_local_scraped_matched.csv",
                                                 index=False)

In [150]:
def join_customer_matched_data(
    all_labs_customers_dentists_local_scraped_matched: pd.DataFrame,
    als_sku_transactor_datacube: pd.DataFrame,
    als_sku_evident_datacube: pd.DataFrame,
    als_sku_labtrac_orders_data: pd.DataFrame,
    als_sku_leca_orders_data: pd.DataFrame,
    als_sku_ashford_orders_data: pd.DataFrame,
    als_sku_greatlab_data: pd.DataFrame
):
    """

    :param all_labs_customers_dentists_local_scraped_matched:
    :param als_sku_transactor_datacube:
    :param als_sku_evident_datacube:
    :param als_sku_labtrac_orders_data:
    :return: als_final_datacube:
    """
    # Group Labtrac data by year-month and lab-level customer details
    labtrac = als_sku_labtrac_orders_data.copy(deep=True)
    labtrac["order_invoiced_date"] = pd.to_datetime(labtrac["order_invoiced_date"])
    labtrac["order_invoiced_date_year"] = labtrac["order_invoiced_date"].dt.year
    labtrac["order_invoiced_date_month"] = labtrac["order_invoiced_date"].dt.month

    labtrac = labtrac.drop(columns="order_invoiced_date")
    labtrac_datacube = (
        labtrac.groupby(
            by=[
                "order_invoiced_date_year",
                "order_invoiced_date_month",
                "system_source",
                "als_lab",
                "practice_name",
                "practice_address_road",
                "practice_address_town",
                "practice_address_postcode",
                "customer_id",
                "customer_name",
                "product_code",
                "product_description",
                "order_status",
                "nhs_private_tag",
                "als_sku_code",
                "als_product_category",
                "als_product_sub_category",
                "als_product_name",
                "als_product_material",
                "als_product_standard",
                "als_product_type",
            ],
            dropna=False,
        )[
            [
                "quantity",
                "net_sales",
            ]
        ]
        .sum()
        .reset_index(drop=False)
    )

    labtrac_datacube["day"] = 1
    for col in ["order_invoiced_date_year", "order_invoiced_date_month", "day"]:
        labtrac_datacube[col] = labtrac_datacube[col].astype(str)
    labtrac_datacube["year_month"] = (
        labtrac_datacube["order_invoiced_date_year"]
        + "-"
        + labtrac_datacube["order_invoiced_date_month"]
        + "-"
        + labtrac_datacube["day"]
    )
    labtrac_datacube["year_month"] = pd.to_datetime(labtrac_datacube["year_month"])

    labtrac_datacube["practice_address"] = (
        labtrac_datacube["practice_name"].str.strip()
        + ", "
        + labtrac_datacube["practice_address_road"].str.strip()
        + ", "
        + labtrac_datacube["practice_address_town"].str.strip()
        + ", "
        + labtrac_datacube["practice_address_postcode"].str.strip()
    )

    labtrac_datacube["practice_address"] = labtrac_datacube["practice_address"].str.replace(", nan", "")

    labtrac_datacube = labtrac_datacube.drop(
        columns=[
            "order_invoiced_date_year",
            "order_invoiced_date_month",
            "day",
            "practice_address_road",
            "practice_address_town",
            "order_status",
            "als_product_standard",
        ]
    )

    labtrac_datacube["practice_id"] = np.nan

    # Remove extraneous columns from Transactor data and re-group by month
    transactor_datacube = als_sku_transactor_datacube.copy(deep=True)
    transactor_datacube = (
        transactor_datacube.drop(
            columns=[
                "original_lab_price_band",
                "net_unit_price",
                "discounted_unit_price",
                "tax_sales",
            ]
        )
        .groupby(
            by=[
                "year_month",
                "system_source",
                "als_lab",
                "ship_id",
                "ship_name",
                "ship_address",
                "ship_postcode",
                "customer_id",
                "customer_name",
                "product_code",
                "product_description",
                "nhs_private_tag",
                "als_sku_code",
                "als_product_category",
                "als_product_sub_category",
                "als_product_name",
                "als_product_material",
                "als_product_standard",
                "als_product_type",
            ],
            dropna=False,
        )[["quantity", "net_sales"]]
        .sum()
        .reset_index(drop=False)
        .rename(
            columns={
                "ship_id": "practice_id",
                "ship_name": "practice_name",
                "ship_address": "practice_address",
                "ship_postcode": "practice_address_postcode",
            }
        )
        .drop(columns="als_product_standard")
    )

    # Process Evident data
    evident_datacube = als_sku_evident_datacube.copy(deep=True)
    evident_datacube = evident_datacube.drop(
        columns=[
            "quantity_remake",
            "gross_sales",
            "customer_product_cube_uuid",
            "als_product_standard",
        ]
    )

    # Remove extraneous columns from Leca data and re-group by month
    leca = als_sku_leca_orders_data.copy(deep=True)

    leca_datacube = (
        leca.groupby(
            by=[
                "year_month",
                "system_source",
                "als_lab",
                "practice_code",
                "practice_name",
                "customer_id",
                "customer_name",
                "product_code",
                "product_description",
                "nhs_private_tag",
                "als_sku_code",
                "als_product_category",
                "als_product_sub_category",
                "als_product_name",
                "als_product_material",
                "als_product_standard",
                "als_product_type",
            ],
            dropna=False,
        )[
            [
                "quantity",
                "net_sales",
            ]
        ]
        .sum()
        .reset_index(drop=False)
    )
    leca_datacube["year_month"] = pd.to_datetime(leca_datacube["year_month"])
    leca_datacube = leca_datacube.drop(columns=["als_product_standard"])
    leca_datacube = leca_datacube.rename(
        columns={"practice_code": "practice_address_postcode"}
    )

    # Group ashford data by year-month and lab-level customer details
    ashford = als_sku_ashford_orders_data.copy(deep=True)
    ashford["order_invoiced_date"] = pd.to_datetime(
        ashford["order_invoiced_date"], format="mixed"
    )
    ashford["order_invoiced_date_year"] = ashford["order_invoiced_date"].dt.year
    ashford["order_invoiced_date_month"] = ashford["order_invoiced_date"].dt.month

    ashford = ashford.drop(columns="order_invoiced_date")
    ashford_datacube = (
        ashford.groupby(
            by=[
                "order_invoiced_date_year",
                "order_invoiced_date_month",
                "system_source",
                "als_lab",
                "practice_name",
                "customer_id",
                "customer_name",
                "product_code",
                "product_description",
                "nhs_private_tag",
                "als_sku_code",
                "als_product_category",
                "als_product_sub_category",
                "als_product_name",
                "als_product_material",
                "als_product_standard",
                "als_product_type",
            ],
            dropna=False,
        )[
            [
                "quantity",
                "net_sales",
            ]
        ]
        .sum()
        .reset_index(drop=False)
    )

    ashford_datacube["day"] = 1
    for col in ["order_invoiced_date_year", "order_invoiced_date_month", "day"]:
        ashford_datacube[col] = ashford_datacube[col].astype(str)
    ashford_datacube["year_month"] = (
        ashford_datacube["order_invoiced_date_year"]
        + "-"
        + ashford_datacube["order_invoiced_date_month"]
        + "-"
        + ashford_datacube["day"]
    )
    ashford_datacube["year_month"] = pd.to_datetime(ashford_datacube["year_month"])

    ashford_datacube = ashford_datacube.drop(
        columns=[
            "order_invoiced_date_year",
            "order_invoiced_date_month",
            "day",
            "als_product_standard",
        ]
    )

    ashford_datacube[["practice_id", "practice_address"]] = np.nan

    # Group GreatLab data by year-month and lab-level customer details
    greatlab = als_sku_greatlab_data.copy(deep=True)
    greatlab["year_month"] = pd.to_datetime(greatlab["year_month"])
    greatlab.rename(columns={"product_code": "als_sku_code"})
    greatlab = greatlab.rename(columns={"product_code": "als_sku_code"})
    
    greatlab_datacube = (
        greatlab.groupby(
            by=[
                "year_month",
                "system_source",
                "als_lab",
                "practice_name",
                "customer_id",
                "customer_name",
                "als_sku_code",
                "product_description",
                "nhs_or_private",
            ],
            dropna=False,
        )[
            [
                "quantity",
                "net_sales",
            ]
        ]
        .sum()
        .reset_index(drop=False)
    )

    # Combine datacubes from all three systems into one DataFrame
    als_datacube = pd.concat(
        [
            transactor_datacube,
            labtrac_datacube,
            evident_datacube,
            leca_datacube,
            ashford_datacube,
            greatlab_datacube
        ]
    ).reset_index(drop=True)
    
    # Clean the customers data table
    customers = all_labs_customers_dentists_local_scraped_matched.drop(
        columns=[
            "practice_address_postcode",
            "practice_name",
            "first_name",
            "middle_name",
            "last_name",
            "title",
            "person_or_org",
            "org_name",
            "dentist_code",
            "registrant_type",
            "registered_on",
            "current_registration_period",
            "qualifications",
        ]
    )

    customers = customers.dropna(subset=["gdc_code", "dentist_name"], how="all")
    customers = customers.drop_duplicates()
    for col in ["customer_id", "customer_name", "als_lab", "system_source"]:
        customers.loc[customers[col].isna() == True, col] = "Unknown"
        customers[col] = customers[col].astype(str)
        als_datacube.loc[als_datacube[col].isna() == True, col] = "Unknown"
        als_datacube[col] = als_datacube[col].astype(str)

    als_datacube = als_datacube.merge(
        customers,
        on=[
            "customer_id",
            "customer_name",
            # "practice_name",
            "als_lab",
            "system_source",
        ],
        how="left",
    )

    # Change any "Unknown" values to np.nan
    for col in als_datacube.columns:
        if als_datacube.loc[als_datacube[col] == "Unknown"].shape[0] > 0:
            als_datacube.loc[als_datacube[col] == "Unknown", col] = np.nan

    # Strip off the Platinum, Gold, Silver, Bronze etc. letters
    for char in ["B", "G", "Z", "P", "S"]:
        als_datacube.loc[
            als_datacube["als_sku_code"].str[-1] == char, "als_sku_code"
        ] = als_datacube.loc[
            als_datacube["als_sku_code"].str[-1] == char, "als_sku_code"
        ].str[
            :-1
        ]

    # Ensure that all miscellaneous ALS SKU codes are correct
    als_datacube.loc[als_datacube["als_sku_code"] == "MISC001AD", "als_sku_code"] = "MISC0001AD"

    # Fill missing ALS SKU information with MISC codes
    misc_code = "MISC0001AD"
    misc_product = "Miscellaneous"
    als_datacube.loc[
        als_datacube["als_sku_code"].isna() == True,
        [
            "als_sku_code",
            "als_product_category",
            "als_product_sub_category",
            "als_product_name",
            "als_product_type",
        ]
    ] = [misc_code, misc_product, misc_product, misc_product, misc_product]

    for code in [
        "MISC0001AD",
        "Denture Misc",
        "MISC001Ad",
        "Misc",
        "Implants MISC",
        "MISC",

    ]:
        als_datacube.loc[
            als_datacube["als_sku_code"] == code,
            [
                "als_sku_code",
                "als_product_category",
                "als_product_sub_category",
                "als_product_name",
                "als_product_type",
            ]
        ] = [misc_code, misc_product, misc_product, misc_product, misc_product]

    # Ensure the Year-Month date column is all in the same format
    als_datacube["year_month"] = als_datacube["year_month"].astype(str)
    als_datacube["year_month"] = als_datacube["year_month"].str[:7]

    # Reorder columns
    als_datacube = als_datacube[
        [
            "year_month",
            "system_source",
            "als_lab",
            "practice_id",
            "practice_name",
            "practice_address",
            "practice_address_postcode",
            "customer_id",
            "customer_name",
            "gdc_code",
            "dentist_name",
            "product_code",
            "product_description",
            "nhs_private_tag",
            "als_sku_code",
            "als_product_category",
            "als_product_sub_category",
            "als_product_name",
            "als_product_material",
            "als_product_type",
            "quantity",
            "net_sales",
        ]
    ].rename(
        columns={"gdc_code": "gdc_dentist_code", "dentist_name": "gdc_dentist_name"}
    )

    return als_datacube


In [None]:
als_datacube = join_customer_matched_data(
    all_labs_customers_dentists_local_scraped_matched,
    als_sku_transactor_datacube,
    als_sku_evident_datacube,
    als_sku_labtrac_orders_data,
    als_sku_leca_orders_data,
    als_sku_ashford_orders_data,
    als_sku_greatlab_data,
)

In [153]:
als_datacube.to_csv("data/processed_customer_mapping/als_datacube.csv",index=False)

In [142]:
als_datacube.shape

(1062925, 22)