In [19]:
import pandas as pd
import uuid
import os
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup

In [3]:
# Load the processed ALS datacube from disk.
# low_memory=False: parse each column in one pass instead of in internal chunks
# (avoids chunk-wise mixed-dtype inference on large CSVs).
als_datacube = pd.read_csv(
    "data/processed_customer_mapping/als_datacube.csv",
    low_memory=False,
)

In [4]:
def extract_practices_list(
    als_datacube: pd.DataFrame,
):
    """
    Build the list of unique practices found in the datacube, derive a usable
    address for each one, assign every practice a UUID and tag each datacube
    row with the UUID of its practice.

    A practice is identified by the combination of ``system_source``,
    ``als_lab``, ``practice_id``, ``practice_name``, ``practice_address`` and
    ``practice_address_postcode`` (missing values count as a value of their
    own). The untouched key values are kept in ``original_<col>`` columns of
    the practice list; ``practice_address`` is then rewritten as follows:

    * lab ``"Leca"``: ``"<practice_name>, <practice_address_postcode>"``
    * lab ``"Bristol Crown"``: ``<practice_name>``
    * missing address: ``"<name>, <postcode>"`` when both exist, otherwise the
      name, otherwise the postcode
    * practices that still have no address are dropped.

    :param als_datacube: datacube containing the six key columns listed above
        plus ``net_sales``.
    :return: tuple ``(practices_list, als_datacube)``. ``practices_list`` has
        one row per practice with the key columns, their ``original_*``
        copies and ``practice_uuid``. ``als_datacube`` is the input with one
        extra ``practice_uuid`` column (NaN for rows whose practice was
        dropped); all other columns keep their original names and values.
    """
    key_cols = [
        "system_source",
        "als_lab",
        "practice_id",
        "practice_name",
        "practice_address",
        "practice_address_postcode",
    ]
    original_cols = [f"original_{col}" for col in key_cols]

    # One row per distinct key combination; dropna=False keeps practices whose
    # key contains missing values. The aggregated sales are not needed.
    practices_list = (
        als_datacube.groupby(by=key_cols, dropna=False)["net_sales"]
        .sum()
        .reset_index(drop=False)
        .drop(columns="net_sales")
        .drop_duplicates(keep="first")
    )

    # Keep the untouched key values so rows can be matched back to the datacube
    # after practice_address has been rewritten below.
    for col, original_col in zip(key_cols, original_cols):
        practices_list[original_col] = practices_list[col]

    # Leca
    is_leca = practices_list["als_lab"] == "Leca"
    practices_list.loc[is_leca, "practice_address"] = (
        practices_list.loc[is_leca, "practice_name"]
        + ", "
        + practices_list.loc[is_leca, "practice_address_postcode"]
    )

    # Bristol Crown
    is_bristol_crown = practices_list["als_lab"] == "Bristol Crown"
    practices_list.loc[is_bristol_crown, "practice_address"] = practices_list.loc[
        is_bristol_crown, "practice_name"
    ]

    # Fill NA address values with practice name and postcode values
    can_combine = (
        practices_list["practice_name"].notna()
        & practices_list["practice_address_postcode"].notna()
        & practices_list["practice_address"].isna()
    )
    practices_list.loc[can_combine, "practice_address"] = (
        practices_list.loc[can_combine, "practice_name"]
        + ", "
        + practices_list.loc[can_combine, "practice_address_postcode"]
    )
    # Remaining gaps: fall back to the name alone, then to the postcode alone.
    practices_list["practice_address"] = practices_list["practice_address"].fillna(
        practices_list["practice_name"]
    )
    practices_list["practice_address"] = practices_list["practice_address"].fillna(
        practices_list["practice_address_postcode"]
    )

    # .copy() so the uuid column is added to an independent frame, not a slice.
    practices_list = practices_list.dropna(subset=["practice_address"]).copy()

    # One random UUID per remaining practice (also works on an empty frame,
    # where a row-wise apply would not return a Series).
    practices_list["practice_uuid"] = [
        str(uuid.uuid4()) for _ in range(len(practices_list))
    ]

    # Tag datacube rows with their practice UUID. Only the original key values
    # and the UUID are merged in, so no "_x"/"_y" suffixed columns are created
    # and unrelated datacube columns are never renamed or dropped.
    # (The previous suffix clean-up tested `col[-2] == "_y"`, which is always
    # False, so the "*_y" columns leaked into the returned datacube.)
    uuid_lookup = practices_list[original_cols + ["practice_uuid"]].rename(
        columns=dict(zip(original_cols, key_cols))
    )
    als_datacube = als_datacube.merge(uuid_lookup, on=key_cols, how="left")

    return practices_list, als_datacube


In [10]:
# Derive the unique practice list and the datacube tagged with practice UUIDs.
(
    datacube_practices_list,
    als_final_datacube_private_practices_uuid,
) = extract_practices_list(als_datacube)

In [None]:
# datacube_practices_to_tag_addresses_manually_tagged = pd.read_excel("data/02_intermediate/corporate_practices/datacube_practices_to_tag_addresses_manually_tagged.xlsx")

In [15]:
def save_datacube_practices_list_as_separate_files(
    datacube_practices_list: pd.DataFrame,
    chunk_size: int = 2000,
    output_dir: str = "data/02_intermediate/corporate_practices/datacube_practices_list_separated",
):
    """
    Write the practice list to disk as consecutive CSV files of at most
    ``chunk_size`` rows each.

    Files are named ``datacube_practices_list_<start>_<stop>.csv`` where
    ``start``/``stop`` are the positional row bounds (stop exclusive) of the
    chunk. Frames shorter than one chunk produce a single file, an empty frame
    produces no file, and no empty trailing file is written when the row count
    is an exact multiple of ``chunk_size``.

    :param datacube_practices_list: practice list to split and save.
    :param chunk_size: maximum number of rows per file (default 2000).
    :param output_dir: directory the CSV files are written to; created if it
        does not exist.
    :return: ``datacube_practices_list`` unchanged.
    :raises ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    total_rows = datacube_practices_list.shape[0]
    os.makedirs(output_dir, exist_ok=True)

    # range() yields nothing for an empty frame and never produces an empty
    # trailing chunk; the last chunk is clipped to the number of rows.
    chunk_bounds = [
        (start, min(start + chunk_size, total_rows))
        for start in range(0, total_rows, chunk_size)
    ]

    for start, stop in tqdm(chunk_bounds):
        chunk = datacube_practices_list.iloc[start:stop]
        chunk.to_csv(
            os.path.join(output_dir, f"datacube_practices_list_{start}_{stop}.csv"),
            index=False,
        )

    return datacube_practices_list

In [17]:
# Write the practice list to disk as chunked CSV files; the call hands back
# the frame it was given.
datacube_practices_list2 = save_datacube_practices_list_as_separate_files(
    datacube_practices_list
)

100%|██████████| 6/6 [00:00<00:00, 79.19it/s]


In [None]:
def prep_datacubes_practices_list_addresses_kml(datacube_practices_list):
    """
    Attach latitude/longitude parsed from KML files to the practice list.

    Every regular file in the KML folder is parsed with BeautifulSoup; each
    ``placemark`` element yields one record holding the practice attributes
    read from fixed positions inside its ``description`` element, the address
    from its ``name`` element and its coordinate string. The records are
    left-merged onto ``datacube_practices_list`` by ``practice_uuid``. Where a
    column exists on both sides the practice list's value is kept and the
    KML-derived duplicate is discarded.

    NOTE(review): the field positions (``contents[1]`` ... ``contents[13]``,
    ``placemark.contents[9]``) assume one specific KML layout -- confirm
    against the tool that produced the files.

    :param datacube_practices_list: practice list with a ``practice_uuid``
        column.
    :return: ``datacube_practices_list`` with the KML-only columns added
        (``longitude`` and ``latitude`` as floats; NaN for practices without
        a matching placemark).
    """
    folder_path = "data/02_intermediate/corporate_practices/datacube_practices_list_addresses_kml"
    # Sorted so the record order does not depend on the filesystem's listing order.
    files = sorted(os.listdir(folder_path))

    records = []
    for file in tqdm(files):
        file_path = os.path.join(folder_path, file)
        if not os.path.isfile(file_path):
            # Sub-directories (e.g. notebook checkpoints) cannot be opened as files.
            continue

        with open(file_path, "r", encoding="utf-8") as page:
            soup = BeautifulSoup(page, "lxml")

        for placemark in tqdm(soup.find_all("placemark")):
            # Look the description up once; all attribute fields live inside it.
            fields = placemark.find_all("description")[0].contents

            records.append(
                {
                    # system_source is read directly at position 1; every other
                    # field is the second child of its own wrapper element.
                    "system_source": str(fields[1]),
                    "als_lab": str(fields[2].contents[1]),
                    "practice_id": str(fields[3].contents[1]),
                    "practice_name": str(fields[4].contents[1]),
                    "practice_address": str(placemark.find_all("name")[0].text),
                    "practice_address_postcode": str(fields[6].contents[1]),
                    "original_system_source": str(fields[7].contents[1]),
                    "original_als_lab": str(fields[8].contents[1]),
                    "original_practice_id": str(fields[9].contents[1]),
                    "original_practice_name": str(fields[10].contents[1]),
                    "original_practice_address": str(fields[11].contents[1]),
                    "original_practice_address_postcode": str(
                        fields[12].contents[1]
                    ),
                    "practice_uuid": str(fields[13].contents[1]),
                    "coordinates": str(placemark.contents[9].contents[1].contents[0]),
                }
            )

    df = pd.DataFrame(records)
    for col in df.columns:
        # Strip any leading ':' / ' ' characters, then treat empty strings as missing.
        df[col] = df[col].astype(str).str.lstrip(": ")
        df.loc[df[col] == "", col] = np.nan

    # The coordinate string is split into three comma-separated parts; the
    # third part is not used.
    df[["longitude", "latitude", "extra"]] = df["coordinates"].str.split(
        ",", expand=True
    )
    df = df.drop(columns=["coordinates", "extra"])
    df[["latitude", "longitude"]] = df[["latitude", "longitude"]].astype(float)

    prep_datacubes_practices_list_locations = datacube_practices_list.merge(
        df,
        on="practice_uuid",
        how="left",
    )

    # Columns present on both sides come back suffixed: keep the practice
    # list's version ("_x") under its plain name and discard the KML copy
    # ("_y"). Match on the suffix only, so a column that merely contains
    # "_y" somewhere in its name is not dropped.
    merged_columns = list(prep_datacubes_practices_list_locations.columns)
    prep_datacubes_practices_list_locations = (
        prep_datacubes_practices_list_locations.rename(
            columns={col: col[:-2] for col in merged_columns if col.endswith("_x")}
        ).drop(columns=[col for col in merged_columns if col.endswith("_y")])
    )

    return prep_datacubes_practices_list_locations


In [18]:
# Enrich the practice list with coordinates parsed from the KML files.
prep_datacubes_practices_list_locations = (
    prep_datacubes_practices_list_addresses_kml(datacube_practices_list)
)

NameError: name 'prep_datacubes_practices_list_addresses_kml' is not defined