# Example plant matching notebook

In [None]:
cd ..

Note: The functions that read in the Japan asset datasets contain some mistakes. I changed them (and the configs) a lot while I was matching, so there is no final, clean version of these functions.

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import yaml
import ast
import re
import pandas_gbq as pd_gbq
import geopandas as gpd
import os

pd.set_option('display.max_columns', None)

In [None]:
import powerplantmatching as ppm

In [None]:
def get_matches(config, source_1, source_2):
    """Load a saved pairwise match file and reshape it to one row per source.

    Reads ``Matched_{source_1}_{source_2}.csv`` from the powerplantmatching
    output folder under the user's home directory (selected by
    ``config['hash']``), prints the range of the match scores, and stacks the
    two sides of every match on top of each other.

    Parameters
    ----------
    config : dict
        Must contain the key ``'hash'``, used as the output sub-folder name.
    source_1, source_2 : str
        Source names as they appear in the file name. A name containing
        ``"KITAMOTO"`` is relabelled to plain ``'KITAMOTO'`` in the output
        (``source_1`` is checked first; only one of the two is relabelled).

    Returns
    -------
    pandas.DataFrame
        Two rows per match (one per source) sharing the same ``id`` and
        ``scores``, with a ``source`` column, sorted by ``id`` then ``source``.
    """
    csv_path = os.path.join(
        os.path.expanduser("~"),
        ".local",
        "share",
        f"powerplantmatching/data/out/{config['hash']}/Matched_{source_1}_{source_2}.csv",
    )

    # The first two data rows are discarded.
    # NOTE(review): presumably these are the extra header rows of the saved file - confirm.
    pairs = pd.read_csv(csv_path).iloc[2:]
    pairs = pairs.assign(scores=pairs.scores.astype(float))

    lowest, highest = pairs.scores.min(), pairs.scores.max()
    print(f"scores min: {lowest}, max: {highest}")

    # Columns carrying a '.1' marker belong to the second source; everything
    # else (including 'Unnamed: 0' and 'scores') is kept for the first source.
    left_cols = []
    right_cols = ['Unnamed: 0']
    for col in pairs.columns:
        (right_cols if '.1' in col else left_cols).append(col)
    right_cols.append('scores')

    # Collapse any Kitamoto variant name to a single label.
    label_1, label_2 = source_1, source_2
    if "KITAMOTO" in source_1:
        label_1 = 'KITAMOTO'
    elif "KITAMOTO" in source_2:
        label_2 = 'KITAMOTO'

    left = pairs[left_cols].assign(source=label_1)
    # Rename the second source's columns positionally onto the first source's names.
    right = pairs[right_cols].set_axis(left_cols, axis='columns').assign(source=label_2)

    stacked = pd.concat([left, right]).rename(columns={'Unnamed: 0': 'id'})
    return stacked.sort_values(['id', 'source'])

def add_distance(df, match_index_col='id', source_1='KITAMOTO', source_2='GEM', projected_crs=32654):
    """Attach the distance between the two matched locations to every row.

    For each value of ``match_index_col`` the rows labelled ``source_1`` and
    ``source_2`` (in the ``source`` column) are turned into points from their
    ``lon``/``lat`` columns, reprojected, and the distance between the two
    points is stored in a new ``distance_meters`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``source``, ``lon``, ``lat`` and ``match_index_col``.
    match_index_col : str
        Column identifying which rows belong to the same match.
    source_1, source_2 : str
        The two labels in ``df.source`` to measure between.
    projected_crs : int or str
        Projected CRS used for the distance computation. Defaults to 32654
        (WGS 84 / UTM zone 54N, units of metres), the value that used to be
        hard-coded. Pass another zone for data far from that zone.

    Returns
    -------
    pandas.DataFrame
        ``df`` inner-merged with the per-match distances, so rows whose match
        id is not present for both sources are dropped.
    """
    def _projected_points(label):
        # Input coordinates are interpreted as lon/lat in EPSG:4326.
        rows = df[df.source == label]
        points = gpd.GeoDataFrame(rows, geometry=gpd.points_from_xy(rows.lon, rows.lat), crs='EPSG:4326')
        return points.to_crs(crs=projected_crs)

    points_1 = _projected_points(source_1)
    points_2 = _projected_points(source_2)

    paired = points_2.merge(points_1, on=match_index_col, suffixes=('_2', '_1'))

    # Vectorised distance: unlike a row-wise ``apply(axis=1)``, this also works
    # when there are no pairs at all (apply then yields an empty DataFrame that
    # cannot be assigned to a single column).
    geometry_1 = gpd.GeoSeries(paired['geometry_1'])
    geometry_2 = gpd.GeoSeries(paired['geometry_2'])
    paired['distance_meters'] = geometry_2.distance(geometry_1)

    return df.merge(paired[[match_index_col, 'distance_meters']], on=match_index_col)

def save_new_mappings(new_matches, current_mappings_path, source_1, source_2):
    """Append new ID mappings to the mappings CSV, or create the file.

    Parameters
    ----------
    new_matches : pandas.DataFrame
        Mappings to store; must contain the columns ``source_1`` and ``source_2``.
    current_mappings_path : str
        Path of the mappings CSV.
    source_1, source_2 : str
        Names of the two ID columns checked for duplicates.

    Behaviour
    ---------
    * If the file does not exist, ``new_matches`` is written as a new file.
    * If it exists and none of the new IDs (in either column) is already
      present, the new rows are appended and the file is rewritten.
    * If any new ID is already present, an error message is printed and the
      file is left untouched (the whole batch is rejected).

    Returns ``None`` in every case; progress is reported with ``print``.
    """
    if not os.path.exists(current_mappings_path):
        print("File does not exist, creating new file")
        print("Length of new mappings:", len(new_matches))
        new_matches.to_csv(current_mappings_path, index=False)
        return

    existing = pd.read_csv(current_mappings_path)
    print("Length of existing mappings:", len(existing))

    # NOTE: there is no check that both source columns exist in the stored
    # file; a missing column surfaces as a KeyError on the lookups below.
    already_in_1 = new_matches[source_1].isin(existing[source_1])
    already_in_2 = new_matches[source_2].isin(existing[source_2])

    if already_in_1.any() or already_in_2.any():
        print("Error: IDs in new matches found in current mappings")
        return

    combined = pd.concat([existing, new_matches])
    print("Length of updated mappings:", len(combined))
    combined.to_csv(current_mappings_path, index=False)

# Kitamoto and GEM matching

This groups together all rows of data that belong to the same plant; it does not match at the individual unit level.

In [None]:
# Load the Kitamoto/GEM matching configuration. The path is relative, so it
# depends on the working directory set by the `cd ..` cell at the top.
with open('japan/configs/config_kitamoto_gem.yaml') as f:
    config = yaml.safe_load(f)

# Run the matching with fill_geopositions and filter_missing_geopositions both disabled.
# NOTE(review): presumably this is what writes the Matched_*.csv file that get_matches reads - confirm.
data = ppm.powerplants(config=config, update=True, fill_geopositions=False, filter_missing_geopositions=False)

In [None]:
# Ignore the weird 'KITAMOTOGEM' source name: it exists because I had multiple Kitamoto functions in the data.py file.
# get_matches relabels any source name containing 'KITAMOTO' to 'KITAMOTO', which is the label passed to add_distance below.

matched_df = get_matches(config, 'GEM', 'KITAMOTOGEM')
matched_df = add_distance(matched_df, match_index_col='id', source_1='KITAMOTO', source_2='GEM')

- Automatically save matches with very high scores.
- Save the remaining matches to a Google Sheet for manual validation, then load the validated matches back into the notebook.

In [None]:
# Example: treat every match scoring at least 0.995 as validated without manual review.
validated = matched_df[matched_df.scores >= 0.995].copy()
# projectID is stored as the text form of a Python collection; parse it and
# give every individual ID its own row.
validated.projectID = validated.projectID.apply(lambda x: list(ast.literal_eval(x)))
validated = validated.explode('projectID')

# Map every GEM ID in a match to the first KITAMOTO ID listed for that match.
matches = {}
for _, group in validated.groupby('id'):
    kitamoto_id = group.loc[group.source == 'KITAMOTO', 'projectID'].values[0]
    for gem_id in group.loc[group.source == 'GEM', 'projectID'].values:
        matches[gem_id] = kitamoto_id

new_matches = pd.DataFrame(matches.items(), columns=['GEM', 'KITAMOTO'])

In [None]:
# Save the validated matches
# The CSV path comes from the config. save_new_mappings creates the file if it
# is missing, appends otherwise, and writes nothing if any of the new IDs is
# already present in the file.
current_mappings_path = config['current_mappings']
print("Saving new matches to:", current_mappings_path)
save_new_mappings(new_matches, current_mappings_path, 'GEM', 'KITAMOTO')

# Kitamoto and FIT matching

FIT is a very large dataset, so filter the config's target technologies and admin-1s to match smaller portions of the data at a time.

In [None]:
# Switch to the Kitamoto/FIT configuration. This rebinds `config`, so every
# later cell uses this file rather than the Kitamoto/GEM one.
with open('japan/configs/config_kitamoto_fit.yaml') as f:
    config = yaml.safe_load(f)

In [None]:
# I also made it possible to not use data.py to load the data, but to instead pass dataframes to the main function

def _most_common_value(values):
    """Return a single mode of ``values``: the first one in sorted order, or NaN if there is none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else float("nan")


def load_kitamoto_for_fit_matching(config):
    """Load and prepare the KITAMOTO data for matching against FIT.

    Steps, each driven by ``config``:

    1. Query BigQuery (``config['KITAMOTO']['bq_table']`` in project
       ``config['gcp_project']``) for rows where ``invalid_latlon`` is False.
    2. Rename the columns to ``Name``, ``Capacity``, ``lat``, ``lon`` and
       ``projectID`` and derive ``Technology`` from ``asset_class`` using
       ``config['KITAMOTO']['technology_map']``.
    3. Keep only ``config['target_technologies']`` and ``config['target_admin_1s']``.
    4. If ``remove_matched_data`` is set, drop IDs already listed in the
       ``KITAMOTO`` column of the ``config['current_mappings']`` CSV.
    5. If ``aggregate_units`` is set, aggregate units per
       (``plant_name``, ``Name``, ``Technology``): capacity is summed,
       project IDs are collected into a set, lat/lon are averaged and
       ``admin_1`` takes its most common value.
    6. If ``clean_name`` is set, run ``ppm.cleaning.clean_name``.
    7. Select ``config['target_columns']`` and apply
       ``ppm.cleaning.set_column_name`` with ``"KITAMOTO"``.

    Returns
    -------
    pandas.DataFrame
        The prepared KITAMOTO frame; its length is also printed.
    """
    data_config = config['KITAMOTO']

    # The table name is interpolated directly into the SQL text, so the config
    # file must come from a trusted source.
    df = pd_gbq.read_gbq(f"""SELECT plant_name, business_name, asset_class, output_mw, latitude, longitude, admin_1, hash_id
    FROM {data_config['bq_table']}
    WHERE invalid_latlon = False""", project_id=config['gcp_project'])

    df = df.rename(columns={'business_name':'Name', 'output_mw':'Capacity', 'latitude':'lat', 'longitude':'lon', 'hash_id':'projectID'})

    # Asset classes missing from the map become NaN and are removed by the
    # technology filter below.
    df['Technology'] = df.asset_class.map(data_config['technology_map'])

    # filter to target techs
    df = df[df.Technology.isin(config['target_technologies'])]
    # filter to target admin_1s
    df = df[df.admin_1.isin(config['target_admin_1s'])]

    if data_config["remove_matched_data"]:
        # Remove data that has already been matched.
        path = config["current_mappings"]
        if os.path.exists(path):
            current_mappings = pd.read_csv(path)
            df = df[~df["projectID"].isin(current_mappings.KITAMOTO)]
        else:
            print(f"Current mappings do not exist: {path}")

    if data_config["aggregate_units"]:
        # Aggregate units to plant level. ``pd.Series.mode`` is not used
        # directly for admin_1 because it returns several values when there is
        # a tie, which would leave an array in the cell instead of a scalar.
        df = df.groupby(['plant_name', 'Name', 'Technology']).agg({
            'Capacity': 'sum',
            'projectID': set,
            'lat': pd.Series.mean,
            'lon': pd.Series.mean,
            'admin_1': _most_common_value,
        }).reset_index()

    if data_config["clean_name"]:
        # Clean the names with the project's cleaning helper.
        df = ppm.cleaning.clean_name(df, config)

    df = df[config["target_columns"]]
    df = df.pipe(ppm.cleaning.set_column_name, "KITAMOTO")

    print(f"KITAMOTO: {len(df)} projects")

    return df

# Build the KITAMOTO frame for the FIT run; this queries BigQuery through load_kitamoto_for_fit_matching.
df_kitamoto = load_kitamoto_for_fit_matching(config)

In [None]:
# Load the FIT data through the project's ppm.data.FIT loader, using the same config as the KITAMOTO frame.
df_fit = ppm.data.FIT(raw=False, update=True, config=config)

In [None]:
# Pass the pre-loaded frames via `dfs` instead of having the sources loaded through data.py.
dfs = [df_kitamoto, df_fit]
data = ppm.powerplants(config=config, update=True, fill_geopositions=False, filter_missing_geopositions=False, dfs=dfs)

In [None]:
# Read the saved FIT/KITAMOTO match file for this config and add the distance
# between the two matched locations. This overwrites the earlier `matched_df`.
matched_df = get_matches(config, 'FIT', 'KITAMOTO')
matched_df = add_distance(matched_df, match_index_col='id', source_1='KITAMOTO', source_2='FIT')

# Kitamoto, FIT, GEM, and HJKS matching

Example of matching 4 sources at the same time

In [None]:
# Load the configuration for the four-source run; this rebinds `config` again.
with open('japan/configs/config_all_four_sources.yaml') as f:
    config = yaml.safe_load(f)

# Load each source through its *_ALL loader with the shared config
# (raw=False, update=True for all four).
df_fit = ppm.data.FIT_ALL(raw=False, update=True, config=config)
df_kit = ppm.data.KITAMOTO_ALL(raw=False, update=True, config=config)
df_hjks = ppm.data.HJKS_ALL(raw=False, update=True, config=config)
df_gem = ppm.data.GEM_ALL(raw=False, update=True, config=config)

In [None]:
# Match all four pre-loaded frames in one call, with return_cross_matches disabled.
dfs = [df_fit, df_kit, df_hjks, df_gem]
data = ppm.powerplants(config=config,
                       update=True,
                       fill_geopositions=False,
                       filter_missing_geopositions=False,
                       dfs=dfs,
                       return_cross_matches=False)

In [None]:
# Same inputs and settings as the previous call, but with return_cross_matches=True;
# the result is kept under a separate name so both outputs stay available.
data_cross_matched = ppm.powerplants(config=config,
                                     update=True,
                                     fill_geopositions=False,
                                     filter_missing_geopositions=False,
                                     dfs=dfs,
                                     return_cross_matches=True)