# Example plant matching notebook

In [None]:
cd ..

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import yaml
import ast
import re
import pandas_gbq as pd_gbq
import geopandas as gpd
import os

# Lift the limit on how many columns pandas displays, so that wide frames
# are shown in full when inspected in the notebook.
pd.set_option('display.max_columns', None)

In [None]:
import powerplantmatching as ppm

In [None]:
def get_matches(config, source_1, source_2, data_dir=None):
    """Load a matched-pairs CSV and stack its two sources into long format.

    Reads ``Matched_{source_1}_{source_2}.csv`` from the sub-directory of
    ``data_dir`` named after ``config['hash']``, drops the first two data
    rows, and splits the remaining columns into two groups: columns whose
    name does not contain ``.1`` (first source) and columns whose name does
    (second source). Both groups also carry the unnamed first column and
    ``scores``. The second group is relabelled with the first group's column
    names, each group is tagged with a ``source`` column, and the two are
    concatenated.

    Args:
        config: Mapping that provides the ``'hash'`` entry naming the output
            sub-directory.
        source_1: Name of the first source as it appears in the file name.
        source_2: Name of the second source as it appears in the file name.
        data_dir: Directory that contains the per-hash output folders.
            Defaults to the location this notebook was originally written
            against, so existing calls keep working unchanged.

    Returns:
        A DataFrame sorted by ``['id', 'source']`` where ``id`` is the CSV's
        unnamed first column, so the two rows of a match are adjacent.
    """
    if data_dir is None:
        # Original hard-coded location, kept as the backward-compatible default.
        data_dir = "/Users/adminuser/.local/share/powerplantmatching/data/out"
    csv_path = os.path.join(data_dir, str(config['hash']), f"Matched_{source_1}_{source_2}.csv")

    # The first two data rows are discarded (presumably the remaining rows of
    # a multi-row header -- confirm against the file that is written out).
    # Copy so that the column assignment below acts on an independent frame
    # rather than on a slice of the raw read.
    pairs = pd.read_csv(csv_path).iloc[2:].copy()
    pairs['scores'] = pairs['scores'].astype(float)

    print(f"scores min: {pairs.scores.min()}, max: {pairs.scores.max()}")

    # Columns carrying '.1' belong to the second source (presumably the suffix
    # pandas gives to a repeated header name); the rest belong to the first.
    source_1_cols = [col for col in pairs.columns if '.1' not in col]
    source_2_cols = ['Unnamed: 0'] + [col for col in pairs.columns if '.1' in col] + ['scores']

    # Any source name containing "KITAMOTO" is labelled plainly 'KITAMOTO';
    # only the first source found to contain it is relabelled.
    if "KITAMOTO" in source_1:
        name1, name2 = 'KITAMOTO', source_2
    elif "KITAMOTO" in source_2:
        name1, name2 = source_1, 'KITAMOTO'
    else:
        name1, name2 = source_1, source_2

    first_half = pairs[source_1_cols].assign(source=name1)
    second_half = (
        pairs[source_2_cols]
        .set_axis(source_1_cols, axis='columns')
        .assign(source=name2)
    )
    stacked = pd.concat([first_half, second_half]).rename(columns={'Unnamed: 0': 'id'})
    return stacked.sort_values(['id', 'source'])

def add_distance(df, match_index_col='id', source_1='KITAMOTO', source_2='GEM', projected_crs=32654):
    """Attach the distance between the two matched locations to every row.

    Rows of ``df`` are split by their ``source`` value, turned into points
    from their ``lon``/``lat`` columns (read as EPSG:4326), re-projected to
    ``projected_crs`` and paired up on ``match_index_col``. The distance
    between the two points of each pair is stored in ``distance_meters`` and
    merged back onto ``df``.

    Args:
        df: Frame with ``source``, ``lon``, ``lat`` and ``match_index_col``
            columns.
        match_index_col: Column whose value is shared by the rows of a match.
        source_1: ``source`` value identifying the first side of each match.
        source_2: ``source`` value identifying the second side of each match.
        projected_crs: CRS the points are projected to before measuring.
            Defaults to EPSG 32654, the value this notebook has always used
            (presumably picked for Japan -- confirm it suits other regions).
            The output column name assumes this CRS is in metres.

    Returns:
        ``df`` inner-merged with the per-match ``distance_meters`` column;
        rows whose key lacks a point from either source are not returned.
    """
    def _projected_points(source):
        # Keep only the key and the geometry: nothing else is needed to
        # compute the distance, and it keeps the pairing merge small.
        rows = df[df.source == source]
        points = gpd.GeoDataFrame(
            rows[[match_index_col]].copy(),
            geometry=gpd.points_from_xy(rows.lon, rows.lat),
            crs='EPSG:4326',
        )
        return points.to_crs(crs=projected_crs)

    pairs = _projected_points(source_2).merge(
        _projected_points(source_1),
        on=match_index_col,
        suffixes=('_src2', '_src1'),
    )

    # Pairwise computation instead of a row-wise DataFrame.apply: this yields
    # a float column even when there are no pairs at all.
    pairs['distance_meters'] = pd.Series(
        [
            point_2.distance(point_1)
            for point_2, point_1 in zip(pairs['geometry_src2'], pairs['geometry_src1'])
        ],
        index=pairs.index,
        dtype=float,
    )

    return df.merge(pairs[[match_index_col, 'distance_meters']], on=match_index_col)

def save_new_mappings(new_matches, current_mappings_path, source_1, source_2):
    """Append new ID mappings to the mappings CSV, refusing any overlap.

    If ``current_mappings_path`` does not exist, ``new_matches`` is written
    there as a new file. Otherwise the existing file is read and the new rows
    are appended only when none of their ``source_1`` or ``source_2`` IDs is
    already present in the same column of the existing file. On overlap a
    message is printed and the file is left untouched.

    IDs are compared both by value and by their text form, because an ID held
    as the string ``'1'`` in ``new_matches`` is read back from CSV as the
    integer ``1`` and would otherwise not be recognised as a duplicate.

    Args:
        new_matches: DataFrame holding the ``source_1`` and ``source_2``
            columns to add.
        current_mappings_path: Path of the CSV file that stores the mappings.
        source_1: Name of the first ID column.
        source_2: Name of the second ID column.

    Raises:
        KeyError: If either column is missing from ``new_matches`` or from
            the existing file.
    """
    if not os.path.exists(current_mappings_path):
        print("File does not exist, creating new file")
        print("Length of new mappings:", len(new_matches))
        new_matches.to_csv(current_mappings_path, index=False)
        return

    current_mappings = pd.read_csv(current_mappings_path)
    print("Length of existing mappings:", len(current_mappings))

    def _already_mapped(column):
        # True when any new ID of `column` is already stored in the file.
        new_ids = new_matches[column]
        known_ids = current_mappings[column]
        by_value = new_ids.isin(known_ids)
        by_text = new_ids.astype(str).isin(known_ids.astype(str))
        return bool((by_value | by_text).any())

    # Evaluate both columns up front so that a missing column is always
    # reported, whichever column happens to overlap.
    overlaps = [_already_mapped(column) for column in (source_1, source_2)]
    if any(overlaps):
        print("Error: IDs in new matches found in current mappings")
        return

    current_mappings = pd.concat([current_mappings, new_matches])
    print("Length of updated mappings:", len(current_mappings))
    current_mappings.to_csv(current_mappings_path, index=False)

# Kitamoto and GEM matching

This groups together all rows of data that belong to the same plant; it does not match at the individual unit level.

In [None]:
# Read the GEM matching configuration. The encoding is given explicitly so the
# file is decoded the same way whatever the platform's default encoding is.
with open('japan/configs/config_gem.yaml', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Run powerplantmatching with the loaded configuration and keep the result.
data = ppm.powerplants(config=config, update=True, fill_geopositions=False, filter_missing_geopositions=False)

In [None]:
# Ignore the weird 'KITAMOTOGEM' name: it is there because the data.py file
# had multiple Kitamoto functions.

# Load the GEM / Kitamoto pairs, then add the distance between the two
# locations of every pair.
matched_df = get_matches(config, source_1='GEM', source_2='KITAMOTOGEM')
matched_df = add_distance(matched_df, 'id', 'KITAMOTO', 'GEM')

- Automatically save matches with very high scores.
- Save the remaining matches to a Google Sheet for manual validation, then load the validated matches back into the notebook.

In [None]:
# Example: keep only the pairs whose score is at least 0.995.
validated = matched_df.loc[matched_df.scores >= 0.995].copy()

# projectID is stored as the text of a Python literal; parse it into a list
# and give every ID its own row.
validated['projectID'] = [list(ast.literal_eval(raw_ids)) for raw_ids in validated['projectID']]
validated = validated.explode('projectID')

# Map every GEM ID of a match to the first Kitamoto ID of that same match.
matches = {}
for _, plant_rows in validated.groupby('id'):
    kitamoto_ids = plant_rows.loc[plant_rows.source == 'KITAMOTO', 'projectID'].values
    gem_ids = plant_rows.loc[plant_rows.source == 'GEM', 'projectID'].values
    matches.update(dict.fromkeys(gem_ids, kitamoto_ids[0]))

new_matches = pd.DataFrame(list(matches.items()), columns=['GEM', 'KITAMOTO'])

In [None]:
# Persist the validated matches to the mappings file named in the config.
current_mappings_path = config['current_mappings']
print("Saving new matches to:", current_mappings_path)
save_new_mappings(
    new_matches=new_matches,
    current_mappings_path=current_mappings_path,
    source_1='GEM',
    source_2='KITAMOTO',
)