In [144]:
import pandas as pd
import re

In [145]:
# Load coal mine data (Primary dataset)
mine_full = pd.read_excel("./Data/new coal/Global-Coal-Mine-Tracker-April-2024 (1).xlsx", 
                          sheet_name="Global Coal Mine Tracker (Non-C")

# Load coal power plant data and calculate the estimated capacity using the capacity factor

plant_full = pd.read_excel("./Data/new coal/Global-Coal-Plant-Tracker-January-2025.xlsx", sheet_name="Units")
plant_full['Est_capacity (MW)'] = plant_full['Capacity (MW)'] * plant_full['Capacity factor']

# Load coal terminal data note: The terminal dataset capacity may be overestimated, as some terminals handle "all cargo"
terminal_full = pd.read_excel("./Data/new coal/Global-Coal-Terminals-Tracker-December-2024.xlsx", sheet_name="Terminals")

In [146]:
# Filter plant and terminal data
plant = plant_full[(plant_full['Status'].str.lower() == 'operating') & (plant_full['Country/Area'] == 'India')]
terminal = terminal_full[(terminal_full['Status'].str.lower() == 'operating') & (terminal_full['Country/Area'] == 'India')]
# Filter mine data
total_data = pd.read_excel("./Data/total_data.xlsx")
mine = total_data.merge(mine_full, left_on="asset_id", right_on="GEM Mine ID", how="inner")
mine = mine[mine['Primary Consumer, Destination'] == 'Coal Plant'].drop_duplicates(subset=['GEM Mine ID'], keep='last')
mine


Unnamed: 0,asset_id,name,latitude,longitude,country,production,firm,GEM Mine ID,MSHA ID,Country,...,Location Accuracy,"Primary Consumer, Destination","Coal Plant, Steel Plant, Terminal","Coal Plant, Steel Plant, Terminal GEM Wiki",Reported Coal Mine Methane Emissions (thousand tonnes CO2e),Year of Reported Coal Mine Methane Emissions,GEM Coal Mine Methane Emissions Estimate (MCM/yr),GEM Coal Mine Methane Emissions Estimate (M tonnes/yr),CMM Emissions (CO2e 20 years),CMM Emissions (CO2e 100 years)
1,M0465,Adriyala Coal Mine,18.6648,79.579,India,1.7,The Singareni Collieries Co Ltd [100%],M0465,,India,...,Exact,Coal Plant,Ramagundam power station,https://www.gem.wiki/Ramagundam_power_station,,,12.6,0.008,0.7,0.3
9,M0474,Amlohri Coal Mine,24.134,82.593,India,15.0,Coal India Ltd [100.0%],M0474,,India,...,Exact,Coal Plant,Rihand power station,https://www.gem.wiki/Rihand_power_station,,,74.3,0.05,4.1,1.5
10,M0475,Amrapali Coal Mine,23.889479,85.00171,India,22.59,Coal India Ltd [100.0%],M0475,,India,...,Exact,Coal Plant,Barh I power station,https://www.gem.wiki/Barh_I_power_station,,,48.6,0.033,2.7,1.0
24,M0488,Bina Coal Mine,24.1483,82.7476,India,10.5,Coal India Ltd [100.0%],M0488,,India,...,Exact,Coal Plant,Anpara thermal power station,https://www.gem.wiki/Anpara_thermal_power_station,,,33.5,0.022,1.8,0.7
34,M0513,Dipka Coal Mine,22.345077,82.544192,India,33.24,Coal India Ltd [100.0%],M0513,,India,...,Exact,Coal Plant,Korba Super Thermal Power Station (NTPC),https://www.gem.wiki/Korba_Super_Thermal_Power...,,,130.2,0.087,7.2,2.6
38,M0515,Dulanga Coal Mine,21.951705,83.807564,India,7.0,NTPC Ltd [100%],M0515,,India,...,Exact,Coal Plant,Darlipalli Super Thermal Power Station,https://www.gem.wiki/Darlipalli_Super_Thermal_...,,,6.3,0.004,0.4,0.1
46,M0527,Gevra Coal Mine,22.336312,82.545748,India,59.11,Coal India Ltd [100.0%],M0527,,India,...,Exact,Coal Plant,Korba Super Thermal Power Station (NTPC),https://www.gem.wiki/Korba_Super_Thermal_Power...,,,63.0,0.042,3.5,1.3
63,M0552,Gopalji Kaniha Coal Mine,21.095844,85.079292,India,12.35,Coal India Ltd [100.0%],M0552,,India,...,Exact,Coal Plant,Talcher Kaniha power station,https://www.gem.wiki/Talcher_Kaniha_power_station,,,14.6,0.01,0.8,0.3
68,M0556,Khadia Coal Mine,24.1318,82.7144,India,15.0,Coal India Ltd [100.0%],M0556,,India,...,Exact,Coal Plant,Anpara thermal power station,https://www.gem.wiki/Anpara_thermal_power_station,,,13.5,0.009,0.7,0.3
90,M0580,Moher Amlohri Coal Mine,24.135,82.599444,India,18.28,Sasan Power Ltd [100%],M0580,,India,...,Exact,Coal Plant,Sasan Ultra Mega Power Project,https://www.gem.wiki/Sasan_Ultra_Mega_Power_Pr...,,,21.6,0.015,1.2,0.4


## Sheet 4: Supply Chain

Step 1: Supplement the supply chian relationships from mine data to plant data

In [147]:
# Define mapping rules for renaming 'Coal Plant, Steel Plant, Terminal' values
plant_name_mapping = {
    "Darlipalli Super Thermal Power Station": "Darlipali power station", #confirm to be typo
    "Talcher Kaniha power station": "Talcher Kaniha Super Thermal Power Station",
    "Neyveli Thermal Power Station": ["New Neyveli Thermal Power Station", "Neyveli Thermal Power Station I", "Neyveli Thermal Power Station II"],
    "Kakatiya Power Station": "Kakatiya Thermal Power Project",
    "M.P. Power Generating Company": ["Shree Singaji Thermal Power Project", "Satpura Thermal Power Station"],
    "Anpara thermal power station": ["Anpara power station", "Anpara-C power station", "Anpara-D power station"]
}

# Expand 'Coal Plant, Steel Plant, Terminal' column with mapped names
expanded_plant_names = []
for name in mine['Coal Plant, Steel Plant, Terminal']:
    if name in plant_name_mapping:
        mapped_names = plant_name_mapping[name]
        if isinstance(mapped_names, list):
            expanded_plant_names.append(";".join(mapped_names))  # Join multiple mappings with semicolon
        else:
            expanded_plant_names.append(mapped_names)
    else:
        expanded_plant_names.append(name)

mine['Coal Plant, Steel Plant, Terminal'] = expanded_plant_names

# Iterate through mine dataset and link to plant dataset
for index, row in mine.iterrows():
    plant_names = row['Coal Plant, Steel Plant, Terminal'].split(';')  # Split mapped names if multiple
    mine_id = row['GEM Mine ID']  # Use GEM Mine ID instead of name
    
    for plant_name in plant_names:
        plant_name = plant_name.strip()
        matching_plants = plant[plant['Plant name'].str.contains(plant_name, case=False, na=False)]
        
        # Assign GEM Mine ID to the 'Coal source' column in the plant dataset
        for plant_index in matching_plants.index:
            if pd.isna(plant.at[plant_index, 'Coal source']) or plant.at[plant_index, 'Coal source'] == '':
                plant.at[plant_index, 'Coal source'] = mine_id
            else:
                plant.at[plant_index, 'Coal source'] += f", {mine_id}"

  matching_plants = plant[plant['Plant name'].str.contains(plant_name, case=False, na=False)]


Step 2: Manually link the mine or the coalfields with plant according to the coal source info (unfinished,waiting for the final version of ownership sheet), then extract mine-plant supply chain relationships

In [148]:
# Drop rows where 'Coal source' is NaN
plant = plant.dropna(subset=['Coal source'])

# Define mapping rules for linking mine ID and plant data
coal_source_updates = {
    "Dudhichua": "M0514",
    "Gare IV/1 coal mine": "M0520",
    "Pakri Barwadih": "M0601",  # This can be ignored, as coal block != coal mine
    "Dulanga": "M0515",  # This can be ignored, coal block != coal mine
    "Talaipalli": "M0640",
    "Talabira": "M1745",
    "Manoharpur": "M3888",
    "Amelia and Dongrital Blocks": "M0471",
    "Paras east and Kanta basin": "M0604",
    "Tokisud": "M0643",
    "Sriiampur": "M1740",
    "Talcher": "M1746",
    "West Bokaro": "M0649",
    "Bina": "M0488",
    "Moher-Amlori": "M0580"
}

# Function to update 'Coal source' column by appending corresponding Mine IDs
def update_coal_source(coal_source): 

    coal_source = str(coal_source)
    additional_values = [value for key, value in coal_source_updates.items() if key in coal_source]
    
    if additional_values:
        return coal_source + ", " + ", ".join(additional_values)
    return coal_source

# Apply the update function to 'Coal source' column
plant['Coal source'] = plant['Coal source'].apply(update_coal_source)

# Create new DataFrame with 'supplier', 'client', 'recipe', and 'product' columns
new_data = []
for index, row in plant.iterrows():
    coal_sources = str(row['Coal source']).split(',')  # Split multiple sources
    for source in coal_sources:
        source = source.strip()  # Remove extra spaces
        match = re.match(r'^M\d{4}$', source)  # Check if it matches 'M' followed by 4 digits
        if match:
            supplier = source  # Assign supplier from the match
            client = row['GEM location ID']  # Assign client
            product = row['Coal type']  # Assign product
            recipe = None  # Placeholder for recipe, can be updated if needed
            new_data.append([supplier, client, recipe, product])  # Add to new data list

# Create new DataFrame with the extracted values
supply_chain_MP = pd.DataFrame(new_data, columns=['supplier', 'client', 'recipe', 'product'])

# Remove duplicate rows where supplier and client are the same, keep only the first occurrence
supply_chain_MP = supply_chain_MP.drop_duplicates(subset=['supplier', 'client'], keep='first')


In [149]:
supply_chain_MP

Unnamed: 0,supplier,client,recipe,product
0,M1746,L100000102442,,unknown
2,M0488,L100000102610,,bituminous
3,M0556,L100000102610,,bituminous
12,M0488,L100000102611,,bituminous
13,M0556,L100000102611,,bituminous
16,M0488,L100000102612,,bituminous
17,M0556,L100000102612,,bituminous
18,M0514,L100000102612,,bituminous
24,M0475,L100000102092,,unknown
26,M3013,L100000102388,,bituminous


In [150]:
# 1️⃣ Create supply_chain DataFrame
supply_chain_TP = pd.DataFrame(columns=['supplier', 'client', 'recipe', 'product'])
supply_chain_TP['supplier'] = terminal['GEM Terminal ID'].unique()

# 2️⃣ Filter 'GEM Terminal ID' from terminal data where 'Coal Source' contains different countries
indonesia_terminal_ids = terminal[terminal['Coal Source'].str.contains('Indonesia', case=False, na=False, regex=True)]['GEM Terminal ID'].unique()
australia_terminal_ids = terminal[terminal['Coal Source'].str.contains('Australia', case=False, na=False, regex=True)]['GEM Terminal ID'].unique()
mozambique_terminal_ids = terminal[terminal['Coal Source'].str.contains('Mozambique', case=False, na=False, regex=True)]['GEM Terminal ID'].unique()
south_africa_terminal_ids = terminal[terminal['Coal Source'].str.contains('South Africa', case=False, na=False, regex=True)]['GEM Terminal ID'].unique()

# 3️⃣ Filter 'GEM Plant ID' from plant data where 'Coal Source' contains different countries
indonesia_plant_ids = plant[plant['Coal source'].str.contains('Indonesia', case=False, na=False, regex=True)]['GEM location ID'].unique()
australia_plant_ids = plant[plant['Coal source'].str.contains('Australia', case=False, na=False, regex=True)]['GEM location ID'].unique()
mozambique_plant_ids = plant[plant['Coal source'].str.contains('Mozambique', case=False, na=False, regex=True)]['GEM location ID'].unique()
south_africa_plant_ids = plant[plant['Coal source'].str.contains('South Africa', case=False, na=False, regex=True)]['GEM location ID'].unique()

# 4️⃣ Create a new DataFrame to store the expanded matching relationships
expanded_rows = []

# 5️⃣ Iterate through the filtered terminal IDs and duplicate rows to match multiple plant IDs
for terminal_id in indonesia_terminal_ids:
    for plant_id in indonesia_plant_ids:
        expanded_rows.append({'supplier': terminal_id, 'client': plant_id, 'recipe': '', 'product': ''})

for terminal_id in australia_terminal_ids:
    for plant_id in australia_plant_ids:
        expanded_rows.append({'supplier': terminal_id, 'client': plant_id, 'recipe': '', 'product': ''})

for terminal_id in mozambique_terminal_ids:
    for plant_id in mozambique_plant_ids:
        expanded_rows.append({'supplier': terminal_id, 'client': plant_id, 'recipe': '', 'product': ''})

for terminal_id in south_africa_terminal_ids:
    for plant_id in south_africa_plant_ids:
        expanded_rows.append({'supplier': terminal_id, 'client': plant_id, 'recipe': '', 'product': ''})

# 6️⃣ Convert the list to a DataFrame and append to supply_chain
expanded_df = pd.DataFrame(expanded_rows)

# 7️⃣ Append the expanded data to supply_chain
supply_chain_TP = pd.concat([supply_chain_TP, expanded_df], ignore_index=True)

supply_chain_TP


Unnamed: 0,supplier,client,recipe,product
0,T1180,,,
1,T1183,,,
2,T1155,,,
3,T1054,,,
4,T1162,,,
5,T1178,,,
6,T1181,,,
7,T1175,,,
8,T1059,,,
9,T1171,,,


In [151]:
# 1️⃣ Create a new DataFrame to store plant data and remove rows where 'Coal Source' contains specific countries
plant_filtered = plant[
    ~plant['Coal source'].str.contains('Indonesia|Australia|South Africa|Mozambique', case=False, na=False, regex=True)
].copy()

# 2️⃣ Filter only rows where 'Coal Source' contains 'mported' (case insensitive)
plant_filtered = plant_filtered[
    plant['Coal source'].str.contains('mported', case=False, na=False, regex=True)
].copy()

# 3️⃣ Identify suppliers (GEM Terminal ID) in supply_chain that do not have a client
suppliers_without_clients = supply_chain_TP[supply_chain_TP['client'].isna()]['supplier'].unique()

# 4️⃣ Retrieve all GEM Plant IDs from the filtered plant data
filtered_plant_ids = plant_filtered['GEM location ID'].unique()

# 5️⃣ For each supplier without a client, duplicate rows and assign all matching GEM Plant IDs
expanded_rows_imported = []
for supplier in suppliers_without_clients:
    for plant_id in filtered_plant_ids:  # Assign all GEM Plant IDs from plant_filtered only to suppliers without clients
        expanded_rows_imported.append({
            'supplier': supplier,
            'client': plant_id,
            'recipe': '',
            'product': ''
        })

# 6️⃣ Convert to a DataFrame and append to supply_chain
expanded_df_imported = pd.DataFrame(expanded_rows_imported)
supply_chain_TP = pd.concat([supply_chain_TP, expanded_df_imported], ignore_index=True)

# 7️⃣ Remove rows where 'client' is still missing and reset the index
supply_chain_TP = supply_chain_TP.dropna(subset=['client']).reset_index(drop=True)

  plant_filtered = plant_filtered[


In [152]:
# 1️⃣ Create four new columns in the supply_chain dataset
supply_chain_TP['supplier_lat'] = None
supply_chain_TP['supplier_lon'] = None
supply_chain_TP['client_lat'] = None
supply_chain_TP['client_lon'] = None

# 2️⃣ Retrieve latitude and longitude for suppliers using GEM Terminal ID from the terminal dataset (remove duplicates)
supplier_coords = terminal[['GEM Terminal ID', 'Latitude', 'Longitude']].drop_duplicates(subset=['GEM Terminal ID']).set_index('GEM Terminal ID')

# 3️⃣ Perform mapping and expand into two separate columns
supplier_mapped = supply_chain_TP['supplier'].map(supplier_coords.to_dict(orient='index'))
supplier_mapped = supplier_mapped.apply(lambda x: pd.Series([x['Latitude'], x['Longitude']]) if isinstance(x, dict) else pd.Series([None, None]))

# 4️⃣ Assign the values to the supply_chain DataFrame
supply_chain_TP[['supplier_lat', 'supplier_lon']] = supplier_mapped

# 5️⃣ Retrieve latitude and longitude for clients using GEM Location ID from the plant dataset (remove duplicates)
client_coords = plant[['GEM location ID', 'Latitude', 'Longitude']].drop_duplicates(subset=['GEM location ID']).set_index('GEM location ID')

# 6️⃣ Perform mapping and expand into two separate columns
client_mapped = supply_chain_TP['client'].map(client_coords.to_dict(orient='index'))
client_mapped = client_mapped.apply(lambda x: pd.Series([x['Latitude'], x['Longitude']]) if isinstance(x, dict) else pd.Series([None, None]))

# 7️⃣ Assign the values to the supply_chain DataFrame
supply_chain_TP[['client_lat', 'client_lon']] = client_mapped

import numpy as np

# Function to calculate the distance between two latitude-longitude points using the Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth’s radius in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Compute the distance between each supplier-client pair in the supply_chain dataset
supply_chain_TP['distance_km'] = supply_chain_TP.apply(
    lambda row: haversine(row['supplier_lat'], row['supplier_lon'], row['client_lat'], row['client_lon'])
    if not any(pd.isna([row['supplier_lat'], row['supplier_lon'], row['client_lat'], row['client_lon']])) else np.nan,
    axis=1
)

# Retain only rows where the distance is less than 1000 km
supply_chain_TP = supply_chain_TP[supply_chain_TP['distance_km'] < 1000].reset_index(drop=True)

# Retrieve Coal Type from the plant dataset using client ID (GEM location ID)
client_coal_type = plant[['GEM location ID', 'Coal type']].drop_duplicates(subset=['GEM location ID']).set_index('GEM location ID')

# Map the Coal Type values to the 'product' column in supply_chain
supply_chain_TP['product'] = supply_chain_TP['client'].map(client_coal_type['Coal type'])

# Drop latitude, longitude, and distance columns from supply_chain
supply_chain_TP = supply_chain_TP.drop(columns=['supplier_lat', 'supplier_lon', 'client_lat', 'client_lon', 'distance_km'], errors='ignore')

In [None]:
supply_chain = pd.concat([supply_chain_TP, supply_chain_MP])

In [156]:
supply_chain

Unnamed: 0,supplier,client,recipe,product
0,T1160,L100000102537,,subbituminous
1,T1160,L100000102539,,unknown
2,T1160,L100000102055,,unknown
3,T1157,L100000102537,,subbituminous
4,T1157,L100000102539,,unknown
...,...,...,...,...
171,M2822,L100000102228,,lignite
175,M0552,L100000102502,,bituminous
181,M0649,L100000102287,,bituminous
183,M0591,L100000102371,,bituminous


In [155]:
excel_path = "/Users/user/Documents/GitHub/Thesis_2425/Data/coal_supply_chain_india.xlsx"  

with pd.ExcelWriter(excel_path, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
    supply_chain.to_excel(writer, sheet_name="supply_chain", index=False)
