In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

    
df_gem = pd.read_csv("data/loaded_asset/asset_level_open_source_gem.csv", low_memory=False)

# coerce start year into numerical format
for var in ['start_year', 'capacity']:
    df_gem[var] = pd.to_numeric(df_gem[var], errors='coerce') 

# drop all observations with a missing start_year
df_gem = df_gem[df_gem.start_year.notnull()].reset_index()

# drop all observations with a start year outside of 2001-2022
df_gem = df_gem[df_gem.start_year.between(2001, 2023)]

# main and sub sector
df_gem[['sector_main', 'sector_sub']] = df_gem.sector.str.split("/", expand = True, n = 1)
df_gem.isnull().sum()

# encode subsectors
label_encoder = LabelEncoder()
df_gem['sector_main_num'] = label_encoder.fit_transform(df_gem['sector_main'])
df_gem['sector_num'] = label_encoder.fit_transform(df_gem['sector'])

# keep only relevant columns
cols_to_keep = ['latitude', 'longitude', 'uid_gem', 'sector_main', 
                'sector_sub', 'sector_main_num', 
                'start_year', 'capacity', 'capacity_unit', 'asset_name', 'owner_name', 'country']

df_gem = df_gem[cols_to_keep]

df_gem.head()

Unnamed: 0,latitude,longitude,uid_gem,sector_main,sector_sub,sector_main_num,start_year,capacity,capacity_unit,asset_name,owner_name,country
0,28.4624,-0.0576,L900124,wind power,onshore,11,2014.0,10.0,mw,Kabertene wind farm,Shariket Kahraba wa Taket Moutadjadida (SKTM),Algeria
1,25.8577,34.4182,L900045,wind power,onshore,11,2018.0,240.0,mw,Gulf Of Ziet Wind Complex,New and Renewable Energy Authority (NREA),Egypt
2,25.8577,34.4182,L900045,wind power,onshore,11,2018.0,220.0,mw,Gulf Of Ziet Wind Complex,New and Renewable Energy Authority (NREA),Egypt
3,25.8577,34.4182,L900045,wind power,onshore,11,2018.0,120.0,mw,Gulf Of Ziet Wind Complex,New and Renewable Energy Authority (NREA),Egypt
4,28.4005,32.9572,L900026,wind power,onshore,11,2019.0,263.0,mw,Ras Ghareb wind farm,Engie SA,Egypt


In [24]:
# step 1: aggregate unit-specific information by uid_gem

cols_for_agg = ['capacity', 'start_year', 'sector_sub', 'uid_gem']

# 1a: first observaitons 
df_gem_first = df_gem[cols_for_agg].groupby('uid_gem').nth(0) \
                    .rename(columns={'start_year': 'start_year_first', 
                                        'capacity': 'capacity_first', 
                                        'sector_sub': 'sector_sub_first'}).reset_index()

assert(len(df_gem_first) == df_gem_first.uid_gem.nunique())

# 1b: list of info for subsequent units
df_gem_list = df_gem[cols_for_agg].groupby('uid_gem').agg(list).reset_index()

assert(len(df_gem_list) == df_gem_list.uid_gem.nunique())

# step 2: keep non-changing information about each asset

invariant_cols = ['latitude', 'longitude', 'uid_gem', 'sector_main', 'sector_main_num', 
            'capacity_unit', 'country']

df_gem_invariant = df_gem[invariant_cols].drop_duplicates('uid_gem', keep='first')

assert(len(df_gem_invariant) == df_gem_invariant.uid_gem.nunique())


In [31]:
# merge aggregated datasets 
df_gem = pd.merge(df_gem_invariant, df_gem_first, on = 'uid_gem')
df_gem = pd.merge(df_gem, df_gem_list, on = 'uid_gem').reset_index(drop=True)

# retrieve number of units within an asset
df_gem['number_units'] = df_gem.start_year.apply(lambda x: len(x))

# check lenght of data 
assert(len(df_gem) == df_gem.uid_gem.nunique())

# export data
df_gem.to_csv('data/assets_for_deforestation.csv', index=False, sep='\t', encoding='utf-8')



Unnamed: 0,latitude,longitude,uid_gem,sector_main,sector_main_num,capacity_unit,country,index,capacity_first,start_year_first,sector_sub_first,capacity,start_year,sector_sub,number_units
0,28.4624,-0.0576,L900124,wind power,11,mw,Algeria,0,10.0,2014.0,onshore,[10.0],[2014.0],[onshore],1
1,25.8577,34.4182,L900045,wind power,11,mw,Egypt,1,240.0,2018.0,onshore,"[240.0, 220.0, 120.0]","[2018.0, 2018.0, 2018.0]","[onshore, onshore, onshore]",3
2,28.4005,32.9572,L900026,wind power,11,mw,Egypt,4,263.0,2019.0,onshore,[263.0],[2019.0],[onshore],1
3,28.1338,33.2602,L900035,wind power,11,mw,Egypt,5,252.0,2021.0,onshore,[252.0],[2021.0],[onshore],1
4,29.1988,32.6210,L900044,wind power,11,mw,Egypt,7,33.0,2001.0,onshore,"[33.0, 30.0, 47.0, 85.0, 80.0, 120.0]","[2001.0, 2003.0, 2004.0, 2006.0, 2008.0, 2009.0]","[onshore, onshore, onshore, onshore, onshore, ...",6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24462,-29.0175,153.4450,L200402,bioenergy,1,mw,Australia,37866,38.0,2007.0,,[38.0],[2007.0],[None],1
24463,-28.3107,153.4361,L200403,bioenergy,1,mw,Australia,37867,30.0,2007.0,,[30.0],[2007.0],[None],1
24464,-19.5574,147.3308,L200405,bioenergy,1,mw,Australia,37869,68.0,2010.0,,[68.0],[2010.0],[None],1
24465,-21.1647,149.1348,L201417,bioenergy,1,mw,Australia,37870,40.0,2013.0,,[40.0],[2013.0],[None],1
