# Set up and packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import geopandas as gpd
import re 

from sklearn.preprocessing import LabelEncoder

# Import data

In [None]:
# Load the cleaned, tab-separated GEM asset table and list its columns
df = pd.read_csv("data/cleaned/gem_data.csv", sep="\t")
df.columns

# Deal with missing values
We have many missing values for different features. There are a few restrictions we implement on our dataset: 
- remove assets with missing latitude and longitude
- remove assets with missing owner??
- remove assets with missing start year??

In [None]:
# Check missing values: missingno bar chart of data completeness for every column of df
msno.bar(df)

In [None]:
# Per-column missing-value summary: number of nulls ("count") and fraction of nulls ("share")
null_mask = df.isnull()
null_stats = pd.DataFrame([null_mask.sum(), null_mask.mean()])
missings = null_stats.T.rename(columns={0: "count", 1: "share"})
missings

In [None]:
# Drop columns that are not used in the analysis. DataFrame.drop returns a new
# frame, so the result has to be assigned back for the drop to take effect.
df = df.drop(columns=['construction_start_year', 'proposal_year', 'construction_year', 'discovery_year'])

# Drop observations which have missing latitude, longitude, owner or start year.
# Each entry of crucial_vars is one dropna step, so the printout shows how many
# rows remain after each requirement is applied.
crucial_vars = [['latitude', 'longitude'], ['owner'], ['start_year']]
print(f"N at the start: {len(df)}")

for var in crucial_vars:
    df = df.dropna(subset=var)
    print(f"N after dropping {var}: {len(df)}")

# Renumber the remaining rows 0..N-1 and keep the result; drop=True avoids
# storing the old row labels as an extra "index" column.
df = df.reset_index(drop=True)
df

# Deal with owner information

In [None]:
# Number of distinct values in the owner column
distinct_owners = df["owner"].unique()
len(distinct_owners)

In [None]:
# Owners are stored as one comma-separated string per asset. Split them into one
# column per listed owner; add_prefix names them owner_0, owner_1, ... for however
# many columns the split produces.
# NOTE(review): an owner name that itself contains ", " would be split as well — confirm against the data.
splits = df.owner.str.split(', ', expand=True).add_prefix("owner_")

df_owner = pd.concat([df, splits], axis=1)

# Treat the first listed owner as the main owner and strip a share annotation
# such as "[50.0%]" or "(50.0%)" from its name. The pattern is a raw string so
# the backslashes reach the regex engine without invalid-escape warnings.
# NOTE(review): the pattern requires a decimal point, so "[50%]" is left in place — confirm this is intended.
df_owner['main_owner'] = df_owner.owner_0.str.replace(r"[\[\(]\d*[.]\d*[%][\]\)]", '', regex=True).str.strip()
df_owner

# Encode sector

In [None]:
# Map every sector label to an integer code and keep the codes in a new column

label_encoder = LabelEncoder()
sector_codes = label_encoder.fit_transform(df["sector"])

df["sector_num"] = sector_codes
df.head(2)

In [None]:
# Store the sector codes as 32-bit integers. The cast is applied to df because
# df_geo is only created in the mapping cell below; the GeoDataFrame built there
# from df carries the column along.
df['sector_num'] = df['sector_num'].astype(np.int32)
# .iloc reads the first row by position, so it does not depend on the index labels
type(df['sector_num'].iloc[0])

In [None]:
# World basemap for context
# NOTE(review): gpd.datasets is deprecated since GeoPandas 0.14 and removed in 1.0;
# on newer versions the Natural Earth file has to be read from a local copy instead.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Create a GeoDataFrame with point geometries from longitude (x) and latitude (y).
# The CRS is set to lon/lat degrees (EPSG:4326), which is what overlaying the
# points on the basemap already assumes.
df_geo = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['longitude'], df['latitude']),
    crs="EPSG:4326",
)

# Plot the world basemap
ax = world.plot(figsize=(10, 6), color='lightgray')

# Plot the assets on top, coloured by sector. Passing the column lets GeoPandas
# build the categorical legend itself, so every legend entry is labelled with
# the sector name it belongs to. tab20 offers 20 distinct colours.
df_geo.plot(
    ax=ax,
    column='sector',
    categorical=True,
    cmap='tab20',
    marker='o',
    markersize=1,
    legend=True,
    legend_kwds={'title': 'Sector', 'loc': 'upper left'},
)

# Lookup table between the numeric sector codes and the sector names
legend_df = df_geo[['sector_num', 'sector']].drop_duplicates()

# Customize the plot
plt.title('Assets by sector')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# Show the plot
plt.show()
legend_df

In [None]:

# Number of assets per main owner (counts non-null asset_name values)
owner_counts = df_owner.groupby('main_owner').asset_name.count()

# Keep the 20 owners with the most assets, then order them ascending so the
# largest owner is drawn as the top bar of the horizontal chart
top_owners = (
    owner_counts.reset_index()
    .sort_values('asset_name', ascending=False)
    .head(20)
    .sort_values('asset_name')
)

plt.barh(top_owners.main_owner, top_owners.asset_name)
plt.xticks(rotation=90)
plt.title('Top 20 (main) owners by number of assets')
plt.show()

# Summary statistics of the number of assets per main owner
owner_counts.describe()

In [None]:

# Number of assets per sector, ordered ascending for the horizontal bar chart
sector_counts = df.groupby('sector').asset_name.count()
top_sectors = sector_counts.reset_index().sort_values('asset_name')

plt.barh(top_sectors.sector, top_sectors.asset_name)
plt.xticks(rotation=90)
plt.title('Sectors by number of assets')
plt.show()

# Number of distinct sectors in the data
df.sector.nunique()

In [None]:
# Number of assets per country; after the ascending sort the last 20 rows are
# the countries with the most assets
country_counts = df.groupby(['country']).asset_name.count()
top_countries = country_counts.reset_index().sort_values('asset_name').tail(20)

plt.barh(top_countries.country, top_countries.asset_name)
plt.xticks(rotation=90)
plt.title('Top 20 countries by number of assets')
plt.show()

# Progress summary: 

GEM data: 
- downloaded asset data from GEM (Global Energy Monitor) about 12 types of energy-related assets
- cleaned each dataset separately (necessary because of idiosyncrasies between the datasets)
- for each dataset: kept asset name, sector, owner(s), geographic location, start date, and if available included capacity 

EDA: 
- we focus only on assets where location, owner and start date are known
- in this dataset, we have about 45k assets, attributed to more than 12k owners
- the majority of owners (more than 75%) own only 1-2 assets
- assets are located around the globe, with most of them in China, US, India, Brazil, Spain and Germany
- the sector wind power has the highest number of associated assets (~10k), followed by solar (~7k), and coal power plant (~6k)

The next steps are: 
- clean in a similar fashion the mining data from the Climate Trace source
- overlay asset information with the deforestation data using latitude/longitude info
- consider how to feature-engineer information about owners in any ML exercise on predicting deforestation