# Set up and packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import geopandas as gpd
import re 
from matplotlib.lines import Line2D
import cartopy.crs as ccrs
import cartopy.feature as cfeature

from sklearn.preprocessing import LabelEncoder

# Start over

# Import data

In [None]:
# Load the combined asset table; the cells below read from and add columns to `df`.
asset_csv = "data/combined_asset_data.csv"
df = pd.read_csv(asset_csv)


# for_deforestation.groupby(['latitude', 'longitude', 'uid_gem'])
# print(len(for_deforestation))
# print(for_deforestation.uid_gem.nunique())



# Deal with missings
We have many missing values for different features. There are a few restrictions we implement on our dataset: 
- remove assets with missing latitude and longitude
- remove assets with missing owner??
- remove assets with missing start year??

# Check missing

In [None]:
# Per-column missing-value summary: absolute count and share of rows.
# The null mask is computed once and reused for both statistics.
null_mask = df.isnull()
missings = pd.DataFrame([null_mask.sum(), null_mask.mean()]).T
missings.columns = ["count", "share"]
missings

# Deal with owner information

In [None]:
# Reproducible random draw of 30 rows (fixed seed) for eyeballing the records.
sample = df.sample(random_state=62442, n=30)
sample

In [None]:
# Number of distinct owner names before normalisation (a missing value counts as one entry).
len(pd.unique(df["owner_name"]))

In [None]:
# Normalise owner names to upper case so differently-cased spellings compare equal.
df["owner_name"] = df["owner_name"].str.upper()
df.head()

In [None]:
# Ten owners with the most assets (count of `uid` per `owner_name`), largest first.
df_owner = (
    df.groupby('owner_name')['uid']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
companies = list(df_owner['owner_name'])
assets = list(df_owner['uid'])

# One bubble per owner on a single row; the marker area scales with the asset count.
for position, owner_row in enumerate(df_owner.itertuples(index=False)):
    plt.scatter(
        position,
        0,
        s=owner_row.uid * 10,
        alpha=0.5,
        label=f'{owner_row.owner_name}\nAssets: {owner_row.uid}',
        facecolor='green',
        edgecolor='black',
    )

# Adding text labels for each company and asset count
# plt.annotate('\n'.join(companies), xy=(range(len(companies)), [0]*len(companies)), ha='center', va='center', family='monospace')
# plt.annotate('\n'.join([str(a) for a in assets]), xy=(range(len(companies)), [0]*len(companies)), ha='center', va='center', family='monospace')

# Hide the axes: only the bubbles carry information here.
plt.axis('off')

# # Show plot
# plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
# plt.title('Assets by Company', family='monospace')
# plt.tight_layout()
# plt.show()

# # df_owner[0]
# df_owner

# Encode sector

In [None]:
# Split `sector` at the first "/" into a main part and an optional sub part.
sector_parts = df["sector"].str.split("/", n=1, expand=True)
df["sector_main"] = sector_parts[0]
df["sector_sub"] = sector_parts[1]

# Re-check missing values now that the two new columns exist.
df.isnull().sum()

In [None]:
# Turn the sector labels into integer codes.
# The same encoder object is re-fitted for each column, so after this cell it
# holds the classes of the last column fitted (`sector`).
label_encoder = LabelEncoder()

for source_col, code_col in (("sector_main", "sector_main_num"), ("sector", "sector_num")):
    df[code_col] = label_encoder.fit_transform(df[source_col])
# type(df['sector_num'][0])

# Maps for presentation

## World map

In [None]:
# World basemap for context.
# NOTE(review): `gpd.datasets` is deprecated in newer geopandas releases; if this
# call fails, read a local Natural Earth countries file instead. Confirm against
# the installed geopandas version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Create a GeoDataFrame with one point geometry per asset (x = longitude, y = latitude)
df_geo = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['longitude'], df['latitude']))

# Plot the world basemap
ax = world.plot(figsize=(10, 6), color='lightgray')

# Plot the assets on top, coloured by the numeric main-sector code ('sector_main_num').
# `legend=True` has no effect here because no `column=` is passed, so the
# legend is built by hand below. The call returns the Axes, not the scatter artist.
scatter = df_geo.plot(ax=ax, c=df_geo['sector_main_num'], marker='o', markersize=1, legend=True)

# prepare legend: one row per (code, label) pair
legend_df = df_geo[['sector_main_num', 'sector_main']].drop_duplicates()

# The asset points are the collection added last; reuse its colormap and
# normalisation so every legend swatch matches the colour drawn on the map.
asset_points = ax.collections[-1]
legend_handles = [
    Line2D(
        [0], [0],
        marker='o',
        linestyle='',
        markersize=6,
        color=asset_points.cmap(float(asset_points.norm(code))),
        label=str(name),
    )
    for code, name in legend_df.sort_values('sector_main_num').itertuples(index=False)
]
if legend_handles:
    # Place the legend outside the map so it does not hide any assets.
    ax.legend(
        handles=legend_handles,
        title='Sector',
        loc='center left',
        bbox_to_anchor=(1.0, 0.5),
        fontsize='small',
    )

# Customize the plot
ax.set_title('Assets by sector')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

# Show the plot
plt.show()

## Map of Brazil


In [None]:
# Keep only the assets whose country is Brazil.
brazil_assets = df.loc[df['country'].eq('Brazil')]

# Square figure with a single plate-carree (lon/lat) map axis.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection=ccrs.PlateCarree())

# Map window, given as (lon_min, lon_max, lat_min, lat_max).
map_extent = [-75, -30, -35, 5]
ax.set_extent(map_extent)

# Draw the assets, coloured by their numeric main-sector code.
ax.scatter(
    brazil_assets['longitude'],
    brazil_assets['latitude'],
    c=brazil_assets['sector_main_num'],
    marker='o',
    s=5,
    transform=ccrs.PlateCarree(),
)

# Geographic context: coastlines and country borders.
ax.coastlines()
ax.add_feature(cfeature.BORDERS, linestyle='-', linewidth=1)

# Title and axis labels.
ax.set_title('Assets in Brazil')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

# Render the figure.
plt.show()

In [None]:

# NOTE(review): this cell expects `df_owner` to have `main_owner` and `asset_name`
# columns. The `df_owner` built in the owner cell of this notebook only has
# `owner_name` and `uid`, so this fails on a fresh kernel unless `df_owner` is
# redefined elsewhere. Confirm which frame is intended.

# Number of assets per main owner; reused for the chart and the summary statistics.
assets_per_owner = df_owner.groupby('main_owner')['asset_name'].count()

# Take the 20 largest owners, then order ascending so the biggest bar is drawn on top.
top_owners = (
    assets_per_owner
    .reset_index()
    .sort_values('asset_name', ascending=False)
    .head(20)
    .sort_values('asset_name')
)

plt.barh(top_owners['main_owner'], top_owners['asset_name'])
plt.xticks(rotation=90)
plt.title('Top 20 (main) owners by number of assets')
plt.show()

# Distribution of assets per owner across all owners, not just the top 20.
assets_per_owner.describe()

In [None]:

# Assets per sector, sorted ascending so the largest sector is drawn at the top.
sector_counts = df.groupby('sector')['asset_name'].count()
top_sectors = sector_counts.reset_index().sort_values('asset_name')

plt.barh(top_sectors['sector'], top_sectors['asset_name'])
plt.xticks(rotation=90)
plt.title('Sectors by number of assets')
plt.show()

# Number of distinct sectors in the data.
df['sector'].nunique()

In [None]:
# Assets per country; after the ascending sort the last 20 rows are the 20 largest.
country_counts = df.groupby('country')['asset_name'].count()
top_countries = country_counts.reset_index().sort_values('asset_name').tail(20)

plt.barh(top_countries['country'], top_countries['asset_name'])
plt.xticks(rotation=90)
plt.title('Top 20 countries by number of assets')
plt.show()

# Progress summary: 

GEM data: 
- downloaded asset data from GEM (Global Energy Monitor) about 12 types of energy-related assets
- cleaned each separate dataset (due to idiosyncrasies between the sets, had to do each one separately)
- for each dataset: kept asset name, sector, owner(s), geographic location, start date, and if available included capacity 

EDA: 
- we focus only on assets where location, owner and start date are known
- in this dataset, we have about 45k assets, attributed to more than 12k owners
- majority of owners (more than 75%) own only 1-2 assets
- assets are located around the globe, with most of them present in China, US, India, Brazil, Spain and Germany
- the sector wind power has the highest number of associated assets (~10k), followed by solar (~7k), and coal power plant (~6k)

The next steps are: 
- clean in a similar fashion the mining data from the Climate Trace source
- overlay asset information with the deforestation data using latitude/longitude info
- consider how to feature engineer information about owners in any ML exercise on predicting deforestation

In [None]:
# Scratch example: a dict of lists, indexed first by key and then by position.
trial = dict(key1=[14, 44], key2=[12, 84])

first_key1_value = trial['key1'][0]
print(first_key1_value)