# Setup and packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import geopandas as gpd
import re 
from matplotlib.lines import Line2D
from matplotlib import font_manager
import matplotlib
import cartopy.crs as ccrs
import cartopy.feature as cfeature

from sklearn.preprocessing import LabelEncoder

# Shared plot styling: a dark-green RGBA hex colour and the monospace font
# family used for tick labels.
forest = '#284e13ff'
font_spec = 'Courier New'
xticks_font = font_manager.FontProperties(family=font_spec)

# Load data

In [None]:
# Read the tab-separated asset table and preview the first rows.
df = pd.read_csv(
    "data/assets_for_deforestation.csv",
    sep="\t",
)
df.head()

# Restrict dataset

In [None]:
# Keep only assets inside the study window: latitude in [-30, -10] and
# longitude in [-60, -40] (`between` includes both bounds by default).
# `.copy()` makes the subset an independent frame, so adding columns to it
# afterwards does not raise pandas' SettingWithCopyWarning.
df = df[df.latitude.between(-30, -10) & df.longitude.between(-60, -40)].copy()

# Check for missing values

In [None]:
# Number of missing values per column (author's note: none at this stage).
df.isna().sum()

# Encode sector

In [None]:
# Split each "main/sub" sector string at the first slash only; values with no
# slash get a missing `sector_sub`.
df[['sector_main', 'sector_sub']] = df['sector'].str.split(
    "/", n=1, expand=True
)
# Re-count missing values per column now that the split columns exist.
df.isna().sum()

In [None]:
# check which sectors have missing subsectors
# (printed explicitly: a notebook cell only displays its last expression, and
# this cell ends with an assert, so bare expressions here would show nothing)
missing_sub = df['sector_sub'].isnull()
print(df[missing_sub].groupby('sector').sector.count())

# check the share of missings within the affected main sectors
# (computed from the data instead of hard-coding sector names)
missing_share = missing_sub.groupby(df['sector_main']).mean()
print(missing_share[missing_share > 0])
    # NOTE: the original check looked at 'steel' and 'bioenergy' and found the
    # sub-sector _always_ missing there (share 1.0) --> impute a category
    # where sector_sub == sector_main. Re-verify if the data changes.

df['sector_sub'] = df['sector_sub'].fillna(df['sector_main'])

# check that there are no missings (in any column)
assert df.isnull().sum().sum() == 0, "missing values remain after imputing sector_sub"

In [None]:
# Turn the sector labels into integer codes.

# One encoder is reused for both columns; each fit_transform call refits it,
# so after this cell it holds the classes of the full `sector` column.
label_encoder = LabelEncoder()

for source_col, code_col in (
    ('sector_main', 'sector_main_num'),
    ('sector', 'sector_num'),
):
    df[code_col] = label_encoder.fit_transform(df[source_col])

# Preview the frame with the new code columns.
df.head()

# Summary statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the frame.
df.describe()

In [None]:
# Preview the first rows of the working frame.
df.head()

# Maps for presentation

## Map of Brazil


In [None]:
# Draw a plate-carree map around Brazil and place the assets on it.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
# Map window given as (lon_min, lon_max, lat_min, lat_max).
ax.set_extent([-70, -30, -40, 0])

# One small dot per asset, coloured by the integer code of its main sector.
ax.scatter(
    df['longitude'],
    df['latitude'],
    c=df['sector_main_num'],
    marker='o',
    s=5,
    transform=ccrs.PlateCarree(),
)

# Base-map layers: coastlines and national borders.
ax.coastlines()
ax.add_feature(cfeature.BORDERS, linestyle='-', linewidth=1)

# Closed outline of the study window (corners at lon -60/-40, lat -30/-10).
# The Geodetic transform draws the edges as great-circle arcs; drop the
# `transform` argument to get straight edges instead.
box_lons = [-60, -40, -40, -60, -60]
box_lats = [-30, -30, -10, -10, -30]
ax.plot(
    box_lons,
    box_lats,
    color=forest,
    linewidth=1,
    marker='',
    transform=ccrs.Geodetic(),
)

# Title and axis labels.
ax.set_title('Assets of focus')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

# Render the figure in the notebook ...
plt.show()

# ... and write it to disk with a tight bounding box.
fig.savefig('graphs/map_assets_of_interest.png', bbox_inches='tight')

# Summary stats on a graph

In [None]:

# Count assets (non-null `uid_gem`) per main sector, sorted ascending so the
# largest sector is drawn as the top bar.
top_sectors = (
    df.groupby('sector_main')['uid_gem']
    .count()
    .reset_index()
    .sort_values('uid_gem')
)
top_sectors

# Horizontal bar chart of the counts in the shared forest colour.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
ax.barh(top_sectors['sector_main'], top_sectors['uid_gem'], color=forest)
ax.set_title('Sectors by number of assets')
plt.show()

fig.savefig('graphs/hbar_sectors.png', bbox_inches='tight')

In [None]:
# normalize capacity within group

def normalize_group(sector_main):
    """Return one group's rows with a z-scored ``capacity_norm`` column added.

    Parameters
    ----------
    sector_main : pandas.DataFrame
        The rows of a single group (despite the name, this is the group's
        frame, not a label). Must contain a numeric ``capacity_first`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns plus ``capacity_norm``, computed
        as ``(capacity_first - mean) / std`` within the group. The input frame
        is left untouched, because pandas does not support functions that
        mutate the object handed to them by ``groupby(...).apply``.

    Notes
    -----
    ``std`` is the pandas default (sample standard deviation). A group with a
    single row or with constant capacity therefore yields NaN in
    ``capacity_norm``.
    """
    capacity = sector_main['capacity_first']
    z_scores = (capacity - capacity.mean()) / capacity.std()
    # `assign` builds a new frame instead of writing into the caller's group.
    return sector_main.assign(capacity_norm=z_scores)

# Apply the function to each group and stack the results in sector order.
# The groups are iterated explicitly instead of using `groupby(...).apply`:
# apply's handling of the grouping column is deprecated (newer pandas excludes
# it from the frame passed to the function), which would drop the
# `sector_main` column that the plot needs for `hue`. Each group is copied
# first so the function never writes into a slice of `df`.
normalized_data = pd.concat(
    [normalize_group(group.copy()) for _, group in df.groupby('sector_main')],
    ignore_index=True,
)

# One density curve of normalized capacity per main sector.
fig = plt.figure(figsize=(5, 5))
sns.kdeplot(data=normalized_data, x='capacity_norm', hue='sector_main')
plt.xlabel('capacity (normalized)')
plt.ylabel('density')
plt.title('Capacity (normalized), by sector')


fig.savefig('graphs/kde_sectors.png', bbox_inches='tight')