In [217]:
# basics
import pandas as pd
import numpy as np

# maps
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

# pipeline
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# prep
from sklearn.model_selection import train_test_split

# feature engineering
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# modelling
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

# evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [218]:
# Load data/assets_with_deforestation.csv (tab-separated).
raw_df = pd.read_csv('data/assets_with_deforestation.csv', sep = '\t')

# Display only the rows with a positive value in the '2015' column.
# (A bare `raw_df` before this line produced no output: only the last
# expression of a cell is rendered.)
raw_df[raw_df['2015'] > 0]


Unnamed: 0,uid_gem,latitude,longitude,sector_main,sector_main_num,capacity_unit,country,asset_name,owner_name,index,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,treecover2000
146,L905350,-7.3047,-40.5301,wind power,11,mw,Brazil,Caldeirão Grande wind farm,Ibitu Energia,337,...,0.0,0.016529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0
157,L905082,-7.3724,-40.7072,wind power,11,mw,Brazil,Chapada Do Piaui 3 wind farm,Contour Global Do Brasil Holding LTDA,417,...,0.017447,0.007346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0
205,L905202,-8.5198,-41.5584,wind power,11,mw,Brazil,Lagoa Dos Ventos wind farm,Enel Green Power Brasil Participações LTDA,631,...,0.0,0.002755,0.0,0.0,0.0,0.02663,0.0,0.0,0.0,0.0
226,L905442,-9.8478,-41.0538,wind power,11,mw,Brazil,Pedra Branca wind farm,Brennand Energia,721,...,0.010101,0.013774,0.023875,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283,L905506,-8.0065,-40.6378,wind power,11,mw,Brazil,Ventos Do Piauí 1 wind farm,Votorantim Geração de Energia SA,985,...,0.0,0.00551,0.012856,0.039486,0.0,0.0,0.0,0.0,0.0,0.0
11743,SBR00014,-4.874373,-47.407352,steel,10,total tonnes per annum,Brazil,AVB Açailândia steel plant,Aco Verde do Brasil SA,14954,...,0.011019,0.010101,0.02112,0.00551,0.0,0.000918,0.019284,0.0,0.003673,0.0
12020,L807871,-7.3886,-40.5724,solar power,9,"mw (peak value, grid connected, or unknown)",Brazil,Caldeirao Grande 2 Solar Complex,Ibitu Energia,15529,...,0.0,0.012856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
12021,L831677,-7.395,-40.5836,solar power,9,"mw (peak value, grid connected, or unknown)",Brazil,Caldeirão Grande solar farm,Cemig Geração e Transmissão SA,15530,...,0.0,0.03214,0.0,0.0,0.0,0.0,0.0,0.0,0.02663,0.0


In [219]:
# Read the tab-separated regression sample and preview its first five rows.
df = pd.read_csv(
    'data/regression_sample.csv',
    sep="\t",
)
df.head(n=5)


Unnamed: 0,uid_gem,latitude,longitude,sector_main,sector_main_num,capacity_unit,country,asset_name,owner_name,index,...,defo_total,y1,y3,y5,y7,defo_y1,defo_y3,defo_y5,defo_y7,quintile_capacity
0,T0208,25.809175,-80.296697,LNG terminal,0,bcf/d,United States,American LNG Hialeah Terminal,Fortress Investment Group,36351,...,0.0,1,3,5,7,0.0,0.0,0.0,0.0,-1
1,T0727,59.313521,5.285961,LNG terminal,0,mtpa,Norway,Snurrevarden LNG Terminal,Equinor [unknown %],36563,...,0.0,1,3,5,6,0.0,0.0,0.0,,-1
2,T0728,60.548447,4.834402,LNG terminal,0,mtpa,Norway,Kollsnes LNG Terminal,Equinor [unknown %],36564,...,0.0,1,3,5,6,0.0,0.0,0.0,,-1
3,T0438,60.518314,27.16082,LNG terminal,0,mtpa,Finland,Hamina LNG Terminal,Alexela [unknown %],36330,...,0.0,1,2,3,4,0.0,,,,-1
4,T0440,61.63817,21.39881,LNG terminal,0,mtpa,Finland,Pori LNG Terminal,Gasum,36318,...,0.0,1,3,5,7,0.0,0.0,0.0,0.0,-1


# Missing treecover investigation

In [220]:
# df[df.treecover2000 >= 0].groupby('country').count()

In [221]:
# missing_tree = df[df.treecover2000.isnull()]
# missing_tree

# lats = missing_tree.latitude
# lons = missing_tree.longitude
# values = missing_tree.sector_main_num

# fig = plt.figure(figsize=(15, 8))
# ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
# ax.stock_img()
# ax.scatter(lons, lats, c=values, cmap='viridis', transform=ccrs.Geodetic())
# cbar = plt.colorbar(ax.scatter(lons, lats, c=values, cmap='viridis', marker='o', s=5, alpha=0.5, transform=ccrs.Geodetic()), ax=ax)
# cbar.set_label('Values')


# Simple modelling

In [222]:
# List the columns of the regression sample to choose features and targets from.
df.columns

Index(['uid_gem', 'latitude', 'longitude', 'sector_main', 'sector_main_num',
       'capacity_unit', 'country', 'asset_name', 'owner_name', 'index',
       'capacity_first', 'start_year_first', 'sector_sub_first', 'capacity',
       'start_year', 'sector_sub', 'number_units', 'row', 'col',
       'treecover2000', 'defo_total', 'y1', 'y3', 'y5', 'y7', 'defo_y1',
       'defo_y3', 'defo_y5', 'defo_y7', 'quintile_capacity'],
      dtype='object')

In [230]:
# Horizon (in years) and the two column names derived from it.
yr = 5
yr_col = f'y{yr}'
defo_col = f'defo_y{yr}'

# Keep the rows whose horizon column equals `yr`, restricted to Brazil.
df_pred = df.loc[(df[yr_col] == yr) & (df.country == 'Brazil')]

# Feature matrix and target. Because of the Brazil filter above, 'country'
# holds a single value in X; it stays in the list because the preprocessing
# step selects that column by name.
X_cols = ['sector_main_num', 'number_units', 'start_year_first', 'country'] #, 'sector_sub_first']
X = df_pred.loc[:, X_cols]
y = df_pred.loc[:, defo_col]

# 70/30 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [224]:
# Number of rows in the training split.
# NOTE(review): the recorded output of this cell comes from an execution that
# ran before the latest run of the cell defining X_train (execution count 224
# vs 230) - re-run to confirm the number still holds.
len(X_train)


287

# Linear regression

In [231]:
# Create a ColumnTransformer to handle string data: 'country' is one-hot
# encoded, every other column is passed through unchanged.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising at predict time, so the pipeline keeps working
# if the test split contains a value missing from the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('country', OneHotEncoder(handle_unknown='ignore'), ['country']),
        # ('sector_sub_first', OneHotEncoder(handle_unknown='ignore'), ['sector_sub_first']),
    ],
    remainder='passthrough'
)

# Create the pipeline: preprocessing followed by ordinary least squares.
lm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit on the training split; the fitted pipeline is the cell's displayed value.
lm.fit(X_train, y_train)

# XGBoost

In [232]:
# Give the XGBoost pipeline its own unfitted copy of the preprocessor.
# Pipeline does not copy its steps, so passing `preprocessor` directly would make
# `lm` and `xgbm` hold the very same transformer object, and fitting one pipeline
# would refit the transformer used by the other.
xgbm = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', XGBRegressor())
])

# Fit on the training split; the fitted pipeline is the cell's displayed value.
xgbm.fit(X_train, y_train)


# Evaluate 

In [233]:

def score_model(model, X, y):
    """Predict with ``model`` on ``X`` and score the predictions against ``y``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : features to predict on
    y : true target values aligned with ``X``

    Returns
    -------
    tuple
        ``(y_pred, mse, r2)``: the predictions, their mean squared error and
        their R^2 score.
    """
    y_pred = model.predict(X)
    return y_pred, mean_squared_error(y, y_pred), r2_score(y, y_pred)


# MSE is printed in scientific notation: rounding it to three decimals left a
# single significant digit, which hides the differences between the models.

# Linear regression on the held-out split.
y_pred, mse, r2 = score_model(lm, X_test, y_test)
print(f"Linreg output: r2 of {round(r2, 3)}, mse of {mse:.3e}")

# ---------------------------------------------------------
print("----" * 10)

print("XGB:")

# XGBoost on the held-out split.
y_pred, mse, r2 = score_model(xgbm, X_test, y_test)
print(f"XGB output: r2 of {round(r2, 3)}, mse of {mse:.3e}")

print("--- on train ---")

# XGBoost on the training split, for comparison with the held-out scores.
y_pred, mse, r2 = score_model(xgbm, X_train, y_train)
print(f"r2 of {round(r2, 3)}, mse of {mse:.3e}")

Linreg output: r2 of 0.034, mse of 0.003
----------------------------------------
XGB:
XGB output: r2 of 0.452, mse of 0.002
--- on train ---
r2 of 0.308, mse of 0.004
