In [217]:
# basics
import pandas as pd
import numpy as np

# maps
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

# pipeline
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# prep
from sklearn.model_selection import train_test_split

# feature engineering
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# modelling
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

# evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [16]:
# Load the full asset table; the file is tab-separated despite its .csv name.
assets_path = 'data/assets_with_deforestation.csv'
raw_df = pd.read_csv(assets_path, sep='\t')

# Optional filter kept for reference (rows with a positive '2015' value):
# raw_df[raw_df['2015'] > 0]
raw_df


Unnamed: 0,uid_gem,latitude,longitude,sector_main,sector_main_num,capacity_unit,country,asset_name,owner_name,index,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,treecover2000
0,L900124,28.4624,-0.0576,wind power,11,mw,Algeria,Kabertene wind farm,Shariket Kahraba wa Taket Moutadjadida (SKTM),0,...,,,,,,,,,,
1,L900045,25.8577,34.4182,wind power,11,mw,Egypt,Gulf Of Ziet Wind Complex,New and Renewable Energy Authority (NREA),1,...,,,,,,,,,,
2,L900026,28.4005,32.9572,wind power,11,mw,Egypt,Ras Ghareb wind farm,Engie SA,4,...,,,,,,,,,,
3,L900035,28.1338,33.2602,wind power,11,mw,Egypt,West Bakr wind farm,Lekela Power,5,...,,,,,,,,,,
4,L900044,29.1988,32.6210,wind power,11,mw,Egypt,Zafarana wind farm,New and Renewable Energy Authority (NREA),7,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24462,L200402,-29.0175,153.4450,bioenergy,1,mw,Australia,Broadwater power station,Capital Dynamics,37866,...,,,,,,,,,,
24463,L200403,-28.3107,153.4361,bioenergy,1,mw,Australia,Condong Cogeneration power station,Capital Dynamics,37867,...,,,,,,,,,,
24464,L200405,-19.5574,147.3308,bioenergy,1,mw,Australia,Pioneer Sugar Mill power station,Wilmar,37869,...,,,,,,,,,,
24465,L201417,-21.1647,149.1348,bioenergy,1,mw,Australia,Racecourse Mill power station,Mackay Sugar,37870,...,,,,,,,,,,


In [17]:
# Load the tab-separated regression sample and preview its first rows.
sample_path = 'data/regression_sample.csv'
df = pd.read_csv(sample_path, sep="\t")
df.head()


Unnamed: 0,uid_gem,latitude,longitude,sector_main,sector_main_num,capacity_unit,country,asset_name,owner_name,index,...,t_m2,t_m1,t_0,t_1,t_2,t_3,around_3,around_5,forward_3,past_3
0,L905061,-33.2114,-65.089,wind power,11,mw,Argentina,Achiras wind farm,Central Puerto,93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,L904991,-42.6246,-65.2781,wind power,11,mw,Argentina,Aluar El Llano wind farm,Aluar,95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,L905366,-28.6977,-66.7579,wind power,11,mw,Argentina,Arauco wind farm,Pampa Energía SA,99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,L905447,-47.2995,-66.9931,wind power,11,mw,Argentina,Bicentenario wind farm,Petroquimica Comodoro Rivadavia SA (PCR),102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,L905533,-46.5925,-67.6486,wind power,11,mw,Argentina,Canadon Leon wind farm,YPF Energía Eléctrica SA,104,...,0.0,0.0,0.0,0.0,,,0.0,,,0.0


# Missing treecover investigation

In [220]:
# Disabled exploratory check: per-country non-null counts of every column,
# restricted to rows where treecover2000 >= 0 (rows with a missing value drop out).
# df[df.treecover2000 >= 0].groupby('country').count()

In [221]:
# Disabled exploratory plot: selects the rows whose treecover2000 is null and
# draws them on a PlateCarree world map, coloured by sector_main_num.
# NOTE(review): the points are scattered twice below (once directly, once inside
# the plt.colorbar call) — if this is re-enabled, a single scatter whose handle
# is passed to plt.colorbar is probably what was intended; confirm before use.
# missing_tree = df[df.treecover2000.isnull()]
# missing_tree

# lats = missing_tree.latitude
# lons = missing_tree.longitude
# values = missing_tree.sector_main_num

# fig = plt.figure(figsize=(15, 8))
# ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
# ax.stock_img()
# ax.scatter(lons, lats, c=values, cmap='viridis', transform=ccrs.Geodetic())
# cbar = plt.colorbar(ax.scatter(lons, lats, c=values, cmap='viridis', marker='o', s=5, alpha=0.5, transform=ccrs.Geodetic()), ax=ax)
# cbar.set_label('Values')


# Simple modelling

In [222]:
# Show the column labels of the regression sample loaded into `df`.
df.columns

Index(['uid_gem', 'latitude', 'longitude', 'sector_main', 'sector_main_num',
       'capacity_unit', 'country', 'asset_name', 'owner_name', 'index',
       'capacity_first', 'start_year_first', 'sector_sub_first', 'capacity',
       'start_year', 'sector_sub', 'number_units', 'row', 'col',
       'treecover2000', 'defo_total', 'y1', 'y3', 'y5', 'y7', 'defo_y1',
       'defo_y3', 'defo_y5', 'defo_y7', 'quintile_capacity'],
      dtype='object')

In [230]:
# Horizon that selects which y<N> / defo_y<N> column pair is used below.
yr = 5
yr_col = f'y{yr}'
defo_col = f'defo_y{yr}'

# Keep only Brazilian rows whose y<N> column equals the horizon value.
matches_horizon = df[yr_col] == yr
in_brazil = df['country'] == 'Brazil'
df_pred = df[matches_horizon & in_brazil]

# Predictor columns and the target column for the models that follow.
X_cols = ['sector_main_num', 'number_units', 'start_year_first', 'country'] #, 'sector_sub_first']
X = df_pred[X_cols]
y = df_pred[defo_col]

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [224]:
# Number of rows in the training split.
len(X_train)


287

# Linear regression

In [231]:
# Preprocessing step: one-hot encode the string column(s); every other feature
# column is passed through unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' makes a category that never appeared in the
        # training split encode as an all-zero row instead of raising an error
        # when the fitted pipeline is asked to predict on it.
        ('country', OneHotEncoder(handle_unknown='ignore'), ['country']),
        # ('sector_sub_first', OneHotEncoder(handle_unknown='ignore'), ['sector_sub_first']),
    ],
    remainder='passthrough'
)

# Linear-regression pipeline: the preprocessing step followed by LinearRegression.
lm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit on the training split; as the last expression this also displays the pipeline.
lm.fit(X_train, y_train)

# XGBoost

In [232]:
# XGBoost pipeline. clone() gives this pipeline its own unfitted copy of the
# preprocessing step: Pipeline fits its steps in place, so passing the very same
# `preprocessor` object here would make this fit() call refit the transformer
# that the linear-regression pipeline also holds.
xgbm = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    # Seed pinned (same value as the train/test split) so results stay
    # repeatable if stochastic options such as subsampling are enabled later.
    ('regressor', XGBRegressor(random_state=42))
])
xgbm.fit(X_train, y_train)


# Evaluate 

In [233]:

def _evaluate(model, features, target):
    """Predict with `model` on `features` and score the result against `target`.

    Returns a (predictions, mse, r2) tuple.
    """
    predictions = model.predict(features)
    error = mean_squared_error(target, predictions)
    fit_quality = r2_score(target, predictions)
    return predictions, error, fit_quality


# Linear regression, scored on the held-out split.
y_pred, mse, r2 = _evaluate(lm, X_test, y_test)
print(f"Linreg output: r2 of {round(r2, 3)}, mse of {round(mse, 3)}")

# ---------------------------------------------------------
print("----" * 10)

print("XGB:")

# XGBoost, scored on the held-out split.
y_pred, mse, r2 = _evaluate(xgbm, X_test, y_test)
print(f"XGB output: r2 of {round(r2, 3)}, mse of {round(mse, 3)}")

print("--- on train ---")

# XGBoost again, this time scored on the rows it was fitted on.
y_pred, mse, r2 = _evaluate(xgbm, X_train, y_train)
print(f"r2 of {round(r2, 3)}, mse of {round(mse, 3)}")

Linreg output: r2 of 0.034, mse of 0.003
----------------------------------------
XGB:
XGB output: r2 of 0.452, mse of 0.002
--- on train ---
r2 of 0.308, mse of 0.004
