In [387]:
# basics
import pandas as pd
import numpy as np

# maps
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# prep
from sklearn.model_selection import train_test_split

# feature engineering
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# modelling
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

# evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [388]:
df = pd.read_csv('data/regression_sample.csv', sep = "\t")
df.head()



Unnamed: 0,uid_gem,latitude,longitude,sector_main,sector_main_num,capacity_unit,country,asset_name,owner_name,index,...,defo_total,y1,y3,y5,y7,defo_y1,defo_y3,defo_y5,defo_y7,quintile_capacity
0,T0208,25.809175,-80.296697,LNG terminal,0,bcf/d,United States,American LNG Hialeah Terminal,Fortress Investment Group,36351,...,0.0,1,3,5,7,0.0,0.0,0.0,0.0,0.0
1,T0499,32.75898,129.806367,LNG terminal,0,mtpa,Japan,Nagasaki LNG Terminal,Saibu Gas,36484,...,0.003673,1,3,5,6,0.0,0.000918,0.000918,,0.0
2,T0221,32.091111,-81.000278,LNG terminal,0,mtpa,United States,Elba Island LNG Terminal,Southern LNG Company LLC,36307,...,0.017447,1,3,5,7,0.0,0.0,0.017447,0.017447,0.0
3,T0670,18.463029,-77.934847,LNG terminal,0,mtpa,Jamaica,Montego Bay LNG Terminal,New Fortress Energy,36262,...,0.0,1,3,5,7,0.0,0.0,0.0,0.0,0.0
4,T0753,19.977562,110.057687,LNG terminal,0,mtpa,China,Hainan Shennan LNG Storage Facility,"Hainan Shennan Energy Co.,ltd CNPC",36252,...,0.007346,1,3,5,7,0.0,0.0,0.002755,0.002755,0.0


# Missing treecover investigation

In [389]:
# df[df.treecover2000 >= 0].groupby('country').count()

In [390]:
# missing_tree = df[df.treecover2000.isnull()]
# missing_tree

# lats = missing_tree.latitude
# lons = missing_tree.longitude
# values = missing_tree.sector_main_num

# fig = plt.figure(figsize=(15, 8))
# ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
# ax.stock_img()
# ax.scatter(lons, lats, c=values, cmap='viridis', transform=ccrs.Geodetic())
# cbar = plt.colorbar(ax.scatter(lons, lats, c=values, cmap='viridis', marker='o', s=5, alpha=0.5, transform=ccrs.Geodetic()), ax=ax)
# cbar.set_label('Values')


# Simple modelling

In [391]:
df.columns

Index(['uid_gem', 'latitude', 'longitude', 'sector_main', 'sector_main_num',
       'capacity_unit', 'country', 'asset_name', 'owner_name', 'index',
       'capacity_first', 'start_year_first', 'sector_sub_first', 'capacity',
       'start_year', 'sector_sub', 'number_units', 'row', 'col',
       'treecover2000', 'defo_total', 'y1', 'y3', 'y5', 'y7', 'defo_y1',
       'defo_y3', 'defo_y5', 'defo_y7', 'quintile_capacity'],
      dtype='object')

In [412]:
yr = 3
yr_col = 'y' + str(yr)
defo_col = 'defo_y' + str(yr)

df_pred = df[df[yr_col] == yr]
df_pred = df_pred[df_pred.treecover2000 > 0]
df_pred = df_pred[df_pred.defo_total > 0]

aux_country = df_pred.groupby('country').uid_gem.count().reset_index().sort_values('uid_gem').rename(columns = {'uid_gem': 'country_count'})
df_pred = pd.merge(df_pred, aux_country, how = 'inner', on = 'country')

df_pred = df_pred[df_pred.country_count > 1]

df.head()

# df.groupby('quintile_capacity').count()

Unnamed: 0_level_0,uid_gem,latitude,longitude,sector_main,sector_main_num,capacity_unit,country,asset_name,owner_name,index,...,treecover2000,defo_total,y1,y3,y5,y7,defo_y1,defo_y3,defo_y5,defo_y7
quintile_capacity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1.0,146,146,146,146,146,6,146,146,71,146,...,146,146,146,146,146,146,146,141,129,115
0.0,1512,1512,1512,1512,1512,1512,1512,1512,1471,1512,...,1512,1512,1512,1512,1512,1512,1499,1392,1273,1195
1.0,1511,1511,1511,1511,1511,1511,1511,1511,1427,1511,...,1511,1511,1511,1511,1511,1511,1488,1416,1301,1198
2.0,1511,1511,1511,1511,1511,1511,1511,1511,1409,1511,...,1511,1511,1511,1511,1511,1511,1479,1418,1340,1242
3.0,1512,1512,1512,1512,1512,1512,1512,1512,1110,1512,...,1512,1512,1512,1512,1512,1512,1485,1402,1324,1197
4.0,1511,1511,1511,1511,1511,1511,1511,1511,1191,1511,...,1511,1511,1511,1511,1511,1511,1476,1379,1283,1166
5.0,1511,1511,1511,1511,1511,1511,1511,1511,1002,1511,...,1511,1511,1511,1511,1511,1511,1499,1434,1365,1235
6.0,1512,1512,1512,1512,1512,1512,1512,1512,1241,1512,...,1512,1512,1512,1512,1512,1512,1427,1297,1195,1026
7.0,1511,1511,1511,1511,1511,1511,1511,1511,1366,1511,...,1511,1511,1511,1511,1511,1511,1430,1294,1145,911
8.0,1511,1511,1511,1511,1511,1511,1511,1511,1359,1511,...,1511,1511,1511,1511,1511,1511,1316,1078,884,677


In [405]:
X_cols = ['sector_main', 'number_units', 'start_year_first', 'country'] #, 'defo_total']
X = df_pred[X_cols]
X_strat = df_pred[['country']]
y = df_pred[defo_col]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify = X_strat)


In [406]:
len(X_train)

2058

# Preprocessing

In [407]:
# Create a ColumnTransformer to handle string data
preprocessor = ColumnTransformer(
    transformers=[
        ('country', OneHotEncoder(), ['country']),
        # ('sector_sub_first', OneHotEncoder(), ['sector_sub_first']),
        ('sector_main', OneHotEncoder(), ['sector_main']),
    ],
    remainder='passthrough'
)


# Linear regression

In [408]:
# Create the pipeline
lm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

lm.fit(X_train, y_train)

# XGBoost

In [409]:
xgbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])
xgbm.fit(X_train, y_train)


# Evaluate 

In [410]:

y_pred = lm.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Linreg output: r2 of {round(r2, 3)}, mse of {round(mse, 3)}")

# ---------------------------------------------------------
print("----" * 10)

print("XGB:")

y_pred = xgbm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"XGB output: r2 of {round(r2, 3)}, mse of {round(mse, 3)}")

print("--- on train ---")

y_pred = xgbm.predict(X_train)
mse = mean_squared_error(y_train, y_pred)   
r2 = r2_score(y_train, y_pred)
print(f"r2 of {round(r2, 3)}, mse of {round(mse, 3)}")

Linreg output: r2 of 0.06, mse of 0.004
----------------------------------------
XGB:
XGB output: r2 of 0.143, mse of 0.004
--- on train ---
r2 of 0.407, mse of 0.002
