In [None]:
# basics
import pandas as pd
import numpy as np

# maps
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# prep
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# feature engineering
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# modelling
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

# evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [None]:
# Load the tab-separated regression sample and preview its first rows.
df = pd.read_csv(
    'data/regression_sample.csv',
    sep="\t",
)
df.head()



# Simple modelling

In [None]:
# Inspect the column names available in the loaded frame.
df.columns

In [None]:
# Build the modelling frame `df_pred` for a single target year.
yr = 3
yr_col = f'y{yr}'          # column compared against `yr` below, e.g. 'y3'
defo_col = f'defo_y{yr}'   # column used as the regression target, e.g. 'defo_y3'

# Keep rows whose `yr_col` equals `yr` and that have positive tree cover in
# 2000 and positive total deforestation.
df_pred = df[df[yr_col] == yr]
df_pred = df_pred[df_pred.treecover2000 > 0]
df_pred = df_pred[df_pred.defo_total > 0]

# Count the non-null `uid_gem` values per country and attach that count to
# every row as `country_count`.
aux_country = (
    df_pred.groupby('country').uid_gem.count()
    .reset_index()
    .sort_values('uid_gem')
    .rename(columns={'uid_gem': 'country_count'})
)
df_pred = pd.merge(df_pred, aux_country, how='inner', on='country')

# Drop countries represented by a single row: the train/test split further
# down stratifies on `country`, which needs at least two rows per class.
df_pred = df_pred[df_pred.country_count > 1]

# Preview the filtered modelling frame built in this cell (not the raw `df`).
df_pred.head()

# df.groupby('quintile_capacity').count()

In [None]:
# Predictor columns ('defo_total' is currently commented out).
X_cols = [
    'sector_main',
    'number_units',
    'start_year_first',
    'country',
    # 'defo_total',
]
X = df_pred.loc[:, X_cols]
X_strat = df_pred[['country']]   # stratification key for the split
y = df_pred[defo_col]

# 70/30 split with a fixed seed, stratified by country.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=X_strat,
)


In [None]:
# Number of rows in the training split.
len(X_train)

# Preprocessing

In [None]:
# One-hot encode the two categorical predictors; every remaining column is
# passed through unchanged.
#
# handle_unknown='ignore': the train/test split is stratified on `country`
# only, so a `sector_main` level can occur in the test rows without occurring
# in the training rows. With the encoder default ('error') every predict()
# call would raise on such a row; 'ignore' encodes it as all zeros instead.
preprocessor = ColumnTransformer(
    transformers=[
        ('country', OneHotEncoder(handle_unknown='ignore'), ['country']),
        # ('sector_sub_first', OneHotEncoder(), ['sector_sub_first']),
        ('sector_main', OneHotEncoder(handle_unknown='ignore'), ['sector_main']),
    ],
    remainder='passthrough'
)


# Linear regression

In [None]:
# Linear-regression baseline: shared preprocessing, then LinearRegression.
lm = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

lm.fit(X_train, y_train)

# KNN regression

In [None]:
# k-nearest-neighbours regressor with library-default settings, behind the
# shared preprocessing step.
knn_steps = [
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor()),
]
knn = Pipeline(steps=knn_steps)

knn.fit(X_train, y_train)

# XGBoost

In [None]:
# XGBoost regressor with library-default hyper-parameters, behind the shared
# preprocessing step.
xgbm = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor()),
    ],
)

xgbm.fit(X_train, y_train)


In [None]:
# The learning rate (eta) shrinks the contribution of each new tree to the ensemble.
# A smaller eta takes smaller steps, so it needs more boosting rounds but usually gives a
# more robust fit; a larger eta converges in fewer rounds but overfits more easily.

# The subsample parameter controls the fraction of training rows sampled for each tree.
# Values below 1 add randomness to every tree, which can help prevent overfitting.

# For any given problem, a lower log loss value means better predictions.

# You don't need xgboost.cv to find the optimal number of trees. You can also 
# run xgboost.train with "early_stopping_rounds" set. 

# Fit the shared preprocessor on the training split and keep the encoded
# matrix, so the grid search further down can run on the bare regressor.
X_train_t = preprocessor.fit_transform(X_train, y_train)

#xgb.callback.TrainingCallback()

# res.best_iteration
# is most likely to be the last one but maybe not...

# n_estimators
# num_boosting_rounds

# Tune max_depth and min_child_weight first as they will have the highest impact on the model outcome.

# could provide a DataFrame with all desired combinations...
# then keep track of the results to choose the best...

# def fit(xgbm, **kwargs):

#     data_dmatrix = xgb.DMatrix(data=X_train_t, label=y_train)

#     # other parameters include max_depth, min_child_weight...
#     # defaults are all within a good range

#     # run cv for each model...
#     # GridSearchCV could be useful...

#     params = {'objective':'reg:squarederror',
#               'eval_metric':'rmse',
#               'eta': x[0],
#               'subsample':x[1]}
#     # stratified=True ???
#     # num_boost_round can be set high, 
#     xgb_cv = xgb.cv(dtrain=data_dmatrix, 
#                     params=params, 
#                     nfold=5, 
#                     early_stopping_rounds=50, # over all folds...
#                     metrics = 'rmse', 
#                     seed=42)
#     print(xgb_cv.shape)
#     print(xgb_cv)
#     return xgb_cv[-1:].values[0]

# grid = pd.DataFrame({'eta':[0.01,0.05,0.1]*2, 
#                      'subsample':np.repeat([0.1,0.3],3),
#                      })

# grid[['train-rmse-mean','train-rmse-std',
#     'test-rmse-mean','test-rmse-std']] = grid.apply(fit, axis=1, result_type='expand')

# params = {'objective':'reg:squarederror',
#             'eval_metric':'rmse',
#             'eta': grid.iloc[4].eta,
#             'subsample': grid.iloc[4].subsample}
# regressor = XGBRegressor(**params)

# Base regressor for the grid search. `learning_rate` here is only a starting
# value: the grid below overrides it for every candidate.
xgbr = XGBRegressor(
    learning_rate = 0.1,
    n_estimators = 1000,
    #max_depth=5,
    #min_child_weight=1,
    gamma = 0,
    #subsample=0.8,
    #colsample_bytree=0.8,
    #objective= 'reg:squarederror',
    seed = 42)

# Tune the estimator's own `learning_rate` parameter. 'eta' is an alias for the
# same booster setting; putting 'eta' in the grid would leave
# learning_rate=0.1 set next to it, so it would be ambiguous which value the
# booster actually uses.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.1, 0.3]
}

# Extra keyword arguments forwarded to XGBRegressor.fit for every CV fit.
# Deliberately empty:
#   * passing early_stopping_rounds / eval_metric to fit() is deprecated since
#     XGBoost 1.6 and no longer accepted by recent releases;
#   * the eval_set used before was the full training matrix, which contains
#     both the rows being fitted and the rows of each CV validation fold, so
#     it is not a valid stopping signal and leaks the validation fold.
# To use early stopping, hold out separate validation rows and configure
# early_stopping_rounds on the XGBRegressor constructor (XGBoost >= 1.6).
fit_params = {}

# 5-fold CV over the grid, scored with the estimator's default score.
gridsearch = GridSearchCV(xgbr, param_grid, verbose=0, cv=5, n_jobs=-1)

gridsearch.fit(X_train_t, y_train, **fit_params)
# gsearch = GridSearchCV(
#     estimator = xgbr, 
#     param_grid = param_grid, 
#     verbose = 1, 
#     scoring = 'neg_root_mean_squared_error',
#     n_jobs=-1,
#     cv=5
# )

# Put the best regressor found by the grid search behind the shared
# preprocessing step and refit the whole pipeline on the raw training split.
tuned_steps = [
    ('preprocessor', preprocessor),
    ('regressor', gridsearch.best_estimator_),
]
xgbm2 = Pipeline(steps=tuned_steps)

xgbm2.fit(X_train, y_train)




# Evaluate 

In [None]:

# Score every fitted pipeline with one shared routine instead of one
# copy-pasted stanza per model. The printed text is unchanged.
#
# Each entry is:
#   (lines printed before scoring, label prefix, fitted pipeline, features, target)
separator = "----" * 10
evaluation_plan = [
    ((), "Linreg output: ", lm, X_test, y_test),
    ((separator,), "KNN output: ", knn, X_test, y_test),
    ((separator, "XGB:"), "XGB output: ", xgbm, X_test, y_test),
    (("--- on train ---",), "", xgbm, X_train, y_train),
    (("XGB (cv):",), "XGB (cv) output: ", xgbm2, X_test, y_test),
    (("--- on train ---",), "", xgbm2, X_train, y_train),
]

for banners, label, model, X_eval, y_eval in evaluation_plan:
    for banner in banners:
        print(banner)

    # y_pred / mse / r2 stay module-level names; after the loop they hold the
    # values of the last entry (tuned XGB on the training split).
    y_pred = model.predict(X_eval)
    mse = mean_squared_error(y_eval, y_pred)
    r2 = r2_score(y_eval, y_pred)
    print(f"{label}r2 of {round(r2, 3)}, mse of {round(mse, 3)}")
