In [None]:
# basics
import pandas as pd
import numpy as np

# maps
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

# pipeline
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# prep
from sklearn.model_selection import train_test_split

# feature engineering
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# modelling
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

# evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [None]:
# Read the tab-separated regression sample from disk and preview its first rows.
raw_path = 'data/regression_sample.csv'
df = pd.read_csv(raw_path, sep="\t")
df.head()

# Simple modelling

In [None]:
# Inspect the column names of the loaded frame.
df.columns

In [None]:
# Name of the target column to predict.
y_var = 'around_3'

# Keep only rows with a known target and strictly positive tree cover and
# total deforestation.
df_pred = (
    df
    .loc[lambda d: d[y_var].notnull()]
    .loc[lambda d: d.treecover2000 > 0]
    .loc[lambda d: d.defo_total > 0]
)

# Count of non-null `uid_gem` values per country, smallest first.
aux_country = (
    df_pred.groupby('country')
    .uid_gem.count()
    .reset_index()
    .sort_values('uid_gem')
    .rename(columns={'uid_gem': 'country_count'})
)
df_pred = pd.merge(df_pred, aux_country, how='inner', on='country')

# Drop countries represented by a single row: the train/test split below is
# stratified by country, which needs at least two rows per class.
df_pred = df_pred[df_pred.country_count > 1]

# Preview the filtered modelling frame (not the raw `df`).
df_pred.head()

In [None]:
# Feature columns fed to the models ('defo_total' is currently left out).
X_cols = [
    'sector_main',
    'number_units',
    'start_year_first',
    'country',
]
X = df_pred[X_cols]
y = df_pred[y_var]

# Single-column frame used only as the stratification key.
X_strat = df_pred[['country']]

# 70/30 split, stratified by country, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=X_strat,
)

In [None]:
# Number of rows in the training split.
len(X_train)

# Preprocessing

In [None]:
# Column-wise preprocessing: one-hot encode the string columns and pass every
# other column through unchanged.
# handle_unknown='ignore' encodes a category that was not present at fit time
# as an all-zero row instead of raising at predict time; 'sector_main' is not
# part of the stratification key, so a rare value can end up only in the test
# split.
# NOTE(review): the passthrough columns are neither imputed nor scaled here —
# confirm they contain no missing values, since LinearRegression and
# KNeighborsRegressor do not accept NaN.
preprocessor = ColumnTransformer(
    transformers=[
        ('country', OneHotEncoder(handle_unknown='ignore'), ['country']),
        ('sector_main', OneHotEncoder(handle_unknown='ignore'), ['sector_main']),
    ],
    remainder='passthrough'
)


# Linear regression

In [None]:
# Linear-regression pipeline: preprocessing followed by ordinary least squares.
# The preprocessor is cloned so this pipeline owns its own transformer:
# Pipeline.fit fits its steps in place, so sharing one ColumnTransformer
# instance between pipelines would let any later fit overwrite the state the
# other pipelines predict with.
lm = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', LinearRegression())
])

lm.fit(X_train, y_train)

# KNN regression

In [None]:
# K-nearest-neighbours pipeline: preprocessing followed by a KNN regressor
# with its default settings.
# The preprocessor is cloned so this pipeline owns its own transformer:
# Pipeline.fit fits its steps in place, so sharing one ColumnTransformer
# instance between pipelines would let any later fit overwrite the state the
# other pipelines predict with.
# NOTE(review): the numeric passthrough features are not scaled, so they may
# dominate the neighbour distances — consider adding a scaler.
knn = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', KNeighborsRegressor())
])

knn.fit(X_train, y_train)

# XGBoost

In [None]:
# XGBoost pipeline: preprocessing followed by an XGBRegressor with its
# default settings.
# The preprocessor is cloned so this pipeline owns its own transformer:
# Pipeline.fit fits its steps in place, so sharing one ColumnTransformer
# instance between pipelines would let any later fit overwrite the state the
# other pipelines predict with.
xgbm = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', XGBRegressor())
])
xgbm.fit(X_train, y_train)


# Evaluate 

In [None]:

# Evaluate every fitted pipeline the same way: held-out test metrics first,
# then training metrics, so the train/test gap (over-fitting) is visible for
# each model rather than for XGBoost only.
fitted_models = {"Linreg": lm, "KNN": knn, "XGB": xgbm}

for i, (model_name, fitted) in enumerate(fitted_models.items()):
    # Separator between models (not before the first one).
    if i:
        print("----" * 10)

    y_pred = fitted.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} output: r2 of {round(r2, 3)}, mse of {round(mse, 3)}")

    print("--- on train ---")
    y_pred = fitted.predict(X_train)
    mse = mean_squared_error(y_train, y_pred)
    r2 = r2_score(y_train, y_pred)
    print(f"r2 of {round(r2, 3)}, mse of {round(mse, 3)}")