In [1]:
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
# metric: MAE, chosen because some outliers are left in the data
from sklearn.metrics import mean_absolute_error, r2_score

# Load the 2008-2018 dataset from the working directory.
df_2008_2018 = pd.read_csv('df_2008_2018.csv')

# Targets: one regressor is trained per coordinate column.
target1 = 'lat'
target2 = 'lon'

# Time-based split on the `year` column: <= 2015 for training, 2016 for
# validation, >= 2017 for testing. Both coordinate columns are rounded to
# 4 decimal places in every split.
train = df_2008_2018.loc[df_2008_2018['year'] <= 2015].round({target1: 4, target2: 4})
val = df_2008_2018.loc[df_2008_2018['year'] == 2016].round({target1: 4, target2: 4})
test = df_2008_2018.loc[df_2008_2018['year'] >= 2017].round({target1: 4, target2: 4})

# Columns excluded from the feature matrix (in addition to the two targets).
remove_feat = [
    'ano', 'view_date', 'uf', 'julday', 'class_name', 'publish_ye', 'gid',
    'origin_id', 'mainclass', 'areakm_squared', 'pathrow', 'dsfnv',
    'scene_id', 'geometry',
]
# Everything left after dropping the targets and the excluded columns is a feature.
features = train.drop(columns=[target1, target2, *remove_feat]).columns.tolist()

# Feature matrix and the two target vectors for each split.
X_train, y_train_1, y_train_2 = train[features], train[target1], train[target2]
X_val, y_val_1, y_val_2 = val[features], val[target1], val[target2]
X_test, y_test_1, y_test_2 = test[features], test[target1], test[target2]


def _make_xgb_pipeline():
    """Return a new, unfitted pipeline: category_encoders ordinal encoding, then an XGBoost regressor."""
    return make_pipeline(
        ce.OrdinalEncoder(),
        XGBRegressor(n_estimators=200, max_depth=50, random_state=42, n_jobs=-1),
    )


# Two independent pipelines with identical settings: xgb1 for target1, xgb2 for target2.
xgb1 = _make_xgb_pipeline()
xgb2 = _make_xgb_pipeline()

xgb1.fit(X_train, y_train_1)
# Last expression of the cell, so the fitted pipeline's repr is what gets displayed.
xgb2.fit(X_train, y_train_2)

  import pandas.util.testing as tm


Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['states'], drop_invariant=False,
                                handle_missing='value', handle_unknown='value',
                                mapping=[{'col': 'states',
                                          'data_type': dtype('O'),
                                          'mapping': Amazonas       1
Para           2
Mato Grosso    3
Roraima        4
Acre           5
Rondonia       6
Maranhao       7
Tocantins      8
Amapa          9
NaN           -2
dtype: int64}],
                                return_df=True, verbose=0)),
                ('xgbregressor',
                 XGBRegressor(b...
                              interaction_constraints=None,
                              learning_rate=0.300000012, max_delta_step=0,
                              max_depth=50, min_child_weight=1, missing=nan,
                              monotone_constraints=None, n_estimators=300,
           

In [2]:
from joblib import dump

# Write each pipeline to its own compressed joblib file in the working
# directory. The second call is the cell's last expression, so its return
# value (the list of written file names) is what the cell displays.
dump(value=xgb1, filename='gb1.joblib', compress=True)
dump(value=xgb2, filename='gb2.joblib', compress=True)

['gb2.joblib']

In [42]:
# Print the installed version of each library used by this notebook as
# `name==version` lines.
#
# The top-level modules are imported here because the rest of the notebook
# only does `from joblib import ...`, `from sklearn... import ...` and
# `from xgboost import ...`, none of which binds the names `joblib`,
# `sklearn` or `xgboost`. Without these imports the cell raises NameError
# on a fresh kernel (Restart & Run All).
import joblib
import sklearn
import xgboost

print(f'joblib=={joblib.__version__}')
print(f'scikit-learn=={sklearn.__version__}')
print(f'category_encoders=={ce.__version__}')
print(f'xgboost=={xgboost.__version__}')
print(f'pandas=={pd.__version__}')
print(f'numpy=={np.__version__}')

joblib==0.14.1
scikit-learn==0.22.2.post1
category_encoders==2.1.0
xgboost==1.0.2
pandas==1.0.3
numpy==1.16.5
