# In [None]:

import xgboost as xgb
import pandas as pd
from pandas import DataFrame
# import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score, mean_absolute_error,root_mean_squared_log_error
from sklearn.model_selection import train_test_split
import cartopy.crs as ccrs
import time
from sklearn.ensemble import RandomForestRegressor
import os
from scipy.stats import pearsonr
import geopandas as gpd
from shapely.geometry import Point
# Working directory for this run; the boundary shapefile is read from it below.
Results_file = 'mexico'

# Create the directory if it is missing. `exist_ok=True` keeps the common
# "already there" case quiet; any other OSError is still only reported, so the
# script stays best-effort exactly as before.
try:
    os.makedirs(Results_file, exist_ok=True)
except OSError as error:
    print(error)


# Forward slashes resolve on every platform. The previous 'data\datatest.csv'
# spelling relied on '\d' being an unrecognised escape sequence and only
# pointed at the right file on Windows.
Data = pd.read_csv('data/data20-150.csv')
Data_test = pd.read_csv('data/datatest.csv')
Data_train = pd.read_csv('data/datatrain.csv')

mexico_boundary = gpd.read_file('mexico/gadm41_MEX_shp/gadm41_MEX_0.shp')


# One point per row of `Data`, built from its 'lon'/'lat' columns.
geometry = [Point(xy) for xy in zip(Data['lon'], Data['lat'])]
# Tag the points with the boundary layer's CRS so the spatial join compares
# like with like instead of warning about a CRS mismatch.
# NOTE(review): this assumes 'lon'/'lat' are expressed in the same CRS as the
# shapefile -- the untagged join made the same assumption implicitly; confirm.
gdf = gpd.GeoDataFrame(Data, geometry=geometry, crs=mexico_boundary.crs)


# Rows whose point matches the boundary layer in the spatial join ...
Data_Except_ANT = gpd.sjoin(gdf, mexico_boundary, how='inner')
# ... and every remaining row of `gdf` (i.e. the points the join did not match).
Data_ANT = gdf[~gdf.index.isin(Data_Except_ANT.index)]

# Predictor columns passed to both models.
Features = [
    "LAB_LitMod",
    "MeanCurv",
    "vsv230",
    "vsv300",
    "surfaceValue",
    "Ridge",
    "Volcanos",
]

# Positional index of the target column (the third column) in Data_test,
# Data_train and Data_ANT.
TARGET_COLUMN_INDEX = 2

# Feature matrices; any feature name missing from a table becomes a NaN column.
x_test = DataFrame(Data_test, columns=Features)
x_train = DataFrame(Data_train, columns=Features)
X_ANT = DataFrame(Data_ANT, columns=Features)

# Targets as plain lists, taken in one column selection per table rather than
# one `.iloc[j, 2]` lookup per row.
y_test = Data_test.iloc[:, TARGET_COLUMN_INDEX].tolist()
y_train = Data_train.iloc[:, TARGET_COLUMN_INDEX].tolist()
y1 = Data_ANT.iloc[:, TARGET_COLUMN_INDEX].tolist()

# Append the Data_ANT rows to the training set; features and targets are
# extended in the same order so they stay aligned row for row.
x_train = pd.concat([x_train, X_ANT], ignore_index=True)
y_train = y_train + y1

# Timestamp taken before any fitting starts (not read again in this section).
t_0 = time.time()

# ---- Stage 1: random forest on the raw features ----
n_estimators = 300
max_depth = 12
min_samples_split = 2
random_state = 1

rf_model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    random_state=random_state,
)
rf_model.fit(x_train, y_train)

# ---- Stage 2: XGBoost on the features plus the forest's prediction ----
LR = 0.01
MD = 11
Su = 0.7
Gamma = 120
model = xgb.XGBRegressor(
    n_estimators=1000,
    objective='reg:squarederror',
    learning_rate=LR,
    max_depth=MD,
    subsample=Su,
    gamma=Gamma,
    reg_alpha=0.5,
    reg_lambda=5,
)

# The forest's prediction is appended as one extra column on the right.
# NOTE(review): these are predictions on the same rows the forest was fitted
# on, so the extra training column may look better than it will on new data.
rf_pred_train = rf_model.predict(x_train)
X_train_extended = np.column_stack((x_train, rf_pred_train))
model.fit(X_train_extended, y_train)

# Score of the boosted model on its own training rows.
model_score = model.score(X_train_extended, y_train)

# Same two-step pipeline applied to the test rows.
rf_pred_test = rf_model.predict(x_test)
X_test_extended = np.column_stack((x_test, rf_pred_test))
y_pred = model.predict(X_test_extended)

# Evaluate the test-set predictions. Every metric is computed before anything
# is printed, so a failing metric produces no partial report.
pearson_tod = pearsonr(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
ms_error = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
RMSE = np.sqrt(ms_error)
RMSLE = root_mean_squared_log_error(y_test, y_pred)

# Labelled report, in a fixed order.
for metric_label, metric_value in (
    ('pearsonr:', pearson_tod),
    ('model score:', model_score),
    ('MAE:', mae),
    ('R2:', r2),
    ('RMSE:', RMSE),
    ('RMSLE:', RMSLE),
):
    print(metric_label, metric_value)

# RMSE divided by the spread (max - min) of the true test values; printed
# with an empty label.
print("", RMSE / (np.max(y_test) - np.min(y_test)))

# Row labels of the test feature matrix, used to pull the matching coordinates.
test_indices = x_test.index

# One row per test sample: coordinates, true value, prediction, and the signed
# difference (true - predicted) stored under 'abd'.
test_results = (
    Data_test.loc[test_indices, ['lon', 'lat']]
    .rename(columns={'lon': 'longitude', 'lat': 'latitude'})
    .assign(
        true_value=y_test,
        predicted_value=y_pred,
        abd=np.subtract(y_test, y_pred),
    )
)

# Export is currently disabled:
#test_results.to_csv('mexico/rfxgb.csv', index=False)

