# In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, NuSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.neural_network import MLPRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

####################### train crop-specific RF models ##################
# Read the training table; rows are labelled by the 'ID' column.
data = pd.read_csv('data/ml_data/input_data/data_machine_learning_new.csv', index_col='ID')
id_X = np.hstack((3, np.arange(8, 99)))  # positional columns used as predictors
id_Y = np.arange(106, 113)               # positional columns holding the responses
X = data.iloc[:, id_X]
Y = data.iloc[:, id_Y]

# Keep only the rows whose response 'effect_yield' is present.
y = Y['effect_yield']
has_yield = y.notna()
id_nona = y[has_yield].index.tolist()
id_sel = id_nona

# Rows are selected with boolean masks. The values in `id_nona` are index
# *labels* (the 'ID' column); passing them to the positional `.iloc` pairs the
# wrong X rows with y unless the IDs happen to be exactly 0..n-1.
X_sel = X.loc[has_yield]
y_sel = y.loc[has_yield]


def _rows_for_crop(crop_name):
    """Return (ids, X rows, y values) of `crop_name` rows that have a response.

    Rows come back in file order, with X and y aligned on the same index.
    """
    crop_mask = has_yield & (data.Crop_type == crop_name)
    return y[crop_mask].index.tolist(), X.loc[crop_mask], y.loc[crop_mask]


id_rice, x_rice, y_rice = _rows_for_crop('Paddy rice')
id_wheat, x_wheat, y_wheat = _rows_for_crop('Wheat')
id_maize, x_maize, y_maize = _rows_for_crop('Maize')

# normalization: the scaler is fitted once on every labelled row and then
# applied to each crop subset, so all crops share one feature scaling.
scaler = StandardScaler()
scaler.fit(X_sel)

X_sel = scaler.transform(X_sel)
y_sel = y_sel.values

x_rice = scaler.transform(x_rice)
y_rice = y_rice.values

x_wheat = scaler.transform(x_wheat)
y_wheat = y_wheat.values

x_maize = scaler.transform(x_maize)
y_maize = y_maize.values

# NOTE: the former scaling of `x_others` / `y_others` was dropped here because
# neither name is defined anywhere above, so those lines raised NameError.


# Hyper-parameter grid shared by the three crop-specific RF searches.
param_grid = [
    {'n_estimators': np.arange(50, 550, 50),
     'max_depth': np.arange(2, 22, 2),
    }]


def _tune_rf(x_train, y_train):
    """Grid-search a RandomForestRegressor over `param_grid` with 10-fold CV.

    Prints the best parameters and the raw CV results, and returns the fitted
    GridSearchCV object. Its `best_estimator_` is already refitted on all of
    `x_train` / `y_train` (GridSearchCV's default `refit=True`), so no further
    `.fit` call is needed.
    """
    # n_jobs is the number of parallel jobs; -1 would use every CPU.
    search = GridSearchCV(RandomForestRegressor(), param_grid,
                          scoring="neg_mean_squared_error", n_jobs=1, cv=10)
    search.fit(x_train, y_train)
    print(search.best_params_)
    print(search.cv_results_)
    return search


def _rmsecv_table(search):
    """Tabulate the cross-validated RMSE for every grid point of `search`."""
    table = pd.DataFrame(search.cv_results_)
    table = table[['param_max_depth', 'param_n_estimators', 'mean_test_score']].copy()
    # The scores are negated MSE, so flip the sign before taking the root.
    table['mean_test_score'] = np.sqrt(-table['mean_test_score'])
    table.columns = ['param_max_depth', 'param_n_estimators', 'rmsecv']
    return table


# optimize rice RF model's parameters
clf_rice = _tune_rf(x_rice, y_rice)
model_rice = clf_rice.best_estimator_

# optimize wheat RF model's parameters
clf_wheat = _tune_rf(x_wheat, y_wheat)
model_wheat = clf_wheat.best_estimator_

# optimize maize RF model's parameters
clf_maize = _tune_rf(x_maize, y_maize)
model_maize = clf_maize.best_estimator_

# `clf` used to be rebound by every search; keep it pointing at the last one.
clf = clf_maize

# One RMSECV table per crop. Previously all three tables were built from the
# same `clf`, i.e. the maize search, so rice and wheat were never reported.
cv_result_rice = _rmsecv_table(clf_rice)
cv_result_rice

cv_result_wheat = _rmsecv_table(clf_wheat)
cv_result_wheat

cv_result_maize = _rmsecv_table(clf_maize)
cv_result_maize

# `cv_result` keeps its previous final value (the maize table).
cv_result = cv_result_maize

# predict train data (these are fits on the training rows, not held-out data)
y_rice_pred = model_rice.predict(x_rice)
y_wheat_pred = model_wheat.predict(x_wheat)
y_maize_pred = model_maize.predict(x_maize)


def _fit_summary(crop, y_true, y_pred):
    """Build a one-row table of goodness-of-fit statistics for one crop.

    Columns: crop, mae, r2, rmse, rpd (= std(y_true) / rmse) and
    rpiq (= interquartile range of y_true / rmse).
    """
    rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))
    q1, q3 = np.percentile(y_true, (25, 75))
    return pd.DataFrame({
        'crop': [crop],
        'mae': [metrics.mean_absolute_error(y_true, y_pred)],
        'r2': [metrics.r2_score(y_true, y_pred)],
        'rmse': [rmse],
        'rpd': [np.std(y_true) / rmse],
        # The old code first stored the RPD formula under 'rpiq' and then
        # overwrote it; only the interquartile-range version is kept.
        'rpiq': [(q3 - q1) / rmse],
    })


# summary of predicted data (crop labels are kept exactly as before,
# including the trailing spaces)
result_rice = _fit_summary('Paddy rice', y_rice, y_rice_pred)
result_wheat = _fit_summary('Wheat ', y_wheat, y_wheat_pred)
result_maize = _fit_summary('Maize ', y_maize, y_maize_pred)

result = pd.concat([result_rice, result_wheat, result_maize], axis=0)
result

####################### predicted effects ##################
########### double rice ##############
# Soil / N-input table and climate table, both keyed by OBJECTID.
# (The original sliced `data_rice1` / `data_rice2` here, which are only
# defined in the later single-rice section, so this section raised NameError.)
data_double_rice1 = pd.read_csv('data/ml_data/input_data/double_rice_N_hwsd.txt', index_col="OBJECTID")
data_double_rice1 = data_double_rice1.iloc[:, np.hstack((2, 3, 4, 8, 9, 10, np.arange(14, 44)))]
data_double_rice2 = pd.read_csv('data/ml_data/input_data/double_rice_climate.txt', index_col="OBJECTID")
data_double_rice2 = data_double_rice2.iloc[:, np.arange(2, 60)]
data_double_rice = pd.merge(data_double_rice1, data_double_rice2, how='inner', on='OBJECTID')

# Rename the merged columns using the names listed in index.csv.
index = pd.read_csv('data/ml_data/input_data/index.csv', index_col="ID")
data_double_rice.columns = index['index']
data_double_rice["Soil_temp_5"] = pd.to_numeric(data_double_rice["Soil_temp_5"], errors='coerce')
data_double_rice["Soil_temp_15"] = pd.to_numeric(data_double_rice["Soil_temp_15"], errors='coerce')

# Drop rows containing NaN or +/-inf in any column.
data_double_rice_dropna = data_double_rice.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how='any')

# dem remove 0: discard rows where all four terrain attributes are zero
data_double_rice_dropna = data_double_rice_dropna[~((data_double_rice_dropna['aspect'] == 0) &
                                                    (data_double_rice_dropna['elevation'] == 0) &
                                                    (data_double_rice_dropna['hillshade'] == 0) &
                                                    (data_double_rice_dropna['slope'] == 0))]

# get the column names of the known (training) sample
index_X = X.columns.tolist()

X_double_rice = data_double_rice_dropna.iloc[:, np.arange(2, 94)]
# Align the feature columns with the training matrix, as the wheat and maize
# sections do, so the scaler and the model see the features in the fitted order.
X_double_rice = X_double_rice[index_X]

# normalization
X_double_rice_nor = scaler.transform(X_double_rice)

# predict unknown sample
# NOTE(review): no separate double-rice model is trained in this file (the
# original referenced an undefined `model_double_rice` and `X_rice_nor`); the
# 'Paddy rice' model is applied here - confirm that this is the intended model.
y_double_rice_unknow = model_rice.predict(X_double_rice_nor)
double_rice_result = pd.DataFrame(columns=['X', 'Y', 'predicted'])
double_rice_result['X'] = data_double_rice_dropna['X']
double_rice_result['Y'] = data_double_rice_dropna['Y']
double_rice_result['predicted'] = y_double_rice_unknow
double_rice_result

########## rice ################
# Soil / N-input table and climate table, both keyed by OBJECTID.
data_rice1 = pd.read_csv('data/ml_data/input_data/rice_N_hwsd.txt', index_col="OBJECTID")
data_rice1 = data_rice1.iloc[:, np.hstack((2, 3, 4, 8, 9, 10, np.arange(14, 44)))]
data_rice2 = pd.read_csv('data/ml_data/input_data/rice_climate.txt', index_col="OBJECTID")
data_rice2 = data_rice2.iloc[:, np.arange(2, 60)]
data_rice = pd.merge(data_rice1, data_rice2, how='inner', on='OBJECTID')

# Rename the merged columns using the names listed in index.csv.
index = pd.read_csv('data/ml_data/input_data/index.csv', index_col="ID")
data_rice.columns = index['index']
data_rice["Soil_temp_5"] = pd.to_numeric(data_rice["Soil_temp_5"], errors='coerce')
data_rice["Soil_temp_15"] = pd.to_numeric(data_rice["Soil_temp_15"], errors='coerce')

# Drop rows containing NaN or +/-inf in any column.
data_rice_dropna = data_rice.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how='any')

# dem remove 0: discard rows where all four terrain attributes are zero
data_rice_dropna = data_rice_dropna[~((data_rice_dropna['aspect'] == 0) &
                                      (data_rice_dropna['elevation'] == 0) &
                                      (data_rice_dropna['hillshade'] == 0) &
                                      (data_rice_dropna['slope'] == 0))]

# get the column names of the known (training) sample
index_X = X.columns.tolist()

X_rice = data_rice_dropna.iloc[:, np.arange(2, 94)]
# Align the feature columns with the training matrix, exactly as the wheat and
# maize sections do; without this the scaler and model rely on the positional
# column order happening to match the order used at fit time.
X_rice = X_rice[index_X]

# normalization
X_rice_nor = scaler.transform(X_rice)

# predict unknown sample
y_rice_unknow = model_rice.predict(X_rice_nor)
rice_result = pd.DataFrame(columns=['X', 'Y', 'predicted'])
rice_result['X'] = data_rice_dropna['X']
rice_result['Y'] = data_rice_dropna['Y']
rice_result['predicted'] = y_rice_unknow
rice_result

########## wheat ####################
# Load the soil / N-input table and keep the positional columns used below.
wheat_soil_cols = np.concatenate(([2, 3, 4, 8, 9, 10], np.arange(14, 44)))
data_wheat1 = pd.read_csv('data/ml_data/input_data/wheat_N_hwsd.txt', index_col="OBJECTID")
data_wheat1 = data_wheat1.iloc[:, wheat_soil_cols]

# Load the climate table and keep positional columns 2..59.
data_wheat2 = pd.read_csv('data/ml_data/input_data/wheat_climate.txt', index_col="OBJECTID")
data_wheat2 = data_wheat2.iloc[:, np.arange(2, 60)]

# Inner-join both tables on OBJECTID and rename the columns from index.csv.
data_wheat = pd.merge(data_wheat1, data_wheat2, how='inner', on='OBJECTID')
index = pd.read_csv('data/ml_data/input_data/index.csv', index_col="ID")
data_wheat.columns = index['index']

# Force the two soil-temperature columns to numeric; bad values become NaN.
for soil_temp_col in ("Soil_temp_5", "Soil_temp_15"):
    data_wheat[soil_temp_col] = pd.to_numeric(data_wheat[soil_temp_col], errors='coerce')

# Treat +/-inf as missing, then drop every row with a missing value.
data_wheat_finite = data_wheat.replace([np.inf, -np.inf], np.nan)
data_wheat_dropna = data_wheat_finite.dropna(axis=0, how='any')

# dem remove 0: discard rows where all four terrain attributes are zero
wheat_dem_all_zero = (
    data_wheat_dropna[['aspect', 'elevation', 'hillshade', 'slope']] == 0
).all(axis=1)
data_wheat_dropna = data_wheat_dropna[~wheat_dem_all_zero]

# Take the feature columns and order them like the training matrix X.
index_X = X.columns.tolist()
X_wheat = data_wheat_dropna.iloc[:, np.arange(2, 94)]
X_wheat = X_wheat[index_X]

# normalization with the scaler fitted on the training data
X_wheat_nor = scaler.transform(X_wheat)

# predict unknown sample and keep the X / Y columns next to the prediction
y_wheat_unknow = model_wheat.predict(X_wheat_nor)
wheat_result = pd.DataFrame({
    'X': data_wheat_dropna['X'],
    'Y': data_wheat_dropna['Y'],
    'predicted': y_wheat_unknow,
})
wheat_result


################## maize #########################
# Load the soil / N-input table and keep the positional columns used below.
maize_soil_cols = np.concatenate(([2, 3, 4, 8, 9, 10], np.arange(14, 44)))
data_maize1 = pd.read_csv('data/ml_data/input_data/maize_N_hwsd.txt', index_col="OBJECTID")
data_maize1 = data_maize1.iloc[:, maize_soil_cols]

# Load the climate table and keep positional columns 2..59.
data_maize2 = pd.read_csv('data/ml_data/input_data/maize_climate.txt', index_col="OBJECTID")
data_maize2 = data_maize2.iloc[:, np.arange(2, 60)]

# Inner-join both tables on OBJECTID and rename the columns from index.csv.
data_maize = pd.merge(data_maize1, data_maize2, how='inner', on='OBJECTID')
index = pd.read_csv('data/ml_data/input_data/index.csv', index_col="ID")
data_maize.columns = index['index']

# Force the two soil-temperature columns to numeric; bad values become NaN.
for soil_temp_col in ("Soil_temp_5", "Soil_temp_15"):
    data_maize[soil_temp_col] = pd.to_numeric(data_maize[soil_temp_col], errors='coerce')

# Treat +/-inf as missing, then drop every row with a missing value.
data_maize_finite = data_maize.replace([np.inf, -np.inf], np.nan)
data_maize_dropna = data_maize_finite.dropna(axis=0, how='any')

# Remove the rows whose DEM data are 0 (all four terrain attributes zero).
maize_dem_all_zero = (
    data_maize_dropna[['aspect', 'elevation', 'hillshade', 'slope']] == 0
).all(axis=1)
data_maize_dropna = data_maize_dropna[~maize_dem_all_zero]

# Take the feature columns and order them like the training matrix X.
index_X = X.columns.tolist()
X_maize = data_maize_dropna.iloc[:, np.arange(2, 94)]
X_maize = X_maize[index_X]

# normalization with the scaler fitted on the training data
X_maize_nor = scaler.transform(X_maize)

# predict unknown sample and keep the X / Y columns next to the prediction
y_maize_unknow = model_maize.predict(X_maize_nor)
maize_result = pd.DataFrame({
    'X': data_maize_dropna['X'],
    'Y': data_maize_dropna['Y'],
    'predicted': y_maize_unknow,
})
maize_result


# export data for mapping: one CSV-formatted text file per crop
for crop_frame, out_path in (
    (double_rice_result, 'data/ml_data/output_data/yield_double_rice.txt'),
    (rice_result, 'data/ml_data/output_data/yield_rice.txt'),
    (wheat_result, 'data/ml_data/output_data/yield_wheat.txt'),
    (maize_result, 'data/ml_data/output_data/yield_maize.txt'),
):
    crop_frame.to_csv(out_path)
