In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dbfread import DBF

# 读取数据
data = pd.read_csv('data_machine_learning_new.csv',index_col='ID') 
id_X = np.hstack((3, np.arange(8,99)))
id_Y = np.arange(99,113)
X = data.iloc[:,id_X]
Y = data.iloc[:,id_Y]


# 构造建模y数据，并剔除NA值
y = Y['effect_yield']
id_nona = y[y.isnull().values==False].index.tolist()
#id_crop = data.Crop_type[data.Crop_type=='Maize'].index.tolist()
#id_sel = list(set(id_nona).intersection(set(id_crop)))
id_sel = id_nona 

X_sel = X.iloc[id_sel, :]
y_sel = y [id_sel]

id_r = data.Crop_type[data.Crop_type=='Paddy rice'].index.tolist()
id_rice =  list(set(id_nona).intersection(set(id_r)))
x_rice = X.iloc[id_rice, :]
y_rice = y [id_rice]                
              
id_w = data.Crop_type[data.Crop_type=='Wheat'].index.tolist()
id_wheat =  list(set(id_nona).intersection(set(id_w)))
x_wheat = X.iloc[id_wheat, :]
y_wheat = y [id_wheat]  
                 
id_m = data.Crop_type[data.Crop_type=='Maize'].index.tolist()
id_maize =  list(set(id_nona).intersection(set(id_m)))
x_maize = X.iloc[id_maize, :]
y_maize = y [id_maize]                 

id_o = data.Crop_type[data.Crop_type=='Others'].index.tolist()
id_others =  list(set(id_nona).intersection(set(id_o)))
x_others = X.iloc[id_others, :]
y_others = y [id_others]


# 标准化，神经网络一定要标准化
scaler = StandardScaler()
scaler.fit(X_sel)

X_sel = scaler.transform(X_sel)
y_sel = y_sel.values

x_rice = scaler.transform(x_rice)
y_rice = y_rice.values

x_wheat = scaler.transform(x_wheat)
y_wheat = y_wheat.values
                  
x_maize = scaler.transform(x_maize)
y_maize = y_maize.values
               
x_others = scaler.transform(x_others)
y_others = y_others.values


# 划分数据集
x_train_entire, x_test_entire,  y_train_entire, y_test_entire = train_test_split(X_sel, y_sel, test_size = 0.3, random_state = 0)

x_train_rice, x_test_rice,  y_train_rice, y_test_rice = train_test_split(x_rice, y_rice, test_size = 0.3, random_state = 0)

x_train_wheat, x_test_wheat,  y_train_wheat, y_test_wheat = train_test_split(x_wheat, y_wheat, test_size = 0.3, random_state = 0)

x_train_maize, x_test_maize,  y_train_maize, y_test_maize = train_test_split(x_maize, y_maize, test_size = 0.3, random_state = 0)

In [2]:
# 优化模型
param_grid = [
    {'n_estimators': np.arange(50,550, 50), 
     'max_depth': np.arange(2,22, 2),
    }] 
clf_rice = GridSearchCV(RandomForestRegressor(), param_grid, scoring="neg_mean_squared_error", n_jobs=1, cv=10) # n_jobs表示CPU线程数，-1表示全部CPU
clf_rice.fit(x_rice, y_rice)

clf_wheat = GridSearchCV(RandomForestRegressor(), param_grid, scoring="neg_mean_squared_error", n_jobs=1, cv=10) # n_jobs表示CPU线程数，-1表示全部CPU
clf_wheat.fit(x_wheat, y_wheat)

clf_maize = GridSearchCV(RandomForestRegressor(), param_grid, scoring="neg_mean_squared_error", n_jobs=1, cv=10) # n_jobs表示CPU线程数，-1表示全部CPU
clf_maize.fit(x_maize, y_maize)


GridSearchCV(cv=10, estimator=RandomForestRegressor(), n_jobs=1,
             param_grid=[{'max_depth': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20]),
                          'n_estimators': array([ 50, 100, 150, 200, 250, 300, 350, 400, 450, 500])}],
             scoring='neg_mean_squared_error')

In [3]:
print(clf_rice.best_params_) 
print(clf_wheat.best_params_) 
print(clf_maize.best_params_) 
print(clf_rice.cv_results_)
print(clf_wheat.cv_results_)
print(clf_maize.cv_results_)


model_rice = clf_rice.best_estimator_ 
model_wheat = clf_wheat.best_estimator_ 
model_maize = clf_maize.best_estimator_ 

{'max_depth': 2, 'n_estimators': 300}
{'max_depth': 4, 'n_estimators': 200}
{'max_depth': 14, 'n_estimators': 50}
{'mean_fit_time': array([0.09005649, 0.18660116, 0.27785676, 0.37439876, 0.44680762,
       0.54464009, 0.63051047, 0.71559119, 0.80604355, 0.89350116,
       0.14202728, 0.28319032, 0.43004956, 0.57296536, 0.7094022 ,
       0.85222032, 1.00720599, 1.13407261, 1.27250447, 1.42529263,
       0.19617693, 0.37390544, 0.57187302, 0.75517645, 0.93480191,
       1.13994763, 1.32715261, 1.51404412, 1.7080308 , 1.88794899,
       0.2261975 , 0.45089376, 0.68547192, 0.91764545, 1.15062547,
       1.35985935, 1.58935103, 1.81483641, 2.04153876, 2.26440723,
       0.25402372, 0.51541736, 0.77453501, 1.03550227, 1.2764931 ,
       1.57191515, 1.81723881, 2.0703649 , 2.32468154, 2.55436699,
       0.27397277, 0.54733574, 0.81995425, 1.09617603, 1.37078254,
       1.64461577, 1.91143415, 2.18376107, 2.47938619, 2.73518267,
       0.28124764, 0.5643899 , 0.84523916, 1.15291264, 1.4263429

cv_result = pd.DataFrame(clf.cv_results_)
cv_result = cv_result[['param_max_depth', 'param_n_estimators', 'mean_test_score']]
cv_result['mean_test_score'] = np.sqrt(-cv_result['mean_test_score'])
cv_result.columns = ['param_max_depth', 'param_n_estimators', 'rmsecv']

cv_result.to_csv('D:\\1 博后期间\\papers\\CRU_meta-analysis\\machine_learning_new\\mapping\\results\\mapping_cv_rf_yield_rice.csv')

# 查看、构造最优模型并训练
print(clf.best_params_)
print(clf.cv_results_)
model_rice = clf.best_estimator_
model_rice.fit(x_rice, y_rice)

In [5]:
#model_rice = RandomForestRegressor(max_depth=18, n_estimators=100)
#model_wheat = RandomForestRegressor(max_depth=10, n_estimators=100)
#model_maize = RandomForestRegressor(max_depth=8, n_estimators=150)

model_rice.fit(x_rice, y_rice)
model_wheat.fit(x_wheat, y_wheat)
model_maize.fit(x_maize, y_maize)

# 预测
y_rice_pred = model_rice.predict(x_rice)
y_wheat_pred = model_wheat.predict(x_wheat)
y_maize_pred = model_maize.predict(x_maize)

# 预测结果汇总
result_rice = pd.DataFrame(columns=['crop', 'mae', 'r2', 'rmse', 'rpd', 'rpiq'])
result_rice['crop'] = ['Paddy rice']
result_rice['mae'] = [metrics.mean_absolute_error(y_rice, y_rice_pred)]
result_rice["r2"] = metrics.r2_score(y_rice, y_rice_pred)
result_rice['rmse'] = np.sqrt(metrics.mean_squared_error(y_rice, y_rice_pred))
result_rice['rpd'] = np.std(y_rice)/np.sqrt(metrics.mean_squared_error(y_rice, y_rice_pred)) # 加[0]是为了去掉表头
result_rice['rpiq'] = np.std(y_rice)/np.sqrt(metrics.mean_squared_error(y_rice, y_rice_pred)) # 加[0]是为了去掉表头
result_rice['rpiq'] = (np.percentile(y_rice, (25, 75))[1]-np.percentile(y_rice, (25, 75))[0])/np.sqrt(metrics.mean_squared_error(y_rice, y_rice_pred))

result_wheat = pd.DataFrame(columns=['crop', 'mae', 'r2', 'rmse', 'rpd', 'rpiq'])
result_wheat['crop'] = ['Wheat ']
result_wheat['mae'] = [metrics.mean_absolute_error(y_wheat, y_wheat_pred)]
result_wheat["r2"] = metrics.r2_score(y_wheat, y_wheat_pred)
result_wheat['rmse'] = np.sqrt(metrics.mean_squared_error(y_wheat , y_wheat_pred))
result_wheat['rpd'] = np.std(y_wheat )/np.sqrt(metrics.mean_squared_error(y_wheat, y_wheat_pred)) # 加[0]是为了去掉表头
result_wheat['rpiq'] = np.std(y_wheat )/np.sqrt(metrics.mean_squared_error(y_wheat, y_wheat_pred)) # 加[0]是为了去掉表头
result_wheat['rpiq'] = (np.percentile(y_wheat, (25, 75))[1]-np.percentile(y_wheat, (25, 75))[0])/np.sqrt(metrics.mean_squared_error(y_wheat, y_wheat_pred))

result_maize = pd.DataFrame(columns=['crop', 'mae', 'r2', 'rmse', 'rpd', 'rpiq'])
result_maize['crop'] = ['Maize ']
result_maize['mae'] = [metrics.mean_absolute_error(y_maize , y_maize_pred)]
result_maize["r2"] = metrics.r2_score(y_maize , y_maize_pred)
result_maize['rmse'] = np.sqrt(metrics.mean_squared_error(y_maize, y_maize_pred))
result_maize['rpd'] = np.std(y_maize)/np.sqrt(metrics.mean_squared_error(y_maize, y_maize_pred)) # 加[0]是为了去掉表头
result_maize['rpiq'] = np.std(y_maize)/np.sqrt(metrics.mean_squared_error(y_maize, y_maize_pred)) # 加[0]是为了去掉表头
result_maize['rpiq'] = (np.percentile(y_maize, (25, 75))[1]-np.percentile(y_maize, (25, 75))[0])/np.sqrt(metrics.mean_squared_error(y_maize, y_maize_pred))


result = pd.concat([result_rice, result_wheat, result_maize], axis=0)
result

Unnamed: 0,crop,mae,r2,rmse,rpd,rpiq
0,Paddy rice,3.763633,0.691403,5.358622,1.800131,1.919298
0,Wheat,3.701095,0.582465,6.535828,1.547581,1.350317
0,Maize,4.147376,0.586901,6.006728,1.555869,1.449373


In [6]:
#读取数据rice
data_rice1 = pd.read_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/rice_N_hwsd.txt', index_col = "OBJECTID")
data_rice1 = data_rice1.iloc[:,np.hstack((2, 3, 4, 8, 9, 10, np.arange(14,44)))]
data_rice2 = pd.read_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/rice_climate.txt', index_col = "OBJECTID")
data_rice2 =  data_rice2.iloc[:,np.hstack(np.arange(2,60))]
data_rice = pd.merge(data_rice1,data_rice2,how='inner',on='OBJECTID')
index = pd.read_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/index.csv', index_col = "ID")
data_rice.columns = index['index']
data_rice["Soil_temp_5"] = pd.to_numeric(data_rice["Soil_temp_5"],errors='coerce')
data_rice["Soil_temp_15"] = pd.to_numeric(data_rice["Soil_temp_15"],errors='coerce')

data_rice_dropna  = data_rice.replace([np.inf, -np.inf], np.nan).dropna(axis=0,how='any')

# 去掉DEM为0的数据
data_rice_dropna = data_rice_dropna[~((data_rice_dropna['aspect'] == 0) & 
                                        (data_rice_dropna['elevation'] == 0) & 
                                        (data_rice_dropna['hillshade'] == 0) & 
                                        (data_rice_dropna['slope'] == 0)) ]

# 然后获取已知样本的列名称
index_X = X.columns.tolist()

#只选择施肥量大于25的区域
#X_rice_unknow_new = data_rice_dropna .query('N_fertilization_rate>=25')

# 将未知样本列名称
X_rice = data_rice_dropna.iloc[:,np.arange(2,94)]
#X_rice = X_rice[index_X]

# 标准化
X_rice_nor = scaler.transform(X_rice)

# 预测未知样本
y_rice_unknow = model_rice.predict(X_rice_nor)

# rice
rice_result = pd.DataFrame(columns=['X', 'Y', 'predicted'])
rice_result['X'] = data_rice_dropna['X']
rice_result['Y'] = data_rice_dropna['Y']
rice_result['predicted'] = y_rice_unknow
rice_result

Unnamed: 0_level_0,X,Y,predicted
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,90.791664,56.875000,9.219235
2,95.041664,56.875000,9.519521
3,90.708336,56.791668,8.549174
4,93.041664,56.791668,9.762869
5,94.875000,56.791668,9.762869
...,...,...,...
156123,-71.708336,-36.125000,6.602159
156124,-71.625000,-36.125000,7.235598
156125,-71.541664,-36.125000,6.681735
156126,-71.708336,-36.208332,6.634084


In [7]:
#读取数据wheat
data_wheat1 = pd.read_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/wheat_N_hwsd.txt', index_col = "OBJECTID")
data_wheat1 = data_wheat1.iloc[:,np.hstack((2, 3, 4, 8, 9, 10, np.arange(14,44)))]
data_wheat2 = pd.read_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/wheat_climate.txt', index_col = "OBJECTID")
data_wheat2 =  data_wheat2.iloc[:,np.hstack(np.arange(2,60))]
data_wheat = pd.merge(data_wheat1,data_wheat2,how='inner',on='OBJECTID')
index = pd.read_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/index.csv', index_col = "ID")
data_wheat.columns = index['index']
data_wheat["Soil_temp_5"] = pd.to_numeric(data_wheat["Soil_temp_5"],errors='coerce')
data_wheat["Soil_temp_15"] = pd.to_numeric(data_wheat["Soil_temp_15"],errors='coerce')

data_wheat_dropna  = data_wheat.replace([np.inf, -np.inf], np.nan).dropna(axis=0,how='any')

# 去掉DEM为0的数据
data_wheat_dropna = data_wheat_dropna[~((data_wheat_dropna['aspect'] == 0) & 
                                        (data_wheat_dropna['elevation'] == 0) & 
                                        (data_wheat_dropna['hillshade'] == 0) & 
                                        (data_wheat_dropna['slope'] == 0)) ]

# 然后获取已知样本的列名称
index_X = X.columns.tolist()
# 将未知样本列名称
X_wheat = data_wheat_dropna.iloc[:,np.arange(2,94)]
X_wheat = X_wheat[index_X]
# 标准化
X_wheat_nor = scaler.transform(X_wheat)

# 预测未知样本
y_wheat_unknow = model_wheat.predict(X_wheat_nor)

# wheat
wheat_result = pd.DataFrame(columns=['X', 'Y', 'predicted'])
wheat_result['X'] = data_wheat_dropna['X']
wheat_result['Y'] = data_wheat_dropna['Y']
wheat_result['predicted'] = y_wheat_unknow
wheat_result

Unnamed: 0_level_0,X,Y,predicted
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1335,-123.875000,59.958332,5.123903
1336,-1.291667,59.958332,5.025878
1337,10.291667,59.958332,4.850266
1338,10.458333,59.958332,4.715528
1340,10.958333,59.958332,9.043033
...,...,...,...
306951,169.458328,-46.208332,7.019200
306952,169.541672,-46.208332,7.167975
306953,168.125000,-46.291668,5.843010
306954,169.541672,-46.291668,6.138608


In [8]:
#读取数据maize
data_maize1 = pd.read_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/maize_N_hwsd.txt', index_col = "OBJECTID")
data_maize1 = data_maize1.iloc[:,np.hstack((2, 3, 4, 8, 9, 10, np.arange(14,44)))]
data_maize2 = pd.read_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/maize_climate.txt', index_col = "OBJECTID")
data_maize2 =  data_maize2.iloc[:,np.hstack(np.arange(2,60))]
data_maize = pd.merge(data_maize1,data_maize2,how='inner',on='OBJECTID')
index = pd.read_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/index.csv', index_col = "ID")
data_maize.columns = index['index']
data_maize["Soil_temp_5"] = pd.to_numeric(data_maize["Soil_temp_5"],errors='coerce')
data_maize["Soil_temp_15"] = pd.to_numeric(data_maize["Soil_temp_15"],errors='coerce')

data_maize_dropna  = data_maize.replace([np.inf, -np.inf], np.nan).dropna(axis=0,how='any')

# 去掉DEM为0的数据
data_maize_dropna = data_maize_dropna[~((data_maize_dropna['aspect'] == 0) & 
                                        (data_maize_dropna['elevation'] == 0) & 
                                        (data_maize_dropna['hillshade'] == 0) & 
                                        (data_maize_dropna['slope'] == 0)) ]

# 然后获取已知样本的列名称
index_X = X.columns.tolist()
# 将未知样本列名称
X_maize = data_maize_dropna.iloc[:,np.arange(2,94)]
X_maize = X_maize[index_X]
# 标准化
X_maize_nor = scaler.transform(X_maize)

# 预测未知样本
y_maize_unknow = model_maize.predict(X_maize_nor)

# wheat
maize_result = pd.DataFrame(columns=['X', 'Y', 'predicted'])
maize_result['X'] = data_maize_dropna['X']
maize_result['Y'] = data_maize_dropna['Y']
maize_result['predicted'] = y_maize_unknow
maize_result

Unnamed: 0_level_0,X,Y,predicted
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,72.541664,54.041668,8.275654
3,73.041664,54.041668,8.406866
4,73.125000,54.041668,8.464200
5,73.208336,54.041668,8.699007
9,7.375000,53.625000,7.100980
...,...,...,...
268906,170.291672,-45.125000,4.407817
268907,170.125000,-45.208332,5.086675
268908,168.208328,-45.708332,-1.440655
268909,168.458328,-45.791668,-0.231478


In [9]:
rice_result.to_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/results/yield_rice1.txt')
wheat_result.to_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/results/yield_wheat1.txt')
maize_result.to_csv('D:/1 博后期间/papers/CRU_meta-analysis/global_analysis/data_summary/results/yield_maize1.txt')