In [None]:
%reset

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#################prepare data#########################
# Read the full table. The 'ID' column becomes the row index, so ID values are
# labels, not row positions. Rows are therefore selected below with boolean
# masks, which keeps X and y aligned whatever the ID values are and preserves
# the file's row order (the train/test split depends on that order).
data = pd.read_csv('data/ml_data/input_data/data_machine_learning_new.csv', index_col='ID')
id_X = np.hstack((3, np.arange(8, 99)))  # predictors: column 3 plus columns 8..98
id_Y = np.arange(106, 113)               # responses: columns 106..112
X = data.iloc[:, id_X]
Y = data.iloc[:, id_Y]

# extract y, remove NA
y = Y['effect_yield']
has_y = y.notna().to_numpy()
id_nona = y.index[has_y].tolist()
id_sel = id_nona

X_sel = X.loc[has_y]
y_sel = y.loc[has_y]

# Per-crop subsets: rows of the given crop type that also have a yield value.
is_r = (data['Crop_type'] == 'Paddy rice').to_numpy()
id_r = data.index[is_r].tolist()
is_rice = has_y & is_r
id_rice = data.index[is_rice].tolist()
x_rice = X.loc[is_rice]
y_rice = y.loc[is_rice]

is_w = (data['Crop_type'] == 'Wheat').to_numpy()
id_w = data.index[is_w].tolist()
is_wheat = has_y & is_w
id_wheat = data.index[is_wheat].tolist()
x_wheat = X.loc[is_wheat]
y_wheat = y.loc[is_wheat]

is_m = (data['Crop_type'] == 'Maize').to_numpy()
id_m = data.index[is_m].tolist()
is_maize = has_y & is_m
id_maize = data.index[is_maize].tolist()
x_maize = X.loc[is_maize]
y_maize = y.loc[is_maize]

# x_others / y_others were scaled below without ever being defined (NameError).
# NOTE(review): "others" is taken here to mean every remaining crop type with a
# yield value - confirm that this is the intended subset.
is_others = has_y & ~(is_r | is_w | is_m)
id_others = data.index[is_others].tolist()
x_others = X.loc[is_others]
y_others = y.loc[is_others]

# normalization
# The scaler is fitted on every selected row, i.e. before the train/test split.
scaler = StandardScaler()
scaler.fit(X_sel)

X_sel = scaler.transform(X_sel)
y_sel = y_sel.values

x_rice = scaler.transform(x_rice)
y_rice = y_rice.values

x_wheat = scaler.transform(x_wheat)
y_wheat = y_wheat.values

x_maize = scaler.transform(x_maize)
y_maize = y_maize.values

# There may be no "other" rows at all; skip the transform in that case, since
# scikit-learn's input validation rejects arrays with zero samples.
x_others = scaler.transform(x_others) if len(x_others) else x_others.to_numpy(dtype=float)
y_others = y_others.values

# divide train and test sets (70 % / 30 %, fixed seed for reproducibility)
x_train_entire, x_test_entire, y_train_entire, y_test_entire = train_test_split(X_sel, y_sel, test_size=0.3, random_state=0)
x_train_rice, x_test_rice, y_train_rice, y_test_rice = train_test_split(x_rice, y_rice, test_size=0.3, random_state=0)
x_train_wheat, x_test_wheat, y_train_wheat, y_test_wheat = train_test_split(x_wheat, y_wheat, test_size=0.3, random_state=0)
x_train_maize, x_test_maize, y_train_maize, y_test_maize = train_test_split(x_maize, y_maize, test_size=0.3, random_state=0)


############ optimize model's parameters##############
# Tune the number of PLS components (1..19) by 10-fold cross-validation on the
# rice training set, scored by negative mean squared error.
param_grid = [
    {'n_components': np.arange(1, 20, 1)}
]
clf = GridSearchCV(PLSRegression(), param_grid, scoring="neg_mean_squared_error", n_jobs=1, cv=10)
clf.fit(x_train_rice, y_train_rice)

# export cross-validation results
# Columns are picked by name: their positions in cv_results_ shift whenever the
# number of folds or tuned parameters changes. .copy() makes the write below
# act on an independent frame rather than on a slice.
cv_result = pd.DataFrame(clf.cv_results_)
cv_result = cv_result[['param_n_components', 'mean_test_score']].copy()
# Turn the mean negative MSE into an RMSE; the column keeps its original name.
cv_result["mean_test_score"] = np.sqrt(np.absolute(cv_result["mean_test_score"]))
cv_result.to_csv('data/ml_data/output_data/cv_pls_yield_rice.csv')

# view the optimal setting and take the optimal model
# The full cv_results_ dict is no longer printed; its summary is in the CSV above.
print(clf.best_params_)
# With the default refit=True, best_estimator_ has already been refitted on the
# whole training set, so no additional fit() call is needed here.
model_rice = clf.best_estimator_

############ train model and validation##############
# train model
model_rice.fit(x_train_rice, y_train_rice)
# prediction train and test sets
y_train_rice_pred = model_rice.predict(x_train_rice)
y_test_rice_pred = model_rice.predict(x_test_rice)


def _performance_metrics(y_true, y_pred, suffix):
    """Return MAE, R2, RMSE, RPD and RPIQ for one data set.

    Keys are '<metric>_<suffix>'. RPD is the standard deviation of the measured
    values (np.std default, ddof=0) divided by the RMSE; RPIQ is their
    interquartile range (75th minus 25th percentile) divided by the RMSE.
    """
    rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))
    q25, q75 = np.percentile(y_true, (25, 75))
    return {
        'mae_' + suffix: metrics.mean_absolute_error(y_true, y_pred),
        'r2_' + suffix: metrics.r2_score(y_true, y_pred),
        'rmse_' + suffix: rmse,
        'rpd_' + suffix: np.std(y_true) / rmse,
        'rpiq_' + suffix: (q75 - q25) / rmse,
    }


# summary and calculation of prediction performance
# One-row table: calibration (train) metrics followed by validation (test) metrics.
# The earlier version first filled rpiq_train with the RPD formula and then
# overwrote it; only the interquartile-range definition is kept.
performance = {}
performance.update(_performance_metrics(y_train_rice, y_train_rice_pred, 'train'))
performance.update(_performance_metrics(y_test_rice, y_test_rice_pred, 'test'))
result = pd.DataFrame(
    [performance],
    columns=['mae_train', 'r2_train', 'rmse_train', 'rpd_train', 'rpiq_train',
             'mae_test', 'r2_test', 'rmse_test', 'rpd_test', 'rpiq_test'],
)

result.to_csv('data/ml_data/output_data/performance_pls_yield_rice.csv')

# fits the predicted and actual values
# First-degree (straight-line) fits of predicted on measured values. Inputs are
# flattened so the fit gets 1-D data whether predict() returned (n,) or (n, 1).
f_train = np.polyfit(np.ravel(y_train_rice), np.ravel(y_train_rice_pred), 1)
p_train = np.poly1d(f_train)
f_test = np.polyfit(np.ravel(y_test_rice), np.ravel(y_test_rice_pred), 1)
p_test = np.poly1d(f_test)

# scatter plot
# Labels and a legend are added so the two sets and the three lines can be told
# apart when the figure is viewed on its own.
line_range = [0, 60]  # x-extent over which the 1:1 line and both fit lines are drawn
fig, ax = plt.subplots()
ax.scatter(y_train_rice, y_train_rice_pred, label='Calibration set')
ax.scatter(y_test_rice, y_test_rice_pred, label='Validation set')
ax.plot(line_range, line_range, color='black', linestyle='--', label='1:1 line')
ax.plot(line_range, p_train(line_range), label='Calibration fit')
ax.plot(line_range, p_test(line_range), label='Validation fit')
ax.set_xlabel('Measured effect_yield')
ax.set_ylabel('Predicted effect_yield')
ax.set_title('PLS regression - Paddy rice')
ax.legend()

# export predicted and actual data
def _prediction_table(measured, predicted, set_name):
    """Build a measured/predicted table tagged with its data set and the crop."""
    return pd.DataFrame(measured, columns=['measured']).assign(
        predicted=predicted,
        set=set_name,
        crop='Paddy rice',
    )


y_train_rice_new = _prediction_table(y_train_rice, y_train_rice_pred, 'Calibration set')
y_test_rice_new = _prediction_table(y_test_rice, y_test_rice_pred, 'Validation set')

# Stack calibration rows above validation rows and write them to one file.
Predicted_to_Measured = pd.concat([y_train_rice_new, y_test_rice_new])
Predicted_to_Measured.to_csv('data/ml_data/output_data/predicted_vs_measured_pls_yield_rice.csv')


###########permutation feature importance##########
# Permutation importance of every predictor on the rice test set
# (30 shuffles per feature, fixed seed).
vip = permutation_importance(model_rice, x_test_rice, y_test_rice,
                             n_repeats=30,
                             random_state=0)

# permutation_importance returns a dict-like Bunch, which has no to_csv();
# put the per-feature mean and standard deviation into a DataFrame first.
# Rows are labelled with the predictor column names of X, which is the frame
# the scaled feature matrices were derived from.
vip_table = pd.DataFrame(
    {
        'importance_mean': vip.importances_mean,
        'importance_std': vip.importances_std,
    },
    index=pd.Index(X.columns, name='feature'),
)
vip_table.to_csv('data/ml_data/output_data/vip_pls_yield_rice.csv')