# In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import pingouin as pg

# read data
# Every table below lives in the same folder and is indexed by its OBJECTID
# column, so the shared pieces are spelled once instead of ten times.
_OUTPUT_DIR = 'data/ml_data/output_data/'


def _read_output(filename):
    """Read `filename` from _OUTPUT_DIR as a DataFrame indexed by OBJECTID."""
    return pd.read_csv(_OUTPUT_DIR + filename, index_col="OBJECTID")


ch4 = _read_output('ch4_rice1.txt')
n2o = _read_output('n2o_rice1.txt')
nh3 = _read_output('nh3_rice1.txt')
# Path fixed: this line previously read 'ldata/ml_data/output_data/eaching_rice1.txt'
# (the leading "l" of "leaching" had been typed in front of "data").
leaching = _read_output('leaching_rice1.txt')
runoff = _read_output('runoff_rice1.txt')

# Companion "*_bk" tables, one per pathway above.
ch4_bk = _read_output('rice_ch4_bk1.txt')
n2o_bk = _read_output('rice_n2o_bk1.txt')
nh3_bk = _read_output('rice_nh3_bk1.txt')
leaching_bk = _read_output('rice_leaching_bk1.txt')
runoff_bk = _read_output('rice_runoff_bk1.txt')

# cal gwp and nr losses
# Each pathway term multiplies a table's 'predicted' column by the 'predicted'
# column of its *_bk counterpart and by 0.01 (the N2O term carries an extra 0.001).
# NOTE(review): the 0.01 looks like a percent-to-fraction conversion and the
# 0.001 like a unit change — confirm against the data description.
_n2o_direct = 0.001*n2o_bk['predicted']*n2o['predicted']*0.01
_nh3_for_n2o = nh3_bk['predicted']*nh3['predicted']*0.01
n2o_total = 1.0075*_n2o_direct + 0.01*_nh3_for_n2o

# NOTE(review): 25 and 298 are presumably 100-year warming potentials for CH4
# and N2O, and 16/12 and 44/28 presumably C->CH4 and N->N2O mass ratios — confirm.
_ch4_weight = 25*(16/12)
_n2o_weight = 298*(44/28)
_gwp_value = _ch4_weight*ch4['predicted']*ch4_bk['predicted']*0.01 + _n2o_weight*n2o_total
# Coordinates come from the ch4 table; 'value' is aligned to them by index.
gwp = ch4[['X', 'Y']].assign(value=_gwp_value)

# Nr loss: sum of the N2O, leaching, runoff and NH3 terms.
_nr_n2o = n2o['predicted']*0.001*n2o_bk['predicted']*0.01
_nr_leaching = leaching['predicted']*leaching_bk['predicted']*0.01
_nr_runoff = runoff['predicted']*runoff_bk['predicted']*0.01
_nr_nh3 = nh3['predicted']*nh3_bk['predicted']*0.01
nr = ch4[['X', 'Y']].assign(value=_nr_n2o + _nr_leaching + _nr_runoff + _nr_nh3)
# Bare expression: only displays something when run as the last line of a notebook cell.
nr

# read predictors
# First table: keep the columns at positions 2-4, 8-10 and 14-43.
_cols_rice1 = np.hstack((2, 3, 4, 8, 9, 10, np.arange(14, 44)))
data_rice1 = pd.read_csv('data/ml_data/input_data/rice_N_hwsd.txt', index_col = "OBJECTID").iloc[:, _cols_rice1]
# Second table: keep the columns at positions 2-59.
_cols_rice2 = np.arange(2, 60)
data_rice2 = pd.read_csv('data/ml_data/input_data/rice_climate.txt', index_col = "OBJECTID").iloc[:, _cols_rice2]

# Keep only the OBJECTIDs present in both tables.
data_rice = data_rice1.merge(data_rice2, how='inner', on='OBJECTID')

# Column names are replaced wholesale by the 'index' column of index.csv,
# so that file must list one name per merged column, in order.
index = pd.read_csv('data/ml_data/input_data/index.csv', index_col = "ID")
data_rice.columns = index['index']

# Force the two soil-temperature columns to numeric; unparseable entries become NaN.
for _col in ("Soil_temp_5", "Soil_temp_15"):
    data_rice[_col] = pd.to_numeric(data_rice[_col], errors='coerce')

# Treat +/-inf as missing, then drop every row with any missing value.
data_rice_dropna = data_rice.replace([np.inf, -np.inf], np.nan).dropna()

# remove 0 in dem
# A row is kept when at least one of the four terrain columns is non-zero,
# i.e. rows where all four are exactly 0 are discarded.
_has_dem = (
    (data_rice_dropna['aspect'] != 0)
    | (data_rice_dropna['elevation'] != 0)
    | (data_rice_dropna['hillshade'] != 0)
    | (data_rice_dropna['slope'] != 0)
)
data_rice_dropna = data_rice_dropna[_has_dem]

# Append the GWP value as the last column, aligned on the index.
# NOTE(review): concat uses an outer join by default, so an OBJECTID present in
# only one of the two inputs still yields a row (with NaN on the missing side).
d = pd.concat([data_rice_dropna, gwp['value']], axis=1)

### sub-window analysis

# NOTE(review): presumably the grid spacing of the X/Y coordinates (~1/12) — confirm.
deta = 0.0833333
# Predictor names: the columns of `d` at positions 2..93.
var_nm = list(d)[2:94]
# The window reaches 9 steps of `deta` on each side of the centre point.
half_width = 9*deta


def _top_partial_corr(window, predictors, top=3):
    """Rank predictors by |partial correlation| with 'value' inside one window.

    For each name in `predictors`, the partial correlation between the 'value'
    column and that predictor is computed while controlling for every *other*
    predictor.

    window     -- DataFrame holding 'value' and all `predictors` columns.
    predictors -- list of column names to evaluate.
    top        -- how many of the strongest predictors to report.

    Returns (names, coefficients): two lists, ordered by decreasing |r|.
    """
    results = []
    for k, label in enumerate(predictors):
        # Control for all predictors except the one under test. The earlier
        # slice `var_nm[1:k]` dropped the first predictor from the control
        # set for every k >= 1.
        others = predictors[:k] + predictors[k + 1:]
        results.append(pg.partial_corr(data=window, x='value', y=label, covar=others))

    # One concat instead of DataFrame.append in a loop (removed in pandas 2.0).
    pc = pd.concat(results)
    pc['var_nm'] = predictors
    pc['r_abs'] = pc['r'].abs()
    pc = pc.sort_values(by='r_abs', ascending=False)
    return list(pc['var_nm'].iloc[:top]), list(pc['r'].iloc[:top])


time_start = time.time()
vip_rows = []
r_rows = []
# Column 0 and column 1 of `d` supply the centre of each window.
for i, (x0, y0) in enumerate(zip(d.iloc[:, 0], d.iloc[:, 1])):
    x_min = x0 - half_width
    x_max = x0 + half_width
    y_min = y0 - half_width
    y_max = y0 + half_width

    # All points whose coordinates fall inside the square window (edges included).
    subd = d[(d['X'] <= x_max) & (d['X'] >= x_min) & (d['Y'] <= y_max) & (d['Y'] >= y_min)]

    # NOTE(review): 3 rows is the threshold used here, but each fit controls
    # for len(var_nm) - 1 covariates; windows with fewer rows than covariates
    # may give unreliable coefficients — confirm the intended minimum.
    if len(subd) >= 3:
        vip, r = _top_partial_corr(subd, var_nm)
    else:
        # Too few neighbours: record placeholders so row i still lines up with d.
        vip = ['NA', "NA", "NA"]
        r = ['NA', "NA", "NA"]

    vip_rows.append(vip)
    r_rows.append(r)

    print(i)

# One row per point of `d`, columns 0..2 = strongest, 2nd and 3rd predictor.
# Built once from lists; the row index is 0..N-1.
vip_total = pd.DataFrame(vip_rows)
r_total = pd.DataFrame(r_rows)

time_end = time.time()

# Bare expression: only displays the elapsed seconds in an interactive cell.
time_end-time_start

# Columns 0, 1 and 94 of `d`: the two coordinates and the appended target value.
data_pre_XY = d.iloc[:,[0,1,94]]

# Put the three tables side by side row-by-row. Their own indexes are discarded
# first so rows are matched purely by position.
_parts = [data_pre_XY, vip_total, r_total]
vip_total_xy = pd.concat(
    [part.reset_index(drop=True) for part in _parts],
    axis=1,
    ignore_index=True,
)
vip_total_xy.columns = ['X', 'Y', 'predicted', 'vip1', 'vip2', 'vip3','r1','r2','r3']
# Bare expression: only displays the table in an interactive cell.
vip_total_xy
vip_total_xy.to_csv('data/ml_data/output_data/partial_correlation_GWP_second_rice_season.csv')