winter: 12, 1, 2  
spring_migration: 3, 4, 5  
breeding: 6, 7, 8  
fall_migration: 9, 10, 11  

In [1]:
import xgboost as xgb
import pandas as pd
import os 
import numpy as np
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split
import seaborn as sns
import shap
from xgboost.sklearn import XGBClassifier
from pdpbox import pdp
from pdpbox import info_plots

# Turn off pandas' chained-assignment warning for the whole session.
pd.set_option('mode.chained_assignment', None)

In [2]:
def par_dep(xs, frame, model, resolution=20, bins=None):
    """Compute a one-dimensional partial-dependence table for one feature.

    For every grid value the whole ``xs`` column is set to that value,
    ``model.predict`` is called on the modified data, and the mean of the
    first prediction column is recorded.

    Args:
        xs: Name of the column in ``frame`` to vary.
        frame: DataFrame of model inputs. It is left untouched; the
            substitutions are made on a copy.
        model: Any object exposing ``predict(frame)``.
        resolution: Number of equal steps between the column minimum and
            maximum, used only when no explicit ``bins`` are given.
        bins: Grid values to evaluate. ``None`` (or the legacy ``[None]``
            sentinel) derives the grid from the column range.

    Returns:
        DataFrame with columns ``[xs, 'partial_dependence']``, one row per
        grid value.
    """
    # Accept both None and the old ``[None]`` sentinel so existing callers work.
    if bins is None or (len(bins) > 0 and bins[0] is None):
        min_ = frame[xs].min()
        max_ = frame[xs].max()
        by = (max_ - min_) / resolution
        print(min_, max_, by)
        # np.arange rejects a zero step, so a constant column gets one point.
        bins = np.arange(min_, max_, by) if by > 0 else np.array([min_])

    # Work on a copy: the caller's frame stays intact even if predict() raises.
    work = frame.copy()
    rows = []
    for j in bins:
        work[xs] = j
        predictions = pd.DataFrame(model.predict(work))
        rows.append({xs: j,
                     'partial_dependence': predictions.iloc[:, 0].mean()})

    # Build the result once; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows, columns=[xs, 'partial_dependence'])

def plot_par(variable, par_dataset, xgb_model, save_path=None, plot_bins=None):
    """Plot a feature's histogram with its partial-dependence curve on top.

    The histogram of ``par_dataset[variable]`` is drawn on the left axis and
    the curve returned by ``par_dep`` on a twin right axis, together with a
    dashed horizontal reference line at 0.5.

    Args:
        variable: Column name in ``par_dataset`` to analyse.
        par_dataset: DataFrame of model inputs.
        xgb_model: Fitted model passed through to ``par_dep``.
        save_path: Directory to write ``<variable>.png`` into. When ``None``
            (the default) the figure is only drawn, not saved.
        plot_bins: Grid values for ``par_dep``. ``None`` (or the legacy
            ``[None]`` sentinel) lets ``par_dep`` derive the grid itself.
    """
    # Hand par_dep the ``[None]`` sentinel it has always understood.
    bins = [None] if plot_bins is None else plot_bins
    X = par_dataset[variable]
    par_dep_night = par_dep(variable, par_dataset, xgb_model, bins=bins)

    fig, ax1 = plt.subplots(figsize=(8, 6))
    ax2 = ax1.twinx()
    ax1.hist(X, color='yellowgreen', bins=50, label=variable)
    ax2.plot(par_dep_night[variable], par_dep_night['partial_dependence'],
             label='partial_dependence', c='red', lw=2)
    ax2.axhline(y=0.500, c="blue", ls='--', lw=2)
    fig.legend(fontsize=12)
    plt.margins(x=0)

    # Only write to disk when the caller asked for it.
    if save_path is not None:
        fig.savefig(os.path.join(save_path, '%s.png' % variable))

In [3]:
def hist(xs, frame, bins):
    """Return the fraction of rows of ``frame[xs]`` that fall in each bin.

    Each row of the result holds a bin's lower edge and the share of rows in
    the half-open interval ``[bins[i], bins[i+1])``. The final row uses
    ``bins[-1]`` as its lower edge and counts every value ``>= bins[-1]``.

    Args:
        xs: Column name in ``frame``.
        frame: DataFrame containing the column.
        bins: Ordered sequence of bin edges.

    Returns:
        DataFrame with columns ``[xs, 'hist']``.
    """
    pd.options.mode.chained_assignment = None
    column = frame[xs]
    total = len(column)

    # Pair each edge with its successor to get the closed-open intervals.
    rows = [
        [lower, int(((column >= lower) & (column < upper)).sum()) / total]
        for lower, upper in zip(bins[:-1], bins[1:])
    ]
    # The last edge opens an unbounded bin on the right.
    rows.append([bins[-1], int((column >= bins[-1]).sum()) / total])

    return pd.DataFrame(rows, columns=[xs, 'hist'])

In [4]:
def fit_model(data, season):
    """Fit an XGBoost classifier on the rows belonging to one season.

    Args:
        data: DataFrame whose first column is the label and which has a
            ``'month'`` column; every other column is used as a feature.
        season: Collection of month numbers to keep, e.g. ``[3, 4, 5]``.

    Returns:
        Tuple ``(X_total, xgb_model)``: the full feature frame for the season
        (train and test rows together) and the fitted classifier.
    """
    season_df = data[data['month'].isin(season)]
    X_total = season_df.iloc[:, 1:].drop(columns=['month'])
    y_total = season_df.iloc[:, 0]

    # The split is random on each call (no random_state is fixed).
    X_train, X_test, y_train, y_test = train_test_split(
        X_total, y_total, test_size=0.33)

    # The unused xgb.DMatrix copies of the train/test sets were dropped:
    # the sklearn-style estimator below consumes the frames directly.
    xgb_model = XGBClassifier(
        max_depth=12,
        learning_rate=0.01,
        n_estimators=200,
        verbosity=1,
        objective='binary:logistic',
        booster='gbtree',
        gamma=0.4,
        min_child_weight=6,
        subsample=0.8,
        reg_lambda=1,
        # Start boosting from the mean of the training labels.
        base_score=y_train.mean(),
        eval_metric='auc',
    )
    # NOTE(review): newer xgboost releases appear to expect
    # early_stopping_rounds on the estimator constructor instead of fit();
    # confirm against the installed version before upgrading.
    xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)],
                  early_stopping_rounds=50, verbose=False)
    return X_total, xgb_model

In [5]:
## Write every hist and pdp table; inputs are X_total and the model
def out_pdp_hist(par_dataset, xgb_model, season, save_path=None, variables=None):
    """Export partial-dependence and histogram CSVs for a set of variables.

    For each variable the bin edges are looked up in the module-level
    ``V_bins`` dict, then ``par_dep`` and ``hist`` are evaluated on those
    edges and written to ``<save_path>/<season>/pdp_<variable>.csv`` and
    ``<save_path>/<season>/hist_<variable>.csv``.

    Args:
        par_dataset: DataFrame of model inputs.
        xgb_model: Fitted model passed through to ``par_dep``.
        season: Name of the sub-directory the files are written into.
        save_path: Base output directory. Required despite the ``None``
            default, which is kept only for signature compatibility.
        variables: Column names to export; defaults to
            ``['distance_light']``.

    Raises:
        ValueError: If ``save_path`` is ``None``.
    """
    if save_path is None:
        raise ValueError(
            'save_path is required: it is the directory the CSV files are '
            'written under')
    if variables is None:
        variables = ['distance_light']

    # Joining instead of concatenating works with or without a trailing slash.
    season_dir = os.path.join(save_path, '%s' % season)
    os.makedirs(season_dir, exist_ok=True)

    for v in variables:
        plot_bin = V_bins[v]
        par_dep_v = par_dep(v, par_dataset, xgb_model, bins=plot_bin)
        hist_v = hist(v, par_dataset, bins=plot_bin)

        pdp_file = os.path.join(season_dir, '%s_%s.csv' % ('pdp', v))
        hist_file = os.path.join(season_dir, '%s_%s.csv' % ('hist', v))
        par_dep_v.to_csv(pdp_file, index=False)
        hist_v.to_csv(hist_file, index=False)
        print(pdp_file)

# season

In [6]:
# Work from the data directory: 'west.csv' and the relative output paths used
# afterwards are resolved against it.
path = r'../../data/west'
os.chdir(path)
data = pd.read_csv('west.csv', engine='c')

# Derive the calendar month from each date value.
data['month'] = [pd.Timestamp(stamp).month for stamp in data['date']]

# Drop the columns that are not used further on.
data = data.drop(columns=[
    'date',
    'lon',
    'lat',
    'Open Water',
    'Urban-Built-up',
    'elevation',
    'night light',
])

In [7]:
# Bin edges for every feature column (all columns except the first one and
# 'month'), looked up by out_pdp_hist.
V_bins = {}
resolution = 20
for column in data.iloc[:, 1:].drop(columns=['month']).columns:
    # linspace gives resolution + 1 evenly spaced edges including both ends.
    # Unlike arange with a float step plus an appended maximum, it cannot
    # produce a near-duplicate last edge and it accepts constant columns.
    V_bins[column] = np.linspace(data[column].min(), data[column].max(),
                                 resolution + 1)

In [8]:
# Month numbers of each season; wintering wraps around the year end.
wintering = [12, 1, 2]
spring_migration = list(range(3, 6))
breeding = list(range(6, 9))
fall_migration = list(range(9, 12))
seasons = [
    spring_migration,
    breeding,
    fall_migration,
    wintering,
]

# Bare expression: shows the four lists when run as a notebook cell.
spring_migration, breeding, fall_migration, wintering

In [None]:
# Spring migration: fit the model, plot gain-based feature importance, then
# export the pdp / hist tables.
save = r'../../result/figure_data/season_pdp/west/'
X_total1, xgb_model1 = fit_model(data, spring_migration)
xgb.plot_importance(xgb_model1, importance_type='gain')
out_pdp_hist(
    par_dataset=X_total1,
    xgb_model=xgb_model1,
    season='spring',
    save_path=save,
    variables=None,
)

In [None]:
# Breeding season: fit the model, plot gain-based feature importance, then
# export the pdp / hist tables.
save = r'../../result/figure_data/season_pdp/west/'
X_total2, xgb_model2 = fit_model(data, breeding)
xgb.plot_importance(xgb_model2, importance_type='gain')
out_pdp_hist(
    par_dataset=X_total2,
    xgb_model=xgb_model2,
    season='breeding',
    save_path=save,
    variables=None,
)

In [None]:
# Fall migration: fit the model, plot gain-based feature importance, then
# export the pdp / hist tables.
save = r'../../result/figure_data/season_pdp/west/'
X_total3, xgb_model3 = fit_model(data, fall_migration)
xgb.plot_importance(xgb_model3, importance_type='gain')
out_pdp_hist(
    par_dataset=X_total3,
    xgb_model=xgb_model3,
    season='fall',
    save_path=save,
    variables=None,
)

In [None]:
# Wintering season: fit the model, plot gain-based feature importance, then
# export the pdp / hist tables.
save = r'../../result/figure_data/season_pdp/west/'
X_total4, xgb_model4 = fit_model(data, wintering)
xgb.plot_importance(xgb_model4, importance_type='gain')
out_pdp_hist(
    par_dataset=X_total4,
    xgb_model=xgb_model4,
    season='wintering',
    save_path=save,
    variables=None,
)