In [1]:
import xgboost as xgb
import pandas as pd
import os 
import numpy as np
import matplotlib.pyplot as plt  
#from pdpbox import info_plots
from xgboost.sklearn import XGBClassifier
#from pdpbox import pdp
from matplotlib import cm
from sklearn.metrics import roc_auc_score
#from geopy import distance
from time import time
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.inspection import plot_partial_dependence
#%matplotlib widget
# Notebook-wide settings: turn off pandas' chained-assignment warning and
# render every matplotlib figure with the Arial font family.
pd.set_option('mode.chained_assignment', None)
plt.rcParams['font.family'] = 'Arial'

In [81]:
def par_dep(xs, frame, model, resolution=25, bins=None):
    """Compute a one-dimensional partial-dependence curve for column ``xs``.

    For every grid value the whole ``xs`` column of a private copy of
    ``frame`` is set to that value, ``model.predict`` is evaluated on the
    copy, and the mean of the first prediction column is recorded.

    Parameters
    ----------
    xs : str
        Name of the column of ``frame`` to vary.
    frame : pandas.DataFrame
        Feature table handed to ``model.predict``. It is never modified.
    model : object
        Any object exposing ``predict(frame)``.
    resolution : int, default 25
        Number of equal steps between the column minimum and maximum used
        to build the grid when ``bins`` is not supplied. The maximum itself
        is excluded from the grid.
    bins : sequence, optional
        Explicit grid values. ``None`` (or the legacy sentinel ``[None]``)
        requests the automatic min/max grid.

    Returns
    -------
    pandas.DataFrame
        Columns ``[xs, 'partial_dependence']``, one row per grid value.
    """
    # Accept both the new ``None`` default and the historical ``[None]``.
    if bins is None or (len(bins) > 0 and bins[0] is None):
        min_ = frame[xs].min()
        max_ = frame[xs].max()
        by = (max_ - min_) / resolution
        # A constant column yields a zero step, which np.arange rejects;
        # fall back to evaluating the single observed value.
        bins = np.arange(min_, max_, by) if by > 0 else [min_]

    # Work on a copy so the caller's frame stays intact even if predict raises.
    work = frame.copy()
    records = []
    for j in bins:
        work[xs] = j
        # NOTE(review): this averages ``predict`` output; for a classifier
        # that returns hard labels it is not a mean probability - confirm.
        predictions = pd.DataFrame(model.predict(work))
        records.append({xs: j,
                        'partial_dependence': predictions.mean().iloc[0]})

    # Build the result once; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=[xs, 'partial_dependence'])

def plot_par(variable,par_dataset,xgb_model,save_path =None,plot_bins=[None]):
    """Draw a feature histogram with its partial-dependence curve on top.

    The histogram of ``par_dataset[variable]`` goes on the left axis and the
    curve returned by ``par_dep`` on a twin right axis, together with a
    dashed horizontal reference line at 0.5.

    ``save_path`` is accepted but not used by this function; nothing is
    written to disk and nothing is returned.
    """
    feature_values = par_dataset[variable]
    curve = par_dep(variable, par_dataset, xgb_model, bins=plot_bins)

    fig, hist_axis = plt.subplots(figsize=(8, 6))
    curve_axis = hist_axis.twinx()

    # Left axis: distribution of the observed feature values.
    hist_axis.hist(feature_values, bins=50, color='yellowgreen', label=variable)

    # Right axis: partial-dependence curve and the 0.5 reference line.
    curve_axis.plot(
        curve[variable],
        curve['partial_dependence'],
        c='red',
        lw=2,
        label='partial_dependence',
    )
    curve_axis.axhline(y=0.500, c="blue", ls='--', lw=2)

    fig.legend(fontsize=12)
    plt.margins(x=0)
        
def hist(xs, frame,bins):
    """Return the share of rows of ``frame[xs]`` that falls in each bin.

    ``bins`` is read as a sequence of left edges. Every pair of neighbouring
    edges forms a half-open interval ``[left, right)``; the final edge opens
    an unbounded interval ``[bins[-1], +inf)``. Shares are relative to the
    total row count, so rows below ``bins[0]`` are counted in the
    denominator only.

    Returns a DataFrame with columns ``[xs, 'hist']``, one row per edge.
    """
    pd.options.mode.chained_assignment = None
    column = frame[xs]
    total = len(column)

    rows = []
    for left, right in zip(bins[:-1], bins[1:]):
        inside = (column >= left) & (column < right)
        rows.append([left, int(inside.sum()) / total])

    # Everything at or beyond the last edge goes into one open-ended bin.
    last_edge = bins[-1]
    beyond = column >= last_edge
    rows.append([last_edge, int(beyond.sum()) / total])

    return pd.DataFrame(rows, columns=[xs, 'hist'])

## Write the histogram and partial-dependence tables for every requested
## variable; inputs are the feature frame and the fitted model.
def out_pdp_hist(par_dataset,xgb_model,V_bins,region,save_path =None,variables=None):
    """Export one PDP table and one histogram table per variable as CSV.

    For each variable ``v`` the files
    ``<save_path>/<region>/pdp_csv/pdp_<v>.csv`` and
    ``<save_path>/<region>/hist_csv/hist_<v>.csv`` are written without the
    index column. Both output directories are created when missing.

    Parameters
    ----------
    par_dataset : pandas.DataFrame
        Feature table passed on to ``par_dep`` and ``hist``.
    xgb_model : object
        Fitted model passed on to ``par_dep``.
    V_bins : mapping
        Maps each variable name to the bin edges used for both tables.
    region : str
        Name of the sub-directory below ``save_path``.
    save_path : str
        Root output directory. Required despite the ``None`` default.
    variables : iterable of str, optional
        Variables to export; defaults to ``['distance_light']``.

    Raises
    ------
    TypeError
        If ``save_path`` is ``None``.
    """
    if save_path is None:
        # Fail with a clear message instead of an opaque ``None + str`` error.
        raise TypeError("save_path is required: pass the root output directory")
    if variables is None:
        variables = ['distance_light']

    region_dir = os.path.join(save_path, str(region))
    pdp_dir = os.path.join(region_dir, 'pdp_csv')
    hist_dir = os.path.join(region_dir, 'hist_csv')
    # to_csv does not create parent directories.
    os.makedirs(pdp_dir, exist_ok=True)
    os.makedirs(hist_dir, exist_ok=True)

    for v in variables:
        plot_bin = V_bins[v]
        par_dep_v = par_dep(v, par_dataset, xgb_model, bins=plot_bin)
        hist_v = hist(v, par_dataset, bins=plot_bin)

        par_dep_v.to_csv(os.path.join(pdp_dir, '%s_%s.csv' % ('pdp', v)), index=False)
        hist_v.to_csv(os.path.join(hist_dir, '%s_%s.csv' % ('hist', v)), index=False)
        #print(save_path+'/%s_%s.csv'%('pdp',v))

In [45]:
def stream1(point_path,region):
    """Fit an XGBoost classifier for one region and export its PDP outputs.

    Steps:
      1. Read ``<point_path>noise_analysis_dataset/finall/<region>.xlsx``.
      2. Derive a ``month`` column from ``date``, rename the weather columns
         and drop the columns that are not used as features.
      3. Split into train/test (33 % test, fixed seed) and fit the model,
         using the test split as the evaluation set.
      4. Write the PDP/histogram CSV files below ``<point_path>plot`` and
         call ``plot_pdp`` for ``distance_light``.

    Parameters
    ----------
    point_path : str
        Directory prefix; it is concatenated directly with the sub-paths, so
        it is expected to end with a path separator.
    region : str
        Workbook name without extension and name of the output sub-folder.

    Returns
    -------
    str
        The output root ``<point_path>plot``.

    Notes
    -----
    The working directory is left untouched: the workbook is opened through
    its full path, so a relative ``point_path`` resolves the same way for
    both the input and the output locations.
    """
    dataset_dir = point_path + r'noise_analysis_dataset/finall'
    data = pd.read_excel(os.path.join(dataset_dir, '%s.xlsx' % region), index_col=0)
    data['date'] = pd.to_datetime(data['date'])
    data['month'] = data['date'].dt.month
    # ``columns=`` is required: a bare mapping is applied to the index labels
    # and would leave the column names unchanged.
    data = data.rename(columns={'2 metre temperature':'temperature','Total precipitation':'prepicipitation',
                '10 metre U wind component':'u_wind','10 metre V wind component':'v_wind'})
    data = data.drop(columns=['date','lon','lat','Open Water','Urban-Built-up','Sonw-Ice'])

    # NOTE(review): the first remaining column is used as the label and all
    # others as features - confirm against the workbook layout.
    y = data.iloc[:,0].astype('int8')
    X = data.iloc[:,1:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state = 100)

    # Build one set of bin edges per feature: 0, 20, ..., 400 for every column.
    # NOTE(review): presumably chosen for 'distance_light', the only variable
    # exported below - confirm before exporting other features.
    V_bins = {column: np.arange(0,401,20) for column in X_train.columns}

    xgb_model = XGBClassifier(
                  n_jobs=4,
                  max_depth=6,
                  learning_rate=0.1,
                  n_estimators=300,
                  verbosity=1,
                  objective='binary:logistic',
                  booster='gbtree',
                  gamma=1,
                  min_child_weight=4,
                  subsample=0.6,
                  colsample_bytree=0.6,
                  reg_alpha=1,
                  reg_lambda=10,
                  base_score=0.5,
                  eval_metric='auc',
              )
    xgb_model.fit(X_train, y_train,eval_set=[(X_test, y_test)],verbose=0)

    # ``best_score`` may be unavailable when early stopping is not used;
    # fall back to the AUC of the last boosting round on the evaluation set.
    score = getattr(xgb_model, 'best_score', None)
    if score is None:
        score = xgb_model.evals_result()['validation_0']['auc'][-1]
    print(score)

    save = point_path+'plot'
    out_pdp_hist(par_dataset=X_train,xgb_model=xgb_model,V_bins=V_bins,region=region,save_path=save,variables=['distance_light'])

    # ``plot_pdp`` is defined outside this section; it is given the same
    # directory for input and output.
    features = [['distance_light','Km']]
    for f,u in features:
        plot_pdp(path=save,file = f,unit = u,region=region,outpath=save,filename='distance_light_%s'%region)
    return save

In [None]:
for d in [5,10,20,50,100]:
    for i in range(0,50):
        stream_path = r'/home/wb/Documents/nc_noiseanalysi/noise/noise_%skm/%s/'%(d,i)
        %time stream1(stream_path,'east')

In [None]:
for d in [5,10,20,50,100]:
    for i in range(0,50):
        stream_path = r'/home/wb/Documents/nc_noiseanalysi/noise/noise_%skm/%s/'%(d,i)
        %time stream1(stream_path,'west')