In [2]:
from xgboost.sklearn import XGBClassifier
# Scratch instantiation: builds an XGBClassifier with every hyperparameter spelled
# out, but the result is not bound to a name, so the only effect is the repr echoed
# as the cell output. Nothing later in the visible cells reuses this object.
XGBClassifier(base_score=0.5,booster='gbtree',colsample_bylevel=1,colsample_bynode=1,colsample_bytree=1,gamma=0,
              learning_rate=0.1,max_delta_step=0,max_depth=8,min_child_weight=1,missing=None,n_estimators=100,
              n_jobs=1,nthread=None,num_class=5,objective='multi:softprob',random_state=0,reg_alpha=0,reg_lambda=1,
             scale_pos_weight=1,seed=None,silent=None,subsample=1,verbosity=1)

XGBClassifier(max_depth=8, num_class=5, objective='multi:softprob')

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report
import joblib
from sklearn.metrics import f1_score,precision_score,recall_score


In [5]:
def get_cust_age_stage(birth_year):
    """Map each birth year to a generation label.

    Parameters
    ----------
    birth_year : iterable
        Birth years (ints, floats or numeric strings). May be a list or a
        pandas Series with any index; values are read in iteration order.

    Returns
    -------
    list of str
        One label per input value: '60前', '60后', '70后', '80后', '90后',
        '00后', or '未知' when the year is 0 or cannot be converted to int
        (None, NaN, non-numeric text).
    """
    # (exclusive upper bound, label), checked in ascending order.
    decade_labels = (
        (1960, '60前'),
        (1970, '60后'),
        (1980, '70后'),
        (1990, '80后'),
        (2000, '90后'),
    )
    age_stage = []
    # Iterate over the values themselves instead of positional indexing, so a
    # Series whose index is not 0..n-1 (e.g. after filtering) works as well.
    for raw_year in birth_year:
        try:
            year = int(raw_year)
        except (TypeError, ValueError, OverflowError):
            # Missing or unparsable year.
            age_stage.append('未知')
            continue
        if year == 0:
            # 0 is treated as the "unknown" sentinel.
            age_stage.append('未知')
            continue
        for upper_bound, label in decade_labels:
            if year < upper_bound:
                age_stage.append(label)
                break
        else:
            # No bound matched, so the year is 2000 or later.
            age_stage.append('00后')
    return age_stage
def get_top5_onehot(data):
    """Append one-hot indicator columns for the five most frequent values of column 'c'.

    The indicator columns are named after the category values themselves and
    are joined onto ``data`` by index. The joined frame is returned; ``data``
    itself is not modified.
    """
    # value_counts() sorts by descending frequency, so the first five index
    # entries are the five most common categories.
    top_categories = list(data['c'].value_counts().iloc[:5].index)
    indicator_cols = pd.get_dummies(data['c'])[top_categories]
    return data.join(indicator_cols)
            

In [8]:
def get_quantile_20_values(input_data):
    """Return the distinct 5%-step quantile values of a numeric Series.

    Parameters
    ----------
    input_data : pandas.Series
        Values to compute quantiles on.

    Returns
    -------
    list
        The quantiles at 0, 0.05, ..., 1.0 (21 points) with duplicates
        removed, in ascending quantile order.
    """
    # One vectorized call instead of growing a scratch DataFrame row by row.
    probabilities = [step / 20.0 for step in range(21)]
    cut_point = input_data.quantile(probabilities).tolist()
    # Order-preserving de-duplication; equality-based so it does not depend
    # on the values being hashable in any particular way.
    s_unique = []
    for point in cut_point:
        if point not in s_unique:
            s_unique.append(point)
    return s_unique
def get_quantile_interregional(s_unique):
    """Turn a list of cut points into numbered ``[level, lower, upper]`` intervals.

    Consecutive cut points form intervals numbered from 1. When at least one
    interval exists but fewer than 20 were produced, an extra interval with
    ``lower == upper == last cut point`` is appended with the next level number.
    Fewer than two cut points yield an empty list.
    """
    intervals = [
        [level, lower, upper]
        for level, (lower, upper) in enumerate(zip(s_unique, s_unique[1:]), start=1)
    ]
    if intervals and len(intervals) < 20:
        last_point = s_unique[-1]
        intervals.append([len(intervals) + 1, last_point, last_point])
    return intervals

In [10]:
def get_current_level(item_data,interregional):
    """Return the level of the interval that contains ``item_data``.

    Parameters
    ----------
    item_data : number
        The value to classify.
    interregional : list of [level, lower, upper]
        Intervals as produced by ``get_quantile_interregional``.

    Returns
    -------
    The ``level`` of the first matching interval, or 0 when no interval
    matches (including NaN input, for which every comparison is False).

    Notes
    -----
    Intervals are half-open ``[lower, upper)``. Two kinds of interval are
    also closed on the right so the maximum value is not left unclassified:
    the last interval in the list, and any degenerate interval with
    ``lower == upper``. The scan stops at the first match; previously a
    trailing degenerate interval overwrote every earlier match.
    """
    last_pos = len(interregional) - 1
    for pos, (level, lower, upper) in enumerate(interregional):
        if lower <= item_data < upper:
            return level
        closed_right = pos == last_pos or lower == upper
        if closed_right and item_data == upper:
            return level
    return 0

In [11]:
def get_division_level(input_data):
    """Assign every value of ``input_data`` its quantile-interval level.

    The intervals are derived from ``input_data`` itself via
    ``get_quantile_20_values`` and ``get_quantile_interregional``; each value
    is then looked up with ``get_current_level``. Returns a list with one
    level per input value, in iteration order.
    """
    intervals = get_quantile_interregional(get_quantile_20_values(input_data))
    return [get_current_level(value, intervals) for value in input_data]

In [13]:
def pre_processing(data):
    """Derive model features from the raw customer frame.

    Steps, in order:
      1. '年龄': generation label derived from '出生年月日'.
      2. '本月平均时长': '本月时长' divided by '本月次数'.
      3. 'g': difference 'a' - 'b'.
      4. Missing values: 0 for 'a', 'b', 'g', 'k'; '未知' for 'm', 'z'.
      5. One-hot indicator columns for the five most frequent values of 'c'.
      6. '<col>_TILE20' quantile-level columns for 'd', 'e', 'f'.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. Steps 1-4 write into this object in place (as before);
        steps 5-6 operate on the joined frame that is returned.

    Returns
    -------
    pandas.DataFrame
        The frame with all derived columns.
    """
    # Pass plain values so the helper does not depend on the frame's index labels.
    data['年龄'] = get_cust_age_stage(data['出生年月日'].tolist())
    # NOTE(review): rows where '本月次数' is 0 will produce inf/NaN here — confirm
    # whether that is acceptable downstream.
    data['本月平均时长'] = data['本月时长'].div(data['本月次数'], axis=0)
    data['g'] = data['a'] - data['b']

    # Column-specific fill values. The keyword is `value`; the previous
    # `values=` spelling is not accepted by DataFrame.fillna.
    fill_values = dict.fromkeys(['a', 'b', 'g', 'k'], 0)
    fill_values.update({'m': '未知', 'z': '未知'})
    data.fillna(value=fill_values, inplace=True)

    data = get_top5_onehot(data)
    for col_name in ['d', 'e', 'f']:
        data[col_name + '_TILE20'] = get_division_level(data[col_name])
    return data
    

In [15]:
def get_model_columns(input_data):
    """List the columns of ``input_data`` that are kept as model inputs.

    Every column name except 'a', 'b' and 'c' is returned, in the frame's
    original column order.
    """
    excluded = ('a', 'b', 'c')
    model_col_names = []
    for column in input_data.columns:
        if column in excluded:
            continue
        model_col_names.append(column)
    return model_col_names

In [16]:
def importance_features_top(model_str,model,x_train):
    """Print the ten most important features of a fitted model.

    Parameters
    ----------
    model_str : str
        Model label; accepted for interface compatibility and not used.
    model : object
        Fitted estimator exposing ``feature_importances_``.
    x_train : pandas.DataFrame
        Training features; its column names are paired positionally with
        the importances.

    Returns
    -------
    None. The table (columns 'a' = feature name, 'b' = importance), sorted
    by importance in descending order, is printed to stdout.
    """
    print('XGBOOST 重要指标')
    importance_col = pd.DataFrame(
        list(zip(x_train.columns, model.feature_importances_)),
        columns=['a', 'b'],
    )
    importance_col_desc = importance_col.sort_values(by='b', ascending=False)
    # A DataFrame cannot be sliced with `[:10, :]`; take the first ten rows with head().
    print(importance_col_desc.head(10))

In [18]:
def print_precision_recall_f1(y_true,y_pre):
    """Print a classification report plus macro-averaged precision, recall and F1.

    ``y_true`` are the reference labels and ``y_pre`` the predicted labels.
    The three summary scores are macro-averaged and rounded to two decimals.
    Nothing is returned.
    """
    print('精准率，召回率，F1')
    print(classification_report(y_true,y_pre))
    # Scores are evaluated in the order F1, precision, recall.
    f1, pre, rec = (
        round(metric(y_true, y_pre, average='macro'), 2)
        for metric in (f1_score, precision_score, recall_score)
    )
    print('Precision {},Recall {}, F1 : {}'.format(pre, rec, f1))

In [21]:
def xgboost_model(x_train,y_train,model_path='xgb.model'):
    """Fit an XGBoost multi-class classifier, report feature importances and save it.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; column names are used in the importance report.
    y_train : array-like
        Training labels.
    model_path : str, optional
        Where the fitted classifier is written with ``joblib.dump``.
        Defaults to 'xgb.model', the previously hard-coded location.

    Returns
    -------
    The fitted ``XGBClassifier``.

    Side effects: prints the classifier configuration and the feature
    importance table, and writes the model file to ``model_path``.
    """
    xgboost_clf = XGBClassifier(
        min_child_weight=6,
        max_depth=15,
        objective='multi:softmax',
        num_class=5,
    )
    print(xgboost_clf)
    xgboost_clf.fit(x_train, y_train)
    importance_features_top('xgboost', xgboost_clf, x_train)
    joblib.dump(xgboost_clf, model_path)
    return xgboost_clf

In [None]:
# Empty list bound to `eval_set`; nothing in the visible cells reads it yet.
# NOTE(review): presumably meant for the `eval_set` argument of XGBoost's fit() — confirm.
eval_set = []