In [38]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.preprocessing import StandardScaler
import joblib
import time
np.random.seed(2020)


def lgb_model_age(train_x, test_x,train_y, test_y):
    model = lgb.LGBMClassifier (objective = 'multiclass',
                                num_class = 10,
                                n_estimators = 200
                               )
#     model.fit(train_x,train_y,early_stopping_rounds=100,eval_set=eval_set=[(train_x,train_y),(test_x,test_y)],verbose = 10)
    
    
    #设定搜索的xgboost参数搜索范围，值搜索XGBoost的主要6个参数
    param_dist = {
            'learning_rate':np.linspace(0.01,0.5,20),
            'subsample':np.linspace(0.1,0.9,10),
            'colsample_bytree':np.linspace(0.1,0.9,10),
            'num_leaves' : range(32,128,6),
            'reg_alpha' : np.linspace(0,0.1,10),
            'reg_lambda' : np.linspace(0,0.1,10)
            }

    #RandomizedSearchCV参数说明，clf1设置训练的学习器
    #param_dist字典类型，放入参数搜索范围
    #scoring = 'neg_log_loss'，精度评价方式设定为“neg_log_loss“
    #n_iter=300，训练300次，数值越大，获得的参数精度越大，但是搜索时间越长
    #n_jobs = -1，使用所有的CPU进行训练，默认为1，使用1个CPU
    SearchCV = RandomizedSearchCV(model,param_dist,cv = 5,scoring = 'neg_log_loss',n_iter=10,n_jobs = -1,verbose = 10)
    
    #在训练集上训练
    SearchCV.fit(train_x,train_y)
    
    # # 模型存储
    joblib.dump(SearchCV, 'w2v_lgb_age_SearchCV.pkl')
    # # 模型预测
    y_t_pred = SearchCV.predict(test_x)

    # print(model.get_score(importance_type='weight'))
    cm = confusion_matrix(test_y, y_t_pred)
    np.set_printoptions(precision=3)                                    # 显示精度
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # 将样本矩阵转化为比率


    print('age************************************')
    print('confusion_matrix is \n {:} \n '.format(cm_normalized))
    print('test acc is \n {:} \n '.format(np.sum(test_y==y_t_pred)/len(test_y)))
    print(classification_report(test_y,y_t_pred))
    print('accuracy is %f , sen is %f,spe is %f ' % (accuracy_score(test_y, y_t_pred) * 100,
                                                     cm_normalized[0][0],cm_normalized[1][1] ))

    return accuracy_score(test_y, y_t_pred)

def lgb_model_gender(train_x, test_x,train_y, test_y):
    model = lgb.LGBMClassifier (objective = 'binary',
                                n_estimators = 100
                               )
#     model.fit(train_x,train_y,early_stopping_rounds=100,eval_set=eval_set=[(train_x,train_y),(test_x,test_y)],verbose = 10)
    
    
    #设定搜索的xgboost参数搜索范围，值搜索XGBoost的主要6个参数
    param_dist = {
            'learning_rate':np.linspace(0.01,0.5,20),
            'subsample':np.linspace(0.1,0.9,10),
            'colsample_bytree':np.linspace(0.1,0.9,10),
            'num_leaves' : range(32,128,6),
            'reg_alpha' : np.linspace(0,0.1,10),
            'reg_lambda' : np.linspace(0,0.1,10)
            }
    #RandomizedSearchCV参数说明，clf1设置训练的学习器
    #param_dist字典类型，放入参数搜索范围
    #scoring = 'neg_log_loss'，精度评价方式设定为“neg_log_loss“
    #n_iter=300，训练300次，数值越大，获得的参数精度越大，但是搜索时间越长
    #n_jobs = -1，使用所有的CPU进行训练，默认为1，使用1个CPU
    SearchCV = RandomizedSearchCV(model,param_dist,cv = 5,scoring = 'neg_log_loss',n_iter=10,n_jobs = -1,verbose = 10)
    
    #在训练集上训练
    SearchCV.fit(train_x,train_y)
    
    # # 模型存储
    joblib.dump(SearchCV, 'w2v_lgb_gender_SearchCV.pkl')
    # # 模型预测
    y_t_pred = SearchCV.predict(test_x)

    # print(model.get_score(importance_type='weight'))
    cm = confusion_matrix(test_y, y_t_pred)
    np.set_printoptions(precision=3)                                    # 显示精度
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # 将样本矩阵转化为比率

    print('gender************************************')
    print('confusion_matrix is \n {:} \n '.format(cm_normalized))
    print('test acc is \n {:} \n '.format(np.sum(test_y==y_t_pred)/len(test_y)))
    print(classification_report(test_y,y_t_pred))
    print('accuracy is %f , sen is %f,spe is %f ' % (accuracy_score(test_y, y_t_pred) * 100,
                                                     cm_normalized[0][0],cm_normalized[1][1] ))
    return accuracy_score(test_y, y_t_pred)

def load_data():
    # user
    data = pd.read_csv('w2v_feat_data/train_data.csv')
    data = data.head(200000)
    label = data[['age','gender']]
    
    data = data.drop(['user_id','age','gender'],axis = 1)
    return data,label

In [34]:
data,label = load_data()
#划分age的训练和测试数据
train_x, test_x, train_y, test_y = train_test_split(data, label, test_size=0.8,random_state=2020)

In [39]:
# star = time.time()
# acc_age = lgb_model_age(train_x, test_x,train_y['age'], test_y['age'])
# end = time.time()
# print('time spend ',end-star)

star = time.time()
acc_gender = lgb_model_gender(train_x, test_x,train_y['gender'], test_y['gender'])
end = time.time()
print('time spend ',end-star)

# print(acc_age+acc_gender)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:   37.0s remaining:    8.1s
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:   38.8s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   40.4s finished


gender************************************
confusion_matrix is 
 [[0.952 0.048]
 [0.389 0.611]] 
 
test acc is 
 0.838625 
 
              precision    recall  f1-score   support

           1       0.83      0.95      0.89      5345
           2       0.86      0.61      0.72      2655

    accuracy                           0.84      8000
   macro avg       0.85      0.78      0.80      8000
weighted avg       0.84      0.84      0.83      8000

accuracy is 83.862500 , sen is 0.951543,spe is 0.611299 
time spend  48.47770118713379


还是采用贝叶斯方法优化一下，大概取10,0000样本就可以