In [234]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, roc_auc_score, log_loss
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [235]:
# Load the three training-set tables: user master data, per-user daily
# consumption detail, and industry-level average monthly consumption.
train_user, train_day, train_month = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train/训练集_用户基本信息表.txt',
        './data/train/训练集_用户日电量明细表.txt',
        './data/train/训练集_行业户均月电量.txt',
    )
)

# Load the matching test-set tables.
test_user, test_day, test_month = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/test/测试集_用户基本信息表.txt',
        './data/test/测试集_用户日电量明细表.txt',
        './data/test/测试集_行业户均月电量.txt',
    )
)

# Stack train and test rows so feature engineering is applied to both at once.
# The original row indices are kept (no ignore_index), so index labels repeat.
df_user = pd.concat([train_user, test_user])
df_day = pd.concat([train_day, test_day])
df_month = pd.concat([train_month, test_month])

# Running list of model feature columns; later cells append to it in place.
feats = ['contract_cap', 'run_cap']

In [236]:
# Industry table: summarise avg_settle_pq for every
# (trade_code, trade_name, county_code) group and attach the statistics
# to the user table.
industry_keys = ['trade_code', 'trade_name', 'county_code']
tmp = (
    df_month.groupby(industry_keys)['avg_settle_pq']
    .agg(['mean', 'median', 'skew', 'sum'])
    .add_prefix('avg_settle_pq_')  # mean -> avg_settle_pq_mean, etc.
    .reset_index()
)
df_user = df_user.merge(tmp, on=industry_keys, how='left')

# avg_settle_pq_sum is merged into df_user but is not registered as a feature.
feats.extend(['avg_settle_pq_mean', 'avg_settle_pq_median', 'avg_settle_pq_skew'])

In [237]:
# Daily-consumption table: derive the calendar year and month of each record
# from its sum_date column.
# Parse the date column once and reuse the result; the previous version ran
# pd.to_datetime over the whole column twice.
sum_date_parsed = pd.to_datetime(df_day['sum_date'])
df_day['year'] = sum_date_parsed.dt.year
df_day['month'] = sum_date_parsed.dt.month

In [238]:
# Number of distinct (year, month) pairs in which each user has daily records.
user_month_pairs = df_day[['user_id', 'year', 'month']].drop_duplicates()
tmp_user_month_cnt = (
    user_month_pairs.groupby(['user_id'])['year']
    .count()
    .reset_index()
    .rename(columns={'year': 'month_cnt'})
)

# Per-user summary statistics over all daily consumption values.
tmp_user_month_stat = (
    df_day.groupby(['user_id'])['d_kwh_quantity']
    .agg(['mean', 'median', 'skew', 'sum'])
    .add_prefix('d_kwh_quantity_')  # mean -> d_kwh_quantity_mean, etc.
    .reset_index()
)

# Combine both summaries and derive the average consumption per active month.
tmp_user = tmp_user_month_stat.merge(tmp_user_month_cnt, on='user_id', how='left')
tmp_user['avg_month_d_kwh_quantity'] = (
    tmp_user['d_kwh_quantity_sum'] / tmp_user['month_cnt']
)
# Users whose total consumption is exactly zero are dropped here, so after the
# left merge below they end up with the same -1 fill value as users that have
# no daily records at all.
has_consumption = tmp_user['d_kwh_quantity_sum'] != 0
tmp_user = tmp_user[has_consumption]

# Attach the per-user statistics and replace every missing value in the whole
# frame with -1. This fills ALL columns, not only the new ones -- presumably
# including `flag` for test rows, which the later `flag == -1` train/test
# split appears to rely on (TODO confirm before narrowing the fill).
df_user = df_user.merge(tmp_user, on='user_id', how='left').fillna(-1)

# NOTE(review): user_id is an identifier yet is registered as a model feature
# here -- confirm this is intentional. month_cnt is computed but not registered.
feats.extend([
    'user_id',
    'd_kwh_quantity_mean',
    'd_kwh_quantity_median',
    'd_kwh_quantity_skew',
    'd_kwh_quantity_sum',
    'avg_month_d_kwh_quantity',
])

In [239]:
# Sanity check: count of missing values remaining in each df_user column.
df_user.isna().sum()

user_id                     0
county_code                 0
volt_name                   0
elec_type_name              0
status_name                 0
trade_code                  0
trade_name                  0
build_date                  0
contract_cap                0
run_cap                     0
flag                        0
avg_settle_pq_mean          0
avg_settle_pq_median        0
avg_settle_pq_skew          0
avg_settle_pq_sum           0
d_kwh_quantity_mean         0
d_kwh_quantity_median       0
d_kwh_quantity_skew         0
d_kwh_quantity_sum          0
month_cnt                   0
avg_month_d_kwh_quantity    0
dtype: int64

In [240]:
# Replace each categorical column with integer codes, fitting one encoder per
# column on the combined train + test rows, then register them as features.
cat_feats = ['county_code', 'volt_name', 'elec_type_name', 'status_name', 'trade_name']
for name in cat_feats:
    df_user[name] = LabelEncoder().fit_transform(df_user[name])
feats.extend(cat_feats)

# Rows whose flag equals the -1 sentinel form the test set; all other rows are
# training data. reset_index() keeps the previous index as an extra 'index'
# column in both frames.
is_test_row = df_user['flag'] == -1
df_train = df_user[~is_test_row].reset_index()
df_test = df_user[is_test_row].reset_index()

print(feats)

['contract_cap', 'run_cap', 'avg_settle_pq_mean', 'avg_settle_pq_median', 'avg_settle_pq_skew', 'user_id', 'd_kwh_quantity_mean', 'd_kwh_quantity_median', 'd_kwh_quantity_skew', 'd_kwh_quantity_sum', 'avg_month_d_kwh_quantity', 'county_code', 'volt_name', 'elec_type_name', 'status_name', 'trade_name']


In [218]:
# def gen_thres_new(df_train, oof_preds):
#     df_train['oof_preds'] = oof_preds
#     thres =0.1
#     # thres = df_train['flag'].mean() 
#     # thres = df_train['oof_preds'].quantile(1 - quantile_point) # 比如 0,1,1,1 mean=0.75 1-mean=0.25,也就是25%分位数取值为0

#     _thresh = []
#     for thres_item in np.arange(thres - 0.05, thres + 0.05, 0.01): #  scan +/-0.05 around the base threshold in 0.01 steps; the threshold with the highest macro-F1 is taken as the best one
#         _thresh.append(
#             [thres_item, f1_score(df_train['flag'], np.where(oof_preds > thres_item, 1, 0), average='macro')])

#     _thresh = np.array(_thresh)
#     best_id = _thresh[:, 1].argmax() # 找到f1最高对应的行
#     best_thresh = _thresh[best_id][0] # 取出最佳阈值

#     print("阈值: {}\n训练集的f1: {}".format(best_thresh, _thresh[best_id][1]))
#     return best_thresh

# lgb_thres =  gen_thres_new(df_train, oof_preds_lgb)
# print(lgb_thres)


# df_test['flag'] = np.where(test_preds_lgb>lgb_thres,1,0)
# df_test[['user_id', 'flag']].to_csv('./baseline.csv', index=False, header=['id', 'flag'])

阈值: 0.09000000000000001
训练集的f1: 0.22791725348943162
0.09000000000000001


In [242]:
# LightGBM binary-classifier configuration, evaluated with AUC.
lgb_params = dict(
    objective='binary',
    metric='auc',
    num_leaves=512,  # kept below 2 ** max_depth
    boosting_type='gbdt',
    bagging_freq=1,
    lambda_l1=0.5,
    lambda_l2=0.5,
    n_estimators=5000,
    learning_rate=0.005,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    max_depth=12,
    n_jobs=-1,
    random_state=2021,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)

# Training matrix and label.
X = df_train[feats]
y = df_train['flag']

# 5-fold stratified cross-validation. After each fold the freshly fitted model
# scores the test set, and the per-fold probability matrices are collected in
# `pred` for later averaging.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
pred = []
for fold_no, (fit_rows, val_rows) in enumerate(kf.split(X, y), start=1):
    X_train, y_train = X.iloc[fit_rows], y.iloc[fit_rows]
    X_val, y_val = X.iloc[val_rows], y.iloc[val_rows]
    # The validation fold is passed as eval_set only; no early-stopping
    # argument is given in this call.
    # NOTE(review): the `verbose` argument of fit() may not be accepted by
    # newer LightGBM releases -- confirm against the installed version.
    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        verbose=True,
    )
    # Class probabilities for the test set from this fold's model.
    test_pre = lgb_model.predict_proba(df_test[feats])
    print('第{}轮训练结束，正在保存预测数据-------------------------'.format(fold_no))
    pred.append(test_pre)

[1]	valid_0's auc: 0.829231
[2]	valid_0's auc: 0.849447
[3]	valid_0's auc: 0.853931
[4]	valid_0's auc: 0.854813
[5]	valid_0's auc: 0.855679
[6]	valid_0's auc: 0.848481
[7]	valid_0's auc: 0.848756
[8]	valid_0's auc: 0.848969
[9]	valid_0's auc: 0.845036
[10]	valid_0's auc: 0.845144
[11]	valid_0's auc: 0.85024
[12]	valid_0's auc: 0.853367
[13]	valid_0's auc: 0.851392
[14]	valid_0's auc: 0.851087
[15]	valid_0's auc: 0.855857
[16]	valid_0's auc: 0.85456
[17]	valid_0's auc: 0.86086
[18]	valid_0's auc: 0.863506
[19]	valid_0's auc: 0.87129
[20]	valid_0's auc: 0.872545
[21]	valid_0's auc: 0.871185
[22]	valid_0's auc: 0.870597
[23]	valid_0's auc: 0.868371
[24]	valid_0's auc: 0.869035
[25]	valid_0's auc: 0.868851
[26]	valid_0's auc: 0.869326
[27]	valid_0's auc: 0.872774
[28]	valid_0's auc: 0.873727
[29]	valid_0's auc: 0.873289
[30]	valid_0's auc: 0.872434
[31]	valid_0's auc: 0.872564
[32]	valid_0's auc: 0.872423
[33]	valid_0's auc: 0.87108
[34]	valid_0's auc: 0.871028
[35]	valid_0's auc: 0.87217


In [255]:
# LightGBM result: ensemble the per-fold test-set probability matrices.
# Average over however many folds were actually collected instead of
# hard-coding pred[0]..pred[4] and a divisor of 5, so the cell stays correct
# when n_splits changes. sum() adds the arrays left to right, which for five
# folds is the same arithmetic as the previous explicit expression.
new_result = sum(pred) / len(pred)

# Column 0 of predict_proba is the probability of the first class
# (presumably flag == 0 -- confirm via lgb_model.classes_). A row is labelled
# 0 when that probability exceeds 0.5 and 1 otherwise; this is done in one
# vectorised step rather than a Python loop. The result is kept as a plain
# list of ints, as before.
final_result = np.where(new_result[:, 0] > 0.5, 0, 1).tolist()

In [256]:
# Store the predicted labels on the test frame, then write the submission file
# with only the user id and the label, under the headers `id` and `flag`.
df_test['flag'] = final_result
df_test.to_csv(
    './baseline.csv',
    columns=['user_id', 'flag'],
    header=['id', 'flag'],
    index=False,
)

In [257]:
# params = {
#     'learning_rate': 0.05,
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'verbose': -1,
#     'seed': 2022,
#     'n_jobs': -1,
# }

# fold_num = 10
# seed = 2022
# kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)

# oof = np.zeros(len(df_train))
# importance = 0
# pred_y = pd.DataFrame()
# LABEL = 'flag'
# for fold, (train_idx, val_idx) in enumerate(kf.split(df_train[feats], df_train[LABEL])):
#     print('-----------', fold)
#     train = lgb.Dataset(df_train.loc[train_idx, feats],
#                         df_train.loc[train_idx, LABEL])
#     val = lgb.Dataset(df_train.loc[val_idx, feats],
#                       df_train.loc[val_idx, LABEL])
#     model = lgb.train(params, train, valid_sets=val, num_boost_round=10000,
#                       early_stopping_rounds=100, verbose_eval=200)
#     oof[val_idx] += model.predict(df_train.loc[val_idx, feats])
#     pred_y['fold_%d_seed_%d' % (fold, seed)] = model.predict(df_test[feats])
#     importance += model.feature_importance(importance_type='gain') / fold_num

# thre = 0.1
# score = f1_score(df_train[LABEL],
#                  list(map(lambda x: 1 if x > thre else 0, oof)), average='macro')
# print('\nF1... ', score)
# # print('AUC  %0.5f, LOGLOSS  %0.5f' % (
# #     roc_auc_score(df_train['flag'], oof),
# #     log_loss(df_train['flag'], oof)))

# feats_importance = pd.DataFrame()
# feats_importance['name'] = feats
# feats_importance['importance'] = importance
# print(feats_importance.sort_values('importance', ascending=False)[:10])

# pred_y = pred_y.mean(axis=1).map(lambda x: 1 if x > thre else 0)
# print(pred_y.sum())
# df_test['flag'] = pred_y
# df_test[['user_id', 'flag']].to_csv('./baseline.csv', index=False, header=['id', 'flag'])