In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, roc_auc_score, log_loss
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw tables for each split. The file names translate to:
#   用户基本信息表   -> user basic-information table
#   用户日电量明细表 -> per-user daily electricity detail table
#   行业户均月电量   -> per-industry average monthly electricity table
# Paths are relative to the notebook's working directory.
train_user = pd.read_csv('./data/train/训练集_用户基本信息表.txt')
train_day = pd.read_csv('./data/train/训练集_用户日电量明细表.txt')
train_month = pd.read_csv('./data/train/训练集_行业户均月电量.txt')

test_user = pd.read_csv('./data/test/测试集_用户基本信息表.txt')
test_day = pd.read_csv('./data/test/测试集_用户日电量明细表.txt')
test_month = pd.read_csv('./data/test/测试集_行业户均月电量.txt')

# Stack train on top of test so feature engineering is applied to both at once.
# ignore_index=True gives every combined frame a fresh, unique RangeIndex;
# without it the row labels of the two source files repeat, which makes any
# later label-based access (.loc, index alignment) ambiguous.
# NOTE(review): a later cell separates the splits by whether `flag` is NaN,
# so the test user table presumably has no `flag` column -- confirm.
df_user = pd.concat([train_user, test_user], ignore_index=True)
df_day = pd.concat([train_day, test_day], ignore_index=True)
df_month = pd.concat([train_month, test_month], ignore_index=True)

# Running list of model feature columns; later cells append to it in place.
feats = ['contract_cap', 'run_cap']

In [3]:
# Industry table: summarise `avg_settle_pq` for every
# (trade_code, trade_name, county_code) group and attach the statistics to
# each user row that shares those three keys.
industry_keys = ['trade_code', 'trade_name', 'county_code']

# add_prefix turns the aggregate names into avg_settle_pq_mean / _median /
# _skew / _sum before the group keys are moved back into ordinary columns.
tmp = (
    df_month.groupby(industry_keys)['avg_settle_pq']
    .agg(['mean', 'median', 'skew', 'sum'])
    .add_prefix('avg_settle_pq_')
    .reset_index()
)
industry_cols = [col for col in tmp.columns if col not in industry_keys]

# Drop any copies left by a previous run of this cell before merging.
# Otherwise a re-run would produce `_x` / `_y` suffixed columns and the
# plain column names listed in `feats` would no longer exist.
# On a first run the drop is a no-op.
df_user = pd.merge(
    df_user.drop(columns=industry_cols, errors='ignore'),
    tmp,
    on=industry_keys,
    how='left',
)

# `avg_settle_pq_sum` is merged into df_user but is not registered as a model
# feature (same as before). Extend `feats` in place and skip names that are
# already present so that re-running the cell does not duplicate features.
industry_feats = ['avg_settle_pq_mean', 'avg_settle_pq_median', 'avg_settle_pq_skew']
feats += [col for col in industry_feats if col not in feats]




In [40]:
# Daily consumption table. (The original heading read "user table", but every
# statement in this cell operates on df_day.)
# Parse `sum_date` once and derive both calendar parts from the same parsed
# Series instead of running pd.to_datetime over the whole column twice.
sum_date = pd.to_datetime(df_day['sum_date'])
df_day['year'] = sum_date.dt.year
df_day['month'] = sum_date.dt.month

# Per-user, per-month consumption total (sketch, not enabled yet):
# df_day.groupby(['user_id','year','month'])['d_kwh_quantity'].sum().reset_index()

# NOTE(review): the saved output of this cell shows `day` and `y_m` columns
# that no visible cell creates, so they presumably came from a since-deleted
# cell -- confirm with Restart Kernel -> Run All.
df_day

Unnamed: 0,user_id,sum_date,d_kwh_quantity,d_kwh_j,d_kwh_f,d_kwh_p,d_kwh_g,year,month,day,y_m
0,1000007,2020-10-01 00:00:00,7.23,0.0,5.04,0.0,2.20,2020,10,1,0 2020\n1 2020\n2 2...
1,1000009,2020-10-01 00:00:00,4.30,0.0,2.73,0.0,1.57,2020,10,1,0 2020\n1 2020\n2 2...
2,1000015,2020-10-01 00:00:00,9.07,0.0,6.41,0.0,2.65,2020,10,1,0 2020\n1 2020\n2 2...
3,1000035,2020-10-01 00:00:00,6.30,0.0,3.88,0.0,2.41,2020,10,1,0 2020\n1 2020\n2 2...
4,1000037,2020-10-01 00:00:00,5.28,,,,,2020,10,1,0 2020\n1 2020\n2 2...
...,...,...,...,...,...,...,...,...,...,...,...
1867562,2023009,2020-12-31 00:00:00,0.00,0.0,0.00,0.0,0.00,2020,12,31,0 2020\n1 2020\n2 2...
1867563,2023017,2020-12-31 00:00:00,3675.00,0.0,585.00,690.0,2400.00,2020,12,31,0 2020\n1 2020\n2 2...
1867564,2023023,2020-12-31 00:00:00,2.70,0.0,2.52,0.0,0.17,2020,12,31,0 2020\n1 2020\n2 2...
1867565,2023042,2020-12-31 00:00:00,1.56,0.0,0.92,0.0,0.64,2020,12,31,0 2020\n1 2020\n2 2...


In [None]:
# Replace each categorical user attribute with integer codes. The encoder is
# fitted on train and test rows together (df_user holds both).
cat_feats = ['county_code', 'volt_name', 'elec_type_name', 'status_name', 'trade_name']
for name in cat_feats:
    le = LabelEncoder()
    df_user[name] = le.fit_transform(df_user[name])

# Extend `feats` in place, skipping names already registered, so that
# re-running this cell does not hand duplicate columns to the model.
feats += [col for col in cat_feats if col not in feats]

# Rows with a label form the training set; rows whose `flag` is NaN form the
# test set. drop=True discards the old row labels instead of inserting them
# as a stray `index` column; the training cell relies on the fresh 0..n-1
# index for its .loc lookups with fold positions.
has_label = df_user['flag'].notna()
df_train = df_user[has_label].reset_index(drop=True)
df_test = df_user[~has_label].reset_index(drop=True)

print(feats)

In [8]:
# Stratified K-fold LightGBM training:
#   * out-of-fold (OOF) probabilities on df_train are used for scoring,
#   * the per-fold test probabilities are averaged, thresholded and written
#     to ./baseline.csv.
fold_num = 10
seed = 2022

params = {
    'learning_rate': 0.05,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'verbose': -1,
    'seed': seed,  # same value the fold splitter uses
    'n_jobs': -1,
}

kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)

LABEL = 'flag'
oof = np.zeros(len(df_train))        # OOF probability for every training row
importance = np.zeros(len(feats))    # gain importance averaged over the folds
fold_preds = {}                      # column name -> test probabilities of one fold

# kf.split yields positional indices; df_train carries a 0..n-1 index (it is
# built with reset_index), so using them with .loc selects the same rows.
for fold, (train_idx, val_idx) in enumerate(kf.split(df_train[feats], df_train[LABEL])):
    print('-----------', fold)
    train = lgb.Dataset(df_train.loc[train_idx, feats],
                        df_train.loc[train_idx, LABEL])
    val = lgb.Dataset(df_train.loc[val_idx, feats],
                      df_train.loc[val_idx, LABEL])
    # The `early_stopping_rounds` / `verbose_eval` keyword arguments were
    # removed from lgb.train in LightGBM 4.0; the callbacks below are the
    # supported way to request the same behaviour (requires LightGBM >= 3.3).
    model = lgb.train(
        params,
        train,
        valid_sets=[val],
        num_boost_round=10000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200),
        ],
    )
    # Every validation row belongs to exactly one fold, so plain assignment
    # is enough here.
    oof[val_idx] = model.predict(df_train.loc[val_idx, feats])
    fold_preds['fold_%d_seed_%d' % (fold, seed)] = model.predict(df_test[feats])
    importance += model.feature_importance(importance_type='gain') / fold_num

# Probability cut-off for the positive class.
# NOTE(review): 0.1 is a hand-picked constant, presumably lowered from 0.5
# because positives are rare -- confirm how it was chosen.
thre = 0.1
oof_label = (oof > thre).astype(int)
score = f1_score(df_train[LABEL], oof_label, average='macro')
print('\nF1... ', score)
print('AUC  %0.5f, LOGLOSS  %0.5f' % (
    roc_auc_score(df_train[LABEL], oof),
    log_loss(df_train[LABEL], oof)))

feats_importance = pd.DataFrame({'name': feats, 'importance': importance})
print(feats_importance.sort_values('importance', ascending=False)[:10])

# Average the fold probabilities per test row, then apply the same threshold.
pred_y = (pd.DataFrame(fold_preds).mean(axis=1) > thre).astype(int)
print(pred_y.sum())  # number of test rows predicted positive
df_test['flag'] = pred_y
df_test[['user_id', 'flag']].to_csv('./baseline.csv', index=False, header=['id', 'flag'])

----------- 0
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[33]	valid_0's binary_logloss: 0.0354758
----------- 1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[33]	valid_0's binary_logloss: 0.0377425
----------- 2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[15]	valid_0's binary_logloss: 0.0381263
----------- 3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[93]	valid_0's binary_logloss: 0.0295708
----------- 4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[43]	valid_0's binary_logloss: 0.0320341
----------- 5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[31]	valid_0's binary_logloss: 0.0340344
----------- 6
Training until validation scores don't improve for 100 rounds
Early stoppi