In [2]:
import pandas as pd
import numpy as np
import os
import gc
import matplotlib.pyplot as plt
from tqdm import *
#----------------核心模型----------------
from catboost import CatBoostClassifier
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
#----------------交叉验证----------------
from sklearn.model_selection import StratifiedKFold, KFold
#----------------评估指标----------------
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
#----------------忽略报警----------------
import warnings
warnings.filterwarnings('ignore')

In [4]:
# 读取训练数据和测试数据
train_data_ads = pd.read_csv('.\\Downloads\\2022_3_data\\train\\train_data_ads.csv')
train_data_feeds = pd.read_csv('.\\Downloads\\2022_3_data\\train\\train_data_feeds.csv')

test_data_ads = pd.read_csv('.\\Downloads\\2022_3_data\\test\\test_data_ads.csv')
test_data_feeds = pd.read_csv('.\\Downloads\\2022_3_data\\test\\test_data_feeds.csv')

# 合并数据
# 合并数据
train_data_ads['istest'] = 0
test_data_ads['istest'] = 1
data_ads = pd.concat([train_data_ads, test_data_ads], axis=0, ignore_index=True)

train_data_feeds['istest'] = 0
test_data_feeds['istest'] = 1
data_feeds = pd.concat([train_data_feeds, test_data_feeds], axis=0, ignore_index=True)

del train_data_ads, test_data_ads, train_data_feeds, test_data_feeds
gc.collect()

0

In [5]:
# 自然数编码
def label_encode(series, series2):
    unique = list(series.unique())
    return series2.map(dict(zip(
        unique, range(series.nunique())
    )))

for col in ['ad_click_list_v001','ad_click_list_v002','ad_click_list_v003','ad_close_list_v001','ad_close_list_v002','ad_close_list_v003','u_newsCatInterestsST']:
    data_ads[col] = label_encode(data_ads[col], data_ads[col])

In [6]:
# data_feeds特征构建
cols = [f for f in data_feeds.columns if f not in ['label','istest','u_userId']]
for col in Z(cols):
    tmp = data_feeds.groupby(['u_userId'])[col].nunique().reset_index()
    tmp.columns = ['user_id', col+'_feeds_nuni']
    data_ads = data_ads.merge(tmp, on='user_id', how='left')

cols = [f for f in data_feeds.columns if f not in ['istest','u_userId','u_newsCatInterests','u_newsCatDislike','u_newsCatInterestsST','u_click_ca2_news','i_docId','i_s_sourceId','i_entities']]
for col in tqdm(cols):
    tmp = data_feeds.groupby(['u_userId'])[col].mean().reset_index()
    tmp.columns = ['user_id', col+'_feeds_mean']
    data_ads = data_ads.merge(tmp, on='user_id', how='left')

100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [01:58<00:00,  4.54s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:43<00:00,  5.19s/it]


In [7]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
    
# 压缩使用内存
data_ads = reduce_mem_usage(data_ads)
# Mem. usage decreased to 2351.47 Mb (69.3% reduction)

Mem. usage decreased to 1303.62 Mb (76.2% reduction)


In [8]:
# 划分训练集和测试集
cols = [f for f in data_ads.columns if f not in ['label','istest']]
x_train = data_ads[data_ads.istest==0][cols]
x_test = data_ads[data_ads.istest==1][cols]

y_train = data_ads[data_ads.istest==0]['label']

del data_ads, data_feeds
gc.collect()

0

In [9]:
def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2022):
    
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} {}************************************'.format(str(i+1), str(seed)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
               
        params = {'learning_rate': 0.3, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type':'Bernoulli','random_seed':seed,
                  'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}

        model = clf(iterations=20000, **params, eval_metric='AUC')
        model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                  metric_period=200,
                  cat_features=[], 
                  use_best_model=True, 
                  verbose=1)

        val_pred  = model.predict_proba(val_x)[:,1]
        test_pred = model.predict_proba(test_x)[:,1]
            
        train[valid_index] = val_pred
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
       
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

cat_train, cat_test = cv_model(CatBoostClassifier, x_train, y_train, x_test, "cat")

************************************ 1 2022************************************




0:	test: 0.6921154	best: 0.6921154 (0)	total: 701ms	remaining: 3h 53m 44s
200:	test: 0.8046135	best: 0.8046135 (200)	total: 1m 54s	remaining: 3h 8m 4s
400:	test: 0.8081975	best: 0.8081975 (400)	total: 3m 49s	remaining: 3h 6m 55s
600:	test: 0.8096837	best: 0.8096837 (600)	total: 5m 40s	remaining: 3h 3m 7s
800:	test: 0.8102733	best: 0.8102755 (799)	total: 7m 28s	remaining: 2h 59m 5s
1000:	test: 0.8108860	best: 0.8109073 (990)	total: 9m 16s	remaining: 2h 56m 9s
1200:	test: 0.8113489	best: 0.8113677 (1175)	total: 11m 4s	remaining: 2h 53m 28s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8113677039
bestIteration = 1175

Shrink model to first 1176 iterations.
[0.8113677039422733]
************************************ 2 2022************************************




0:	test: 0.6890423	best: 0.6890423 (0)	total: 555ms	remaining: 3h 5m
200:	test: 0.8066723	best: 0.8066723 (200)	total: 1m 49s	remaining: 2h 59m 50s
400:	test: 0.8095862	best: 0.8095862 (400)	total: 3m 38s	remaining: 2h 57m 54s
600:	test: 0.8109250	best: 0.8109483 (586)	total: 5m 26s	remaining: 2h 55m 37s
800:	test: 0.8119713	best: 0.8119713 (800)	total: 7m 14s	remaining: 2h 53m 34s
1000:	test: 0.8125059	best: 0.8125135 (969)	total: 9m 2s	remaining: 2h 51m 38s
1200:	test: 0.8131917	best: 0.8132006 (1199)	total: 10m 50s	remaining: 2h 49m 42s
1400:	test: 0.8135026	best: 0.8135403 (1373)	total: 12m 38s	remaining: 2h 47m 53s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.813540322
bestIteration = 1373

Shrink model to first 1374 iterations.
[0.8113677039422733, 0.8135403220261701]
************************************ 3 2022************************************




0:	test: 0.6638754	best: 0.6638754 (0)	total: 524ms	remaining: 2h 54m 33s
200:	test: 0.8058515	best: 0.8058515 (200)	total: 1m 49s	remaining: 2h 58m 58s
400:	test: 0.8091572	best: 0.8091572 (400)	total: 3m 37s	remaining: 2h 56m 58s
600:	test: 0.8103285	best: 0.8103566 (595)	total: 5m 24s	remaining: 2h 54m 18s
800:	test: 0.8117448	best: 0.8117461 (799)	total: 7m 11s	remaining: 2h 52m 21s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8121950788
bestIteration = 898

Shrink model to first 899 iterations.
[0.8113677039422733, 0.8135403220261701, 0.8121950787651718]
************************************ 4 2022************************************




0:	test: 0.6589996	best: 0.6589996 (0)	total: 527ms	remaining: 2h 55m 33s
200:	test: 0.8054655	best: 0.8054720 (199)	total: 1m 49s	remaining: 2h 59m 34s
400:	test: 0.8087192	best: 0.8087192 (400)	total: 3m 37s	remaining: 2h 56m 57s
600:	test: 0.8102491	best: 0.8102558 (598)	total: 5m 31s	remaining: 2h 58m 6s
800:	test: 0.8111745	best: 0.8111842 (788)	total: 7m 23s	remaining: 2h 57m 7s
1000:	test: 0.8119016	best: 0.8119033 (999)	total: 9m 17s	remaining: 2h 56m 24s
1200:	test: 0.8125379	best: 0.8125515 (1194)	total: 11m 12s	remaining: 2h 55m 28s
1400:	test: 0.8128287	best: 0.8128341 (1375)	total: 13m 7s	remaining: 2h 54m 13s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8128988471
bestIteration = 1409

Shrink model to first 1410 iterations.
[0.8113677039422733, 0.8135403220261701, 0.8121950787651718, 0.8128988471259874]
************************************ 5 2022************************************




0:	test: 0.6626293	best: 0.6626293 (0)	total: 510ms	remaining: 2h 50m 8s
200:	test: 0.8055802	best: 0.8055802 (200)	total: 1m 56s	remaining: 3h 11m 40s
400:	test: 0.8091364	best: 0.8091506 (398)	total: 3m 47s	remaining: 3h 5m 40s
600:	test: 0.8104784	best: 0.8104817 (598)	total: 5m 36s	remaining: 3h 1m 14s
800:	test: 0.8114410	best: 0.8114410 (800)	total: 7m 24s	remaining: 2h 57m 42s
1000:	test: 0.8120612	best: 0.8120624 (998)	total: 9m 12s	remaining: 2h 54m 41s
1200:	test: 0.8124766	best: 0.8124959 (1169)	total: 11m	remaining: 2h 52m 15s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8124958933
bestIteration = 1169

Shrink model to first 1170 iterations.
[0.8113677039422733, 0.8135403220261701, 0.8121950787651718, 0.8128988471259874, 0.8124958932768551]
cat_score_list: [0.8113677039422733, 0.8135403220261701, 0.8121950787651718, 0.8128988471259874, 0.8124958932768551]
cat_score_mean: 0.8124995690272915
cat_score_std: 0.000723385955949271


In [10]:
x_test['pctr'] = cat_test
x_test[['log_id','pctr']].to_csv('submission.csv', index=False)