In [None]:
import gc
import os
import time
import random
import pickle
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
import seaborn as sns

from tqdm import tqdm
from sklearn.metrics import *
from sklearn.model_selection import *
from matplotlib import pyplot as plt


# One shared seed for the notebook: it seeds Python's `random` module and
# NumPy's global RNG here, and is reused as `random_state` for the CV splitter.
# PYTHONHASHSEED is exported as well; an already-running interpreter does not
# re-read it, so it only takes effect in processes started after this point.
seed = 2020
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Deserialize the pickled object stored at ./input/origin_df.pkl into `df`;
# later cells slice it with .iloc and read .columns, i.e. use it as a DataFrame.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# this path must only ever point at a trusted, locally produced artifact.
origin_path = './input/origin_df.pkl'
with open(origin_path, mode='rb') as f:
    df = pickle.load(f)

In [None]:
# `df` holds both partitions: the first `n_train_rows` rows are taken as the
# training set and the last `n_test_rows` rows as the test set.
# assumes len(df) == n_train_rows + n_test_rows, so the two slices neither
# overlap nor leave a gap — TODO confirm
n_train_rows = 6355374
n_test_rows = 849077

train_df = df.iloc[:n_train_rows]
test_df = df.iloc[-n_test_rows:]

# Model inputs are all columns whose name contains the substring 'feature'.
feature_name = [col for col in df.columns if 'feature' in col]
len(feature_name)

In [None]:
# Cross-validation setup plus the containers the training cell fills in.
fold = 5
test_num = 849077
skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=seed)

n_test, n_train = len(test_df), len(train_df)

# Test-set prediction accumulators, one slot per test row, one array per model.
cat_sub = np.zeros(n_test)
xgb_sub = np.zeros(n_test)

# Out-of-fold prediction buffers, one slot per training row, one array per model.
cat_oof = np.zeros(n_train)
xgb_oof = np.zeros(n_train)

metric_list, y_pre_list = [], []
model_count = 1

In [None]:
# Stratified K-fold training of a CatBoost and an XGBoost regressor.
#
# For every fold this cell
#   * fits both models on the training part, early-stopping on the validation part,
#   * writes the validation predictions into the out-of-fold arrays
#     (`cat_oof`, `xgb_oof`) at the validation row positions,
#   * adds 1/n_folds of the test-set predictions to `cat_sub` / `xgb_sub`, so
#     after the last fold they hold the fold-averaged test predictions.
metric_list = []

# Take the fold count from the splitter itself so the averaging weight can
# never drift away from the number of splits actually produced.
n_folds = skf.get_n_splits()

# Loop-invariant inputs: integer labels for every training row and the test
# feature matrix that both models score in each fold.
labels = train_df['label'].astype(int).values
X_test = test_df[feature_name]

# Start every accumulator from zero so that re-running this cell does not stack
# a second round of predictions on top of the first.
cat_sub = np.zeros(len(test_df))
xgb_sub = np.zeros(len(test_df))
cat_oof = np.zeros(len(train_df))
xgb_oof = np.zeros(len(train_df))

for i, (train_index, valid_index) in enumerate(skf.split(train_df, labels)):
    print('Fold %s' % i)
    X_train, label_train = train_df.iloc[train_index][feature_name], labels[train_index]
    X_valid, label_valid = train_df.iloc[valid_index][feature_name], labels[valid_index]

    clf1 = cbt.CatBoostRegressor(iterations=100000, learning_rate=0.1, depth=7, loss_function='RMSE', eval_metric='RMSE')
    clf1.fit(X_train, label_train, eval_set=[(X_valid, label_valid)], early_stopping_rounds=500, verbose=1000)
    x1 = clf1.predict(X_valid)  # validation-fold predictions -> OOF
    y1 = clf1.predict(X_test)   # test-set predictions -> submission average

    # NOTE(review): passing early_stopping_rounds to fit() is deprecated since
    # XGBoost 1.6 and removed in 2.0; move it to the constructor when upgrading.
    clf2 = xgb.XGBRegressor(learning_rate=0.1, max_depth=7, subsample=0.5, colsample_bytree=0.5, n_estimators=100000, eval_metric='rmse')
    clf2.fit(X_train, label_train, eval_set=[(X_valid, label_valid)], early_stopping_rounds=500, verbose=1000)
    x2 = clf2.predict(X_valid)
    y2 = clf2.predict(X_test)

    # OOF slots must receive the *validation* predictions (x1/x2). The test
    # predictions (y1/y2) have one entry per test row, not per validation row.
    cat_oof[valid_index] = x1
    xgb_oof[valid_index] = x2

    cat_sub += y1 / n_folds
    xgb_sub += y2 / n_folds

In [None]:
# Build the submission file from an equal-weight blend of the two models.
#
# The blend uses the fold-averaged test predictions accumulated in `cat_sub`
# and `xgb_sub`, not `y1` / `y2`, which only hold the last fold's predictions.
y = (cat_sub + xgb_sub) / 2

# reset_index gives a fresh 0..n-1 index so the prediction array is attached
# purely by position.
submit = test_df[['query_id', 'doc_id']].reset_index(drop=True)
submit['predict_label'] = y
submit.columns = ['queryid', 'documentid', 'predict_label']

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs('./ensemble', exist_ok=True)
submit.to_csv('./ensemble/xgb_cat.csv', index=False)
submit.head()