In [None]:
import gc
import os
import time
import random
import pickle
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
import seaborn as sns

from tqdm import tqdm
from sklearn.metrics import *
from sklearn.model_selection import *
from matplotlib import pyplot as plt


# Single seed shared by every random source used in this notebook
# (Python's `random`, NumPy's legacy global RNG, and the CV splitter below).
seed = 2020
random.seed(seed)
np.random.seed(seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only influences child processes -- confirm.
os.environ['PYTHONHASHSEED'] = '%d' % seed

In [None]:
# Load the pre-built frame that holds both the training and the test rows.
# NOTE(review): pickle executes arbitrary code on load -- only use with
# files produced by a trusted pipeline.
with open('./input/origin_df.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [None]:
# Show how many rows carry each distinct value of the `label` column.
df.loc[:, 'label'].value_counts()

In [None]:
# Positional split: the first 6,355,374 rows are used for training and the
# last 849,077 rows are used as the test set.
train_df = df.head(6355374)
test_df = df.tail(849077)

# Model inputs are all columns whose name contains the substring 'feature'.
feature_name = [col for col in df.columns if 'feature' in col]
len(feature_name)

In [None]:
# Stratified K-fold training of a LightGBM regressor.
#
# For every fold this cell:
#   * trains on the fold's training rows with early stopping on the
#     validation rows,
#   * adds the test-set prediction (divided by the fold count) to `sub`,
#   * stores the validation prediction in `oof_pred`,
#   * records the validation MSE in `metric_list`,
#   * rewrites the three pickle files so that the files on disk always
#     reflect the folds finished so far.
fold = 5
test_num = 849077
model_count = 4  # suffix used in the output pickle file names

# The hyper-parameters are the same for every fold, so they are built once.
# feature_fraction = bagging_fraction = 1 means no column/row sub-sampling.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mse'},
    'num_threads': 60,
    'num_leaves': 511,
    'num_iterations': 40000,
    'learning_rate': 0.1,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 1,
    'verbose': 1
}

# Create the output directory up front so a long training run cannot fail
# at the very first checkpoint because the folder is missing.
os.makedirs('./model/prob', exist_ok=True)

# `fold` drives both the number of splits and the averaging divisor, so the
# two can never get out of sync.
skf = StratifiedKFold(n_splits=fold, random_state=seed, shuffle=True)
sub = np.zeros((test_df.shape[0],))        # fold-averaged test prediction
oof_pred = np.zeros((train_df.shape[0],))  # out-of-fold prediction per training row
metric_list = []                           # validation MSE of each finished fold
y_pre_list = []                            # raw test prediction of each finished fold

for i, (train_index, valid_index) in enumerate(skf.split(train_df, train_df['label'])):

    x_train = train_df.iloc[train_index][feature_name]
    y_train = train_df.iloc[train_index]['label']
    x_valid = train_df.iloc[valid_index][feature_name]
    y_valid = train_df.iloc[valid_index]['label']

    lgb_train = lgb.Dataset(x_train, label=y_train)
    lgb_valid = lgb.Dataset(x_valid, label=y_valid)

    # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keyword
    # arguments are not accepted by lgb.train in LightGBM >= 4.0 (callbacks
    # replace them) -- confirm the pinned LightGBM version before upgrading.
    # A shallow copy is passed so every fold starts from the same settings.
    model = lgb.train(dict(params), lgb_train, valid_sets=[lgb_train, lgb_valid], verbose_eval=500, early_stopping_rounds=1000)

    # Test prediction at the best iteration; each fold contributes 1/fold.
    y_test_pred = model.predict(test_df[feature_name], num_iteration=model.best_iteration)
    sub += y_test_pred / fold
    y_pre_list.append(y_test_pred)

    # Out-of-fold prediction and its MSE on the held-out rows.
    y_valid_pred = model.predict(x_valid, num_iteration=model.best_iteration)
    oof_pred[valid_index] = y_valid_pred
    metric_result = mean_squared_error(y_valid, y_valid_pred)
    metric_list.append(metric_result)

    # Checkpoint after every fold (the files are overwritten each time).
    with open('./model/prob/lgb_score_%s.pkl' % model_count, 'wb') as f:
        pickle.dump(metric_list, f)

    with open('./model/prob/lgb_result_%s.pkl' % model_count, 'wb') as f:
        pickle.dump(y_pre_list, f)

    with open('./model/prob/lgb_oof_%s.pkl' % model_count, 'wb') as f:
        pickle.dump(oof_pred, f)

In [None]:
# Display the per-fold validation MSE values collected by the training loop.
metric_list

In [None]:
# Build the submission table: the two id columns of the test rows (renamed
# to the names used in the output file) plus the fold-averaged prediction.
submit = (
    test_df[['query_id', 'doc_id']]
    .reset_index(drop=True)
    .rename(columns={'query_id': 'queryid', 'doc_id': 'documentid'})
    .assign(predict_label=sub)
)
submit.to_csv('./ensemble/lgb.csv', index=False)
submit