## 工具导入

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import gc
from collections import Counter
import copy

import warnings
warnings.filterwarnings("ignore")
 
%matplotlib inline

## 数据读取

In [2]:
#读取数据集

#test_data = pd.read_csv('./data_format1/test_format1.csv')
#train_data = pd.read_csv('./data_format1/train_format1.csv')

#user_info = pd.read_csv('./data_format1/user_info_format1.csv')
#user_log = pd.read_csv('./data_format1/user_log_format1.csv')


## 数据资源查看

In [3]:
#train_data.info()

In [4]:
#test_data.info()

In [5]:
#user_info.info()

In [6]:
#user_log.info()

## 数据读取函数

In [7]:
def read_csv(file_name, num_rows):
    return pd.read_csv(file_name, nrows=num_rows)

## 内存压缩方法

In [8]:
# reduce memory
def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
                    
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage befor optimization is :{:.2f} MB'.format(start_mem))
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

## 数据进行内存压缩

In [9]:
num_rows = None
#num_rows = 2000  # 1000条测试代码使用

train_file = './data_format1/train_format1.csv'
test_file = './data_format1/test_format1.csv'

user_info_file = './data_format1/user_info_format1.csv'
user_log_file = './data_format1/user_log_format1.csv'

train_data = reduce_mem_usage(read_csv(train_file, num_rows))
test_data = reduce_mem_usage(read_csv(test_file, num_rows))

user_info = reduce_mem_usage(read_csv(user_info_file, num_rows))
user_log = reduce_mem_usage(read_csv(user_log_file, num_rows))

Memory usage befor optimization is :5.97 MB
Memory usage after optimization is: 1.74 MB
Decreased by 70.8%
Memory usage befor optimization is :5.98 MB
Memory usage after optimization is: 3.49 MB
Decreased by 41.7%
Memory usage befor optimization is :9.71 MB
Memory usage after optimization is: 3.24 MB
Decreased by 66.7%
Memory usage befor optimization is :2933.33 MB
Memory usage after optimization is: 890.48 MB
Decreased by 69.6%


In [10]:
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2660.0,829,0
1,328862,844400,1271,2882,2660.0,829,0
2,328862,575153,1271,2882,2660.0,829,0
3,328862,996875,1271,2882,2660.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int32
 1   merchant_id  260864 non-null  int16
 2   label        260864 non-null  int8 
dtypes: int16(1), int32(1), int8(1)
memory usage: 1.7 MB


In [12]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261477 entries, 0 to 261476
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      261477 non-null  int32  
 1   merchant_id  261477 non-null  int16  
 2   prob         0 non-null       float64
dtypes: float64(1), int16(1), int32(1)
memory usage: 3.5 MB


In [13]:
user_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    424170 non-null  int32  
 1   age_range  421953 non-null  float16
 2   gender     417734 non-null  float16
dtypes: float16(2), int32(1)
memory usage: 3.2 MB


In [14]:
user_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int32  
 1   item_id      int32  
 2   cat_id       int16  
 3   seller_id    int16  
 4   brand_id     float16
 5   time_stamp   int16  
 6   action_type  int8   
dtypes: float16(1), int16(3), int32(2), int8(1)
memory usage: 890.5 MB


## 数据处理

### 合并用户信息

In [15]:
all_data = train_data.append(test_data)
all_data = all_data.merge(user_info,on=['user_id'],how='left')
del train_data, test_data, user_info
gc.collect()

15

In [16]:
all_data.head()

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender
0,34176,3906,0.0,,6.0,0.0
1,34176,121,0.0,,6.0,0.0
2,34176,4356,1.0,,6.0,0.0
3,34176,2217,0.0,,6.0,0.0
4,230784,4818,0.0,,0.0,0.0


### 用户行为日志信息按时间进行排序

In [17]:
"""
按时间排序
"""
user_log = user_log.sort_values(['user_id','time_stamp'])

### 对每个用户的逐个合并所有的item_id, cat_id,seller_id,brand_id,time_stamp, action_type字段

In [18]:
"""
合并数据
"""
list_join_func = lambda x: " ".join([str(i) for i in x])


agg_dict = {
            'item_id' : list_join_func,	
            'cat_id' : list_join_func,
            'seller_id' : list_join_func,
            'brand_id' : list_join_func,
            'time_stamp' : list_join_func,
            'action_type' : list_join_func
        }

rename_dict = {
            'item_id' : 'item_path',
            'cat_id' : 'cat_path',
            'seller_id' : 'seller_path',
            'brand_id' : 'brand_path',
            'time_stamp' : 'time_stamp_path',
            'action_type' : 'action_type_path'
        }

def merge_list(df_ID, join_columns, df_data, agg_dict, rename_dict):
    df_data = df_data.\
            groupby(join_columns).\
            agg(agg_dict).\
            reset_index().\
            rename(columns=rename_dict)

    df_ID = df_ID.merge(df_data, on=join_columns, how="left")
    return df_ID

all_data = merge_list(all_data, 'user_id', user_log, agg_dict, rename_dict)


In [19]:
all_data.head()

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
1,34176,121,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
2,34176,4356,1.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
3,34176,2217,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
4,230784,4818,0.0,,0.0,0.0,191923 191923 191923 191923 964906 229470 2294...,1023 1023 1023 1023 662 664 664 1544 664 662 6...,3545 3545 3545 3545 4566 2537 2537 2420 2537 4...,5860.0 5860.0 5860.0 5860.0 6320.0 6064.0 6064...,601 601 601 601 614 614 614 614 614 614 618 61...,0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 ...


### 删除数据并回收内存

In [20]:
"""
删除不需要的数据
"""
del user_log
gc.collect()

0

## 定义数据统计函数

### 统计数据的总数

In [21]:
def cnt_(x):
    try:
        return len(x.split(' '))
    except:
        return -1

### 统计唯一数据总数

In [22]:
def nunique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1

### 统计数据最大值

In [23]:
def max_(x):
    try:
        return np.max([int(i) for i in x.split(' ')])
    except:
        return -1


### 统计数据最小值

In [24]:
def min_(x):
    try:
        return np.min([int(i) for i in x.split(' ')])
    except:
        return -1  

### 统计数据的标准差

In [25]:
def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        return -1 

### 统计数据中top N的数据

In [26]:
def most_n(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        return -1

### 统计数据中top N数据的总数

In [27]:
def most_n_cnt(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except:
        return -1   

In [28]:
###
def user_cnt(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(cnt_)
    return df_data

def user_nunique(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(nunique_)
    return df_data
    
def user_max(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(max_)
    return df_data

def user_min(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(min_)
    return df_data
    
def user_std(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(std_)
    return df_data

def user_most_n(df_data, single_col, name, n=1):
    func = lambda x: most_n(x, n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

def user_most_n_cnt(df_data, single_col, name, n=1):
    func = lambda x: most_n_cnt(x, n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data


### 提取商铺的基本统计特征

In [29]:
"""
	提取基本统计特征
"""
all_data_test = all_data.head(2000)
#all_data_test = all_data
# 统计用户 点击、浏览、加购、购买行为
# 总次数
all_data_test = user_cnt(all_data_test,  'seller_path', 'user_cnt')
# 不同店铺个数
all_data_test = user_nunique(all_data_test,  'seller_path', 'seller_nunique')
# 不同品类个数
all_data_test = user_nunique(all_data_test,  'cat_path', 'cat_nunique')
# 不同品牌个数
all_data_test = user_nunique(all_data_test,  'brand_path', 'brand_nunique')
# 不同商品个数
all_data_test = user_nunique(all_data_test,  'item_path', 'item_nunique')
# 活跃天数
all_data_test = user_nunique(all_data_test,  'time_stamp_path', 'time_stamp_nunique')
# 不同行为种数
all_data_test = user_nunique(all_data_test,  'action_type_path', 'action_type_nunique')


In [30]:
all_data_test.shape

(2000, 19)

In [31]:
# 最晚时间
all_data_test = user_max(all_data_test,  'action_type_path', 'time_stamp_max')
# 最早时间
all_data_test = user_min(all_data_test,  'action_type_path', 'time_stamp_min')
# 活跃天数方差
all_data_test = user_std(all_data_test,  'action_type_path', 'time_stamp_std')
# 最早和最晚相差天数
all_data_test['time_stamp_range'] = all_data_test['time_stamp_max'] - all_data_test['time_stamp_min']

In [32]:
all_data_test.shape

(2000, 23)

In [33]:
# 用户最喜欢的店铺
all_data_test = user_most_n(all_data_test, 'seller_path', 'seller_most_1', n=1)
# 最喜欢的类目
all_data_test = user_most_n(all_data_test, 'cat_path', 'cat_most_1', n=1)
# 最喜欢的品牌
all_data_test = user_most_n(all_data_test, 'brand_path', 'brand_most_1', n=1)
# 最常见的行为动作
all_data_test = user_most_n(all_data_test, 'action_type_path', 'action_type_1', n=1)
# .....

In [34]:
all_data_test.shape

(2000, 27)

In [35]:
# 用户最喜欢的店铺 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'seller_path', 'seller_most_1_cnt', n=1)
# 最喜欢的类目 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'cat_path', 'cat_most_1_cnt', n=1)
# 最喜欢的品牌 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'brand_path', 'brand_most_1_cnt', n=1)
# 最常见的行为动作 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'action_type_path', 'action_type_1_cnt', n=1)
# .....

In [36]:
all_data_test.shape

(2000, 31)

## 分开统计用户的点击，加购，购买，收藏特征

### 不同行为的业务函数定义

In [123]:
# 点击、加购、购买、收藏 分开统计
"""
统计基本特征函数
-- 知识点二
-- 根据不同行为的业务函数
-- 提取不同特征
"""
def col_cnt_(df_data, columns_list, action_type):
    try:
        data_dict = {}

        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']

        for col in col_list:
            data_dict[col] = df_data[col].split(' ')

        path_len = len(data_dict[col])

        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
                    data_out.append(data_txt)

        return len(data_out)
    except:
        return -1

def col_nuique_(df_data, columns_list, action_type):
    try:
        data_dict = {}

        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']

        for col in col_list:
            data_dict[col] = df_data[col].split(' ')

        path_len = len(data_dict[col])

        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
                    data_out.append(data_txt)
        return len(set(data_out))
    except:
        return -1


def user_col_cnt(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_cnt_(x, columns_list, action_type), axis=1)
    return df_data

def user_col_nunique(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_nuique_(x, columns_list, action_type), axis=1)
    return df_data

### 统计店铺被用户点击次数，加购次数，购买次数，收藏次数

In [124]:
# 点击次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '0', 'user_cnt_0')
# 加购次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '1', 'user_cnt_1')
# 购买次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '2', 'user_cnt_2')
# 收藏次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '3', 'user_cnt_3')


# # 不同店铺个数
all_data_test = user_col_nunique(all_data_test,  ['seller_path'], '0', 'seller_nunique_0')
# ....

In [125]:
all_data_test.shape

(2000, 36)

In [126]:
arr = all_data_test[all_data_test.user_id==34176].action_type_path[0].split(' ')
d = [i for i in arr if i =='3']
len(d)

7

In [128]:
all_data_test[['user_id','user_cnt_0','user_cnt_1','user_cnt_2','user_cnt_3','seller_nunique_0']]

Unnamed: 0,user_id,user_cnt_0,user_cnt_1,user_cnt_2,user_cnt_3,seller_nunique_0
0,34176,410,0,34,7,106
1,34176,410,0,34,7,106
2,34176,410,0,34,7,106
3,34176,410,0,34,7,106
4,230784,47,0,7,0,20
...,...,...,...,...,...,...
1995,220293,41,0,2,2,9
1996,155013,104,0,1,1,29
1997,24453,69,0,12,0,12
1998,155781,55,0,4,0,42


In [49]:
all_data_test.groupby('user_id').size()

array([1, 2, 3, 7, 4, 5], dtype=int64)

## 组合特征

### 特征组合进行业务特征提取

In [40]:
# 点击次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path', 'item_path'], '0', 'user_cnt_0')

# 不同店铺个数
all_data_test = user_col_nunique(all_data_test,  ['seller_path', 'item_path'], '0', 'seller_nunique_0')
# ....

In [41]:
all_data_test.shape

(2000, 36)

### 查看提取的特征

## 利用countvector，tfidf提取特征

In [42]:
"""
-- 知识点四
-- 利用countvector，tfidf提取特征
"""
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from scipy import sparse
# cntVec = CountVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1, 1), max_features=100)
tfidfVec = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1, 1), max_features=100)


# columns_list = ['seller_path', 'cat_path', 'brand_path', 'action_type_path', 'item_path', 'time_stamp_path']
columns_list = ['seller_path']
for i, col in enumerate(columns_list):
	tfidfVec.fit(all_data_test[col])
	data_ = tfidfVec.transform(all_data_test[col])
	if i == 0:
		data_cat = data_
	else:
		data_cat = sparse.hstack((data_cat, data_))

### 特征重命名 特征合并

In [43]:
df_tfidf = pd.DataFrame(data_cat.toarray())
df_tfidf.columns = ['tfidf_' + str(i) for i in df_tfidf.columns]
all_data_test = pd.concat([all_data_test, df_tfidf],axis=1)

In [44]:
all_data_test.shape

(2000, 136)

## embeeding特征

In [45]:
import gensim

# Train Word2Vec model

model = gensim.models.Word2Vec(all_data_test['seller_path'].apply(lambda x: x.split(' ')), size=100, window=5, min_count=5, workers=4)
# model.save("product2vec.model")
# model = gensim.models.Word2Vec.load("product2vec.model")

def mean_w2v_(x, model, size=100):
    try:
        i = 0
        for word in x.split(' '):
            if word in model.wv.vocab:
                i += 1
                if i == 1:
                    vec = np.zeros(size)
                vec += model.wv[word]
        return vec / i 
    except:
        return  np.zeros(size)


def get_mean_w2v(df_data, columns, model, size):
    data_array = []
    for index, row in df_data.iterrows():
        w2v = mean_w2v_(row[columns], model, size)
        data_array.append(w2v)
    return pd.DataFrame(data_array)

df_embeeding = get_mean_w2v(all_data_test, 'seller_path', model, 100)
df_embeeding.columns = ['embeeding_' + str(i) for i in df_embeeding.columns]

In [46]:
df_embeeding.head()

Unnamed: 0,embeeding_0,embeeding_1,embeeding_2,embeeding_3,embeeding_4,embeeding_5,embeeding_6,embeeding_7,embeeding_8,embeeding_9,...,embeeding_90,embeeding_91,embeeding_92,embeeding_93,embeeding_94,embeeding_95,embeeding_96,embeeding_97,embeeding_98,embeeding_99
0,-0.81163,-0.576515,-0.256304,-0.303521,-0.006271,-0.373185,-0.095445,-1.24977,-0.131301,-0.37822,...,0.151749,-0.345193,-0.033182,-0.014218,0.026865,-0.533239,0.231678,-0.427409,0.412302,0.85033
1,-0.81163,-0.576515,-0.256304,-0.303521,-0.006271,-0.373185,-0.095445,-1.24977,-0.131301,-0.37822,...,0.151749,-0.345193,-0.033182,-0.014218,0.026865,-0.533239,0.231678,-0.427409,0.412302,0.85033
2,-0.81163,-0.576515,-0.256304,-0.303521,-0.006271,-0.373185,-0.095445,-1.24977,-0.131301,-0.37822,...,0.151749,-0.345193,-0.033182,-0.014218,0.026865,-0.533239,0.231678,-0.427409,0.412302,0.85033
3,-0.81163,-0.576515,-0.256304,-0.303521,-0.006271,-0.373185,-0.095445,-1.24977,-0.131301,-0.37822,...,0.151749,-0.345193,-0.033182,-0.014218,0.026865,-0.533239,0.231678,-0.427409,0.412302,0.85033
4,-0.779944,-0.573124,-0.140103,-0.231829,-0.515864,0.142165,0.080954,-0.489231,-0.203817,-0.228702,...,0.475462,-0.644581,-0.573855,-0.128846,-0.287453,-0.151249,0.713761,-0.563686,-0.011947,0.135482


In [47]:
# model.wv.vocab

### embeeding特征和原始特征合并

In [48]:
all_data_test.shape

(2000, 136)

In [49]:
all_data_test = pd.concat([all_data_test, df_embeeding],axis=1)

In [50]:
all_data_test.shape

(2000, 236)

## stacking特征

In [51]:
"""
-- 知识点六
-- stacking特征
"""
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from scipy import sparse
import xgboost
import lightgbm
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss,mean_absolute_error,mean_squared_error
from sklearn.naive_bayes import MultinomialNB,GaussianNB

### stacking 回归特征

In [52]:
"""
-- 回归
-- stacking 回归特征
"""
def stacking_reg(clf,train_x,train_y,test_x,clf_name,kf,label_split=None):
    train=np.zeros((train_x.shape[0],1))
    test=np.zeros((test_x.shape[0],1))
    test_pre=np.empty((folds,test_x.shape[0],1))
    cv_scores=[]
    for i,(train_index,test_index) in enumerate(kf.split(train_x,label_split)):       
        tr_x=train_x[train_index]
        tr_y=train_y[train_index]
        te_x=train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ["rf","ada","gb","et","lr"]:
            clf.fit(tr_x,tr_y)
            pre=clf.predict(te_x).reshape(-1,1)
            train[test_index]=pre
            test_pre[i,:]=clf.predict(test_x).reshape(-1,1)
            cv_scores.append(mean_squared_error(te_y, pre))
        elif clf_name in ["xgb"]:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, label=te_y, missing=-1)
            params = {'booster': 'gbtree',
                      'eval_metric': 'rmse',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12
                      }
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round,evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre= model.predict(test_matrix,ntree_limit=model.best_ntree_limit).reshape(-1,1)
                train[test_index]=pre
                test_pre[i, :]= model.predict(z, ntree_limit=model.best_ntree_limit).reshape(-1,1)
                cv_scores.append(mean_squared_error(te_y, pre))

        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                      'boosting_type': 'gbdt',
                      'objective': 'regression_l2',
                      'metric': 'mse',
                      'min_child_weight': 1.5,
                      'num_leaves': 2**5,
                      'lambda_l2': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'learning_rate': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12,
                      'silent': True,
                      }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params, train_matrix,num_round,valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre= model.predict(te_x,num_iteration=model.best_iteration).reshape(-1,1)
                train[test_index]=pre
                test_pre[i, :]= model.predict(test_x, num_iteration=model.best_iteration).reshape(-1,1)
                cv_scores.append(mean_squared_error(te_y, pre))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:"%clf_name,cv_scores)
    test[:]=test_pre.mean(axis=0)
    print("%s_score_list:"%clf_name,cv_scores)
    print("%s_score_mean:"%clf_name,np.mean(cv_scores))
    return train.reshape(-1,1),test.reshape(-1,1)

def rf_reg(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestRegressor(n_estimators=600, max_depth=20, n_jobs=-1, random_state=2017, max_features="auto",verbose=1)
    rf_train, rf_test = stacking_reg(randomforest, x_train, y_train, x_valid, "rf", kf, label_split=label_split)
    return rf_train, rf_test,"rf_reg"

def ada_reg(x_train, y_train, x_valid, kf, label_split=None):
    adaboost = AdaBoostRegressor(n_estimators=30, random_state=2017, learning_rate=0.01)
    ada_train, ada_test = stacking_reg(adaboost, x_train, y_train, x_valid, "ada", kf, label_split=label_split)
    return ada_train, ada_test,"ada_reg"

def gb_reg(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingRegressor(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017,max_depth=5,verbose=1)
    gbdt_train, gbdt_test = stacking_reg(gbdt, x_train, y_train, x_valid, "gb", kf, label_split=label_split)
    return gbdt_train, gbdt_test,"gb_reg"

def et_reg(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesRegressor(n_estimators=600, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017,verbose=1)
    et_train, et_test = stacking_reg(extratree, x_train, y_train, x_valid, "et", kf, label_split=label_split)
    return et_train, et_test,"et_reg"

def lr_reg(x_train, y_train, x_valid, kf, label_split=None):
    lr_reg=LinearRegression(n_jobs=-1)
    lr_train, lr_test = stacking_reg(lr_reg, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return lr_train, lr_test, "lr_reg"

def xgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_reg(xgboost, x_train, y_train, x_valid, "xgb", kf, label_split=label_split)
    return xgb_train, xgb_test,"xgb_reg"

def lgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    lgb_train, lgb_test = stacking_reg(lightgbm, x_train, y_train, x_valid, "lgb", kf, label_split=label_split)
    return lgb_train, lgb_test,"lgb_reg"

### stacking 分类特征

In [53]:
"""
-- 分类
-- stacking 分类特征
"""
def stacking_clf(clf,train_x,train_y,test_x,clf_name,kf,label_split=None):
    train=np.zeros((train_x.shape[0],1))
    test=np.zeros((test_x.shape[0],1))
    test_pre=np.empty((folds,test_x.shape[0],1))
    cv_scores=[]
    for i,(train_index,test_index) in enumerate(kf.split(train_x,label_split)):       
        tr_x=train_x[train_index]
        tr_y=train_y[train_index]
        te_x=train_x[test_index]
        te_y = train_y[test_index]

        if clf_name in ["rf","ada","gb","et","lr","knn","gnb"]:
            clf.fit(tr_x,tr_y)
            pre=clf.predict_proba(te_x)
            
            train[test_index]=pre[:,0].reshape(-1,1)
            test_pre[i,:]=clf.predict_proba(test_x)[:,0].reshape(-1,1)
            
            cv_scores.append(log_loss(te_y, pre[:,0].reshape(-1,1)))
        elif clf_name in ["xgb"]:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, missing=-1)
            params = {'booster': 'gbtree',
                      'objective': 'multi:softprob',
                      'eval_metric': 'mlogloss',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      "num_class": 2
                      }

            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round,evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre= model.predict(test_matrix,ntree_limit=model.best_ntree_limit)
                train[test_index]=pre[:,0].reshape(-1,1)
                test_pre[i, :]= model.predict(z, ntree_limit=model.best_ntree_limit)[:,0].reshape(-1,1)
                cv_scores.append(log_loss(te_y, pre[:,0].reshape(-1,1)))
        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                      'boosting_type': 'gbdt',
                      #'boosting_type': 'dart',
                      'objective': 'multiclass',
                      'metric': 'multi_logloss',
                      'min_child_weight': 1.5,
                      'num_leaves': 2**5,
                      'lambda_l2': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'learning_rate': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      "num_class": 2,
                      'silent': True,
                      }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params, train_matrix,num_round,valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre= model.predict(te_x,num_iteration=model.best_iteration)
                print('AAAAAA',test_x.shape)
                train[test_index]=pre[:,0].reshape(-1,1)
                test_pre[i, :]= model.predict(test_x, num_iteration=model.best_iteration)[:,0].reshape(-1,1)
                cv_scores.append(log_loss(te_y, pre[:,0].reshape(-1,1)))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:"%clf_name,cv_scores)
    test[:]=test_pre.mean(axis=0)
    print("%s_score_list:"%clf_name,cv_scores)
    print("%s_score_mean:"%clf_name,np.mean(cv_scores))
    return train.reshape(-1,1),test.reshape(-1,1)

def rf_clf(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestClassifier(n_estimators=1200, max_depth=20, n_jobs=-1, random_state=2017, max_features="auto",verbose=1)
    rf_train, rf_test = stacking_clf(randomforest, x_train, y_train, x_valid, "rf", kf, label_split=label_split)
    return rf_train, rf_test,"rf"

def ada_clf(x_train, y_train, x_valid, kf, label_split=None):
    adaboost = AdaBoostClassifier(n_estimators=50, random_state=2017, learning_rate=0.01)
    ada_train, ada_test = stacking_clf(adaboost, x_train, y_train, x_valid, "ada", kf, label_split=label_split)
    return ada_train, ada_test,"ada"

def gb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingClassifier(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017,max_depth=5,verbose=1)
    gbdt_train, gbdt_test = stacking_clf(gbdt, x_train, y_train, x_valid, "gb", kf, label_split=label_split)
    return gbdt_train, gbdt_test,"gb"

def et_clf(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesClassifier(n_estimators=1200, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017,verbose=1)
    et_train, et_test = stacking_clf(extratree, x_train, y_train, x_valid, "et", kf, label_split=label_split)
    return et_train, et_test,"et"

def xgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_clf(xgboost, x_train, y_train, x_valid, "xgb", kf, label_split=label_split)
    return xgb_train, xgb_test,"xgb"

def lgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_clf(lightgbm, x_train, y_train, x_valid, "lgb", kf, label_split=label_split)
    return xgb_train, xgb_test,"lgb"

def gnb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gnb=GaussianNB()
    gnb_train, gnb_test = stacking_clf(gnb, x_train, y_train, x_valid, "gnb", kf, label_split=label_split)
    return gnb_train, gnb_test,"gnb"

def lr_clf(x_train, y_train, x_valid, kf, label_split=None):
    logisticregression=LogisticRegression(n_jobs=-1,random_state=2017,C=0.1,max_iter=200)
    lr_train, lr_test = stacking_clf(logisticregression, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return lr_train, lr_test, "lr"

def knn_clf(x_train, y_train, x_valid, kf, label_split=None):
    kneighbors=KNeighborsClassifier(n_neighbors=200,n_jobs=-1)
    knn_train, knn_test = stacking_clf(kneighbors, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return knn_train, knn_test, "knn"

### 获取训练和验证数据(为stacking特征做准备)

In [57]:
all_data_test.shape

(2000, 236)

In [64]:
all_data_test[features_columns].shape,len(features_columns),all_data_test.shape

((2000, 228), 228, (2000, 236))

In [65]:
len(all_data_test.columns)

236

In [66]:
features_columns = [c for c in all_data_test.columns if c not in ['label', 'prob', 'seller_path', 'cat_path', 'brand_path', 'action_type_path', 'item_path', 'time_stamp_path']]
x_train = all_data_test[~all_data_test['label'].isna()][features_columns].values
y_train = all_data_test[~all_data_test['label'].isna()]['label'].values
x_valid = all_data_test[all_data_test['label'].isna()][features_columns].values

### 处理函数值inf以及nan情况

In [67]:
def get_matrix(data):
    where_are_nan = np.isnan(data)
    where_are_inf = np.isinf(data)
    data[where_are_nan] = 0
    data[where_are_inf] = 0
    return data

In [68]:
x_train = np.float_(get_matrix(np.float_(x_train)))
y_train = np.int_(y_train)
x_valid = x_train

In [69]:
x_train.shape

(2000, 228)

### 导入划分数据函数 设stacking特征为5折

In [70]:
from sklearn.model_selection import StratifiedKFold, KFold
import lightgbm,xgboost


folds = 5
seed = 1
kf = KFold(n_splits=5, shuffle=True, random_state=0)

### 使用lgb和xgb分类模型构造stacking特征

In [71]:
# clf_list = [lgb_clf, xgb_clf, lgb_reg, xgb_reg]
# clf_list_col = ['lgb_clf', 'xgb_clf', 'lgb_reg', 'xgb_reg']

clf_list = [lgb_clf, xgb_clf]
clf_list_col = ['lgb_clf', 'xgb_clf']

### 训练模型，获取stacking特征

In [72]:
x_train.shape,y_train.shape,x_valid.shape

((2000, 228), (2000,), (2000, 228))

In [73]:
clf_list = clf_list
column_list = []
train_data_list=[]
test_data_list=[]
for clf in clf_list:
    train_data,test_data,clf_name=clf(x_train, y_train, x_valid, kf, label_split=None)
    train_data_list.append(train_data)
    test_data_list.append(test_data)
train_stacking = np.concatenate(train_data_list, axis=1)
test_stacking = np.concatenate(test_data_list, axis=1)

[1]	valid_0's multi_logloss: 0.240319
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.240122
[3]	valid_0's multi_logloss: 0.239833
[4]	valid_0's multi_logloss: 0.239505
[5]	valid_0's multi_logloss: 0.239508
[6]	valid_0's multi_logloss: 0.239234
[7]	valid_0's multi_logloss: 0.23894
[8]	valid_0's multi_logloss: 0.238671
[9]	valid_0's multi_logloss: 0.238576
[10]	valid_0's multi_logloss: 0.238459
[11]	valid_0's multi_logloss: 0.238496
[12]	valid_0's multi_logloss: 0.238343
[13]	valid_0's multi_logloss: 0.238338
[14]	valid_0's multi_logloss: 0.23838
[15]	valid_0's multi_logloss: 0.238456
[16]	valid_0's multi_logloss: 0.23872
[17]	valid_0's multi_logloss: 0.238728
[18]	valid_0's multi_logloss: 0.238966
[19]	valid_0's multi_logloss: 0.239273
[20]	valid_0's multi_logloss: 0.239236
[21]	valid_0's multi_logloss: 0.239341
[22]	valid_0's multi_logloss: 0.239368
[23]	valid_0's multi_logloss: 0.239362
[24]	valid_0's multi_logloss: 0.239557
[25]	valid_0's

[98]	valid_0's multi_logloss: 0.317971
[99]	valid_0's multi_logloss: 0.318236
[100]	valid_0's multi_logloss: 0.3186
[101]	valid_0's multi_logloss: 0.318946
Early stopping, best iteration is:
[1]	valid_0's multi_logloss: 0.281961
AAAAAA (2000, 228)
lgb now score is: [2.6300821660643603, 2.589485985152662]
[1]	valid_0's multi_logloss: 0.253852
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.253789
[3]	valid_0's multi_logloss: 0.253905
[4]	valid_0's multi_logloss: 0.254008
[5]	valid_0's multi_logloss: 0.253926
[6]	valid_0's multi_logloss: 0.254072
[7]	valid_0's multi_logloss: 0.254134
[8]	valid_0's multi_logloss: 0.254065
[9]	valid_0's multi_logloss: 0.253962
[10]	valid_0's multi_logloss: 0.253941
[11]	valid_0's multi_logloss: 0.254117
[12]	valid_0's multi_logloss: 0.254331
[13]	valid_0's multi_logloss: 0.254395
[14]	valid_0's multi_logloss: 0.254408
[15]	valid_0's multi_logloss: 0.254388
[16]	valid_0's multi_logloss: 0.25451
[17]	valid_0's mul

lgb now score is: [2.6300821660643603, 2.589485985152662, 2.5884166050143556, 2.5602029903385746]
[1]	valid_0's multi_logloss: 0.213981
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.21363
[3]	valid_0's multi_logloss: 0.213469
[4]	valid_0's multi_logloss: 0.213072
[5]	valid_0's multi_logloss: 0.21282
[6]	valid_0's multi_logloss: 0.21238
[7]	valid_0's multi_logloss: 0.21212
[8]	valid_0's multi_logloss: 0.211931
[9]	valid_0's multi_logloss: 0.211733
[10]	valid_0's multi_logloss: 0.211496
[11]	valid_0's multi_logloss: 0.211131
[12]	valid_0's multi_logloss: 0.210951
[13]	valid_0's multi_logloss: 0.211148
[14]	valid_0's multi_logloss: 0.21122
[15]	valid_0's multi_logloss: 0.211105
[16]	valid_0's multi_logloss: 0.211112
[17]	valid_0's multi_logloss: 0.211177
[18]	valid_0's multi_logloss: 0.211093
[19]	valid_0's multi_logloss: 0.211231
[20]	valid_0's multi_logloss: 0.211362
[21]	valid_0's multi_logloss: 0.211314
[22]	valid_0's multi_logloss: 0.211

[64]	train-mlogloss:0.23167	eval-mlogloss:0.26242
[65]	train-mlogloss:0.22967	eval-mlogloss:0.26134
[66]	train-mlogloss:0.22812	eval-mlogloss:0.26029
[67]	train-mlogloss:0.22635	eval-mlogloss:0.25960
[68]	train-mlogloss:0.22458	eval-mlogloss:0.25900
[69]	train-mlogloss:0.22299	eval-mlogloss:0.25815
[70]	train-mlogloss:0.22120	eval-mlogloss:0.25724
[71]	train-mlogloss:0.21974	eval-mlogloss:0.25638
[72]	train-mlogloss:0.21819	eval-mlogloss:0.25561
[73]	train-mlogloss:0.21667	eval-mlogloss:0.25488
[74]	train-mlogloss:0.21517	eval-mlogloss:0.25410
[75]	train-mlogloss:0.21380	eval-mlogloss:0.25343
[76]	train-mlogloss:0.21239	eval-mlogloss:0.25286
[77]	train-mlogloss:0.21126	eval-mlogloss:0.25230
[78]	train-mlogloss:0.20988	eval-mlogloss:0.25182
[79]	train-mlogloss:0.20878	eval-mlogloss:0.25131
[80]	train-mlogloss:0.20767	eval-mlogloss:0.25060
[81]	train-mlogloss:0.20650	eval-mlogloss:0.25013
[82]	train-mlogloss:0.20544	eval-mlogloss:0.24964
[83]	train-mlogloss:0.20425	eval-mlogloss:0.24919


[13]	train-mlogloss:0.46440	eval-mlogloss:0.48307
[14]	train-mlogloss:0.45346	eval-mlogloss:0.47330
[15]	train-mlogloss:0.44303	eval-mlogloss:0.46425
[16]	train-mlogloss:0.43307	eval-mlogloss:0.45518
[17]	train-mlogloss:0.42345	eval-mlogloss:0.44666
[18]	train-mlogloss:0.41418	eval-mlogloss:0.43869
[19]	train-mlogloss:0.40532	eval-mlogloss:0.43080
[20]	train-mlogloss:0.39686	eval-mlogloss:0.42342
[21]	train-mlogloss:0.38877	eval-mlogloss:0.41651
[22]	train-mlogloss:0.38125	eval-mlogloss:0.40980
[23]	train-mlogloss:0.37395	eval-mlogloss:0.40337
[24]	train-mlogloss:0.36702	eval-mlogloss:0.39733
[25]	train-mlogloss:0.36025	eval-mlogloss:0.39155
[26]	train-mlogloss:0.35367	eval-mlogloss:0.38595
[27]	train-mlogloss:0.34747	eval-mlogloss:0.38087
[28]	train-mlogloss:0.34134	eval-mlogloss:0.37584
[29]	train-mlogloss:0.33556	eval-mlogloss:0.37116
[30]	train-mlogloss:0.32976	eval-mlogloss:0.36650
[31]	train-mlogloss:0.32430	eval-mlogloss:0.36229
[32]	train-mlogloss:0.31913	eval-mlogloss:0.35803


[176]	train-mlogloss:0.12603	eval-mlogloss:0.30919
[177]	train-mlogloss:0.12568	eval-mlogloss:0.30918
[178]	train-mlogloss:0.12518	eval-mlogloss:0.30946
[179]	train-mlogloss:0.12480	eval-mlogloss:0.30981
[180]	train-mlogloss:0.12444	eval-mlogloss:0.30999
[181]	train-mlogloss:0.12401	eval-mlogloss:0.31006
[182]	train-mlogloss:0.12372	eval-mlogloss:0.31039
[183]	train-mlogloss:0.12339	eval-mlogloss:0.31088
[184]	train-mlogloss:0.12297	eval-mlogloss:0.31089
Stopping. Best iteration:
[84]	train-mlogloss:0.19214	eval-mlogloss:0.28870

xgb now score is: [2.4266696114931254, 2.207174352668226]
[0]	train-mlogloss:0.67046	eval-mlogloss:0.67131
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 100 rounds.
[1]	train-mlogloss:0.64928	eval-mlogloss:0.65085
[2]	train-mlogloss:0.62932	eval-mlogloss:0.63174
[3]	train-mlogloss:0.61055	eval-mlogloss:0.61357
[4]	train-mlogloss:0.59260	eval-mlogloss:0.59651
[5]	train

[149]	train-mlogloss:0.14872	eval-mlogloss:0.26301
[150]	train-mlogloss:0.14797	eval-mlogloss:0.26341
[151]	train-mlogloss:0.14739	eval-mlogloss:0.26370
[152]	train-mlogloss:0.14687	eval-mlogloss:0.26386
[153]	train-mlogloss:0.14633	eval-mlogloss:0.26410
[154]	train-mlogloss:0.14580	eval-mlogloss:0.26393
[155]	train-mlogloss:0.14529	eval-mlogloss:0.26380
[156]	train-mlogloss:0.14464	eval-mlogloss:0.26376
[157]	train-mlogloss:0.14410	eval-mlogloss:0.26376
[158]	train-mlogloss:0.14349	eval-mlogloss:0.26398
[159]	train-mlogloss:0.14292	eval-mlogloss:0.26409
[160]	train-mlogloss:0.14245	eval-mlogloss:0.26407
[161]	train-mlogloss:0.14200	eval-mlogloss:0.26424
[162]	train-mlogloss:0.14143	eval-mlogloss:0.26427
[163]	train-mlogloss:0.14101	eval-mlogloss:0.26435
[164]	train-mlogloss:0.14048	eval-mlogloss:0.26444
[165]	train-mlogloss:0.13985	eval-mlogloss:0.26454
[166]	train-mlogloss:0.13931	eval-mlogloss:0.26477
[167]	train-mlogloss:0.13892	eval-mlogloss:0.26472
[168]	train-mlogloss:0.13845	ev

[98]	train-mlogloss:0.19624	eval-mlogloss:0.21167
[99]	train-mlogloss:0.19523	eval-mlogloss:0.21124
[100]	train-mlogloss:0.19439	eval-mlogloss:0.21092
[101]	train-mlogloss:0.19342	eval-mlogloss:0.21074
[102]	train-mlogloss:0.19250	eval-mlogloss:0.21062
[103]	train-mlogloss:0.19152	eval-mlogloss:0.21022
[104]	train-mlogloss:0.19040	eval-mlogloss:0.20992
[105]	train-mlogloss:0.18952	eval-mlogloss:0.20985
[106]	train-mlogloss:0.18861	eval-mlogloss:0.20945
[107]	train-mlogloss:0.18769	eval-mlogloss:0.20930
[108]	train-mlogloss:0.18684	eval-mlogloss:0.20908
[109]	train-mlogloss:0.18588	eval-mlogloss:0.20904
[110]	train-mlogloss:0.18503	eval-mlogloss:0.20884
[111]	train-mlogloss:0.18394	eval-mlogloss:0.20854
[112]	train-mlogloss:0.18301	eval-mlogloss:0.20843
[113]	train-mlogloss:0.18224	eval-mlogloss:0.20836
[114]	train-mlogloss:0.18145	eval-mlogloss:0.20796
[115]	train-mlogloss:0.18050	eval-mlogloss:0.20766
[116]	train-mlogloss:0.17985	eval-mlogloss:0.20746
[117]	train-mlogloss:0.17902	eval

[12]	train-mlogloss:0.48161	eval-mlogloss:0.48024
[13]	train-mlogloss:0.47056	eval-mlogloss:0.46910
[14]	train-mlogloss:0.45989	eval-mlogloss:0.45821
[15]	train-mlogloss:0.44969	eval-mlogloss:0.44814
[16]	train-mlogloss:0.43988	eval-mlogloss:0.43837
[17]	train-mlogloss:0.43052	eval-mlogloss:0.42910
[18]	train-mlogloss:0.42156	eval-mlogloss:0.42020
[19]	train-mlogloss:0.41305	eval-mlogloss:0.41174
[20]	train-mlogloss:0.40485	eval-mlogloss:0.40373
[21]	train-mlogloss:0.39698	eval-mlogloss:0.39576
[22]	train-mlogloss:0.38947	eval-mlogloss:0.38809
[23]	train-mlogloss:0.38232	eval-mlogloss:0.38081
[24]	train-mlogloss:0.37543	eval-mlogloss:0.37406
[25]	train-mlogloss:0.36868	eval-mlogloss:0.36762
[26]	train-mlogloss:0.36239	eval-mlogloss:0.36141
[27]	train-mlogloss:0.35632	eval-mlogloss:0.35523
[28]	train-mlogloss:0.35045	eval-mlogloss:0.34968
[29]	train-mlogloss:0.34489	eval-mlogloss:0.34417
[30]	train-mlogloss:0.33956	eval-mlogloss:0.33891
[31]	train-mlogloss:0.33441	eval-mlogloss:0.33369


[175]	train-mlogloss:0.14060	eval-mlogloss:0.21745
[176]	train-mlogloss:0.14017	eval-mlogloss:0.21770
[177]	train-mlogloss:0.13973	eval-mlogloss:0.21778
[178]	train-mlogloss:0.13938	eval-mlogloss:0.21789
[179]	train-mlogloss:0.13902	eval-mlogloss:0.21808
[180]	train-mlogloss:0.13846	eval-mlogloss:0.21802
[181]	train-mlogloss:0.13795	eval-mlogloss:0.21814
[182]	train-mlogloss:0.13750	eval-mlogloss:0.21828
[183]	train-mlogloss:0.13699	eval-mlogloss:0.21859
[184]	train-mlogloss:0.13654	eval-mlogloss:0.21859
[185]	train-mlogloss:0.13600	eval-mlogloss:0.21843
[186]	train-mlogloss:0.13551	eval-mlogloss:0.21854
[187]	train-mlogloss:0.13511	eval-mlogloss:0.21848
[188]	train-mlogloss:0.13462	eval-mlogloss:0.21846
[189]	train-mlogloss:0.13429	eval-mlogloss:0.21849
[190]	train-mlogloss:0.13391	eval-mlogloss:0.21851
[191]	train-mlogloss:0.13350	eval-mlogloss:0.21853
[192]	train-mlogloss:0.13309	eval-mlogloss:0.21847
[193]	train-mlogloss:0.13275	eval-mlogloss:0.21842
[194]	train-mlogloss:0.13237	ev

### 原始特征和stacking特征合并

In [88]:
# # 合并所有特征
train = pd.DataFrame(np.concatenate([x_train, train_stacking], axis=1))
test = np.concatenate([x_valid, test_stacking], axis=1)

In [89]:
train.shape,test.shape,train_stacking.shape

((2000, 230), (2000, 230), (2000, 2))

In [90]:
all_data_test.shape

(2000, 236)

### 特征重命名

In [91]:
len(features_columns),clf_list_col,len(features_columns + clf_list_col)

(228, ['lgb_clf', 'xgb_clf'], 230)

In [92]:
df_train_all = pd.DataFrame(train)
df_train_all.columns = features_columns + clf_list_col
df_test_all = pd.DataFrame(test)
df_test_all.columns = features_columns + clf_list_col

## 获取数据ID以及特征标签LABEL

In [94]:
df_train_all['user_id'] = all_data_test[~all_data_test['label'].isna()]['user_id']
df_test_all['user_id'] = all_data_test[all_data_test['label'].isna()]['user_id']
df_train_all['label'] = all_data_test[~all_data_test['label'].isna()]['label']

In [96]:
all_data_test[all_data_test['label'].isna()]

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,...,embeeding_90,embeeding_91,embeeding_92,embeeding_93,embeeding_94,embeeding_95,embeeding_96,embeeding_97,embeeding_98,embeeding_99


## 训练数据和测试数据保存

In [82]:
df_train_all.to_csv('train_all.csv',header=True,index=False)
df_test_all.to_csv('test_all.csv',header=True,index=False)