In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import gc
from collections import Counter
import copy

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [21]:
test_data = pd.read_csv("./data/test_format1.csv")
train_data = pd.read_csv("./data/train_format1.csv")
user_info = pd.read_csv("./data/user_info_format1.csv")
user_log = pd.read_csv("./data/user_log_format1.csv")

In [22]:
# 对数据内存压缩
def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage after optimation is {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    return df

In [23]:
train_data = reduce_mem_usage(train_data)
test_data = reduce_mem_usage(test_data)
user_info = reduce_mem_usage(user_info)
user_log = reduce_mem_usage(user_log)

Memory usage after optimation is 1.74 MB
Decreased by 70.8%
Memory usage after optimation is 3.49 MB
Decreased by 41.7%
Memory usage after optimation is 3.24 MB
Decreased by 66.7%
Memory usage after optimation is 890.48 MB
Decreased by 69.6%


In [24]:
all_data = train_data.append(test_data)
all_data = all_data.merge(user_info, on=['user_id'], how='left')
del train_data, test_data, user_info
gc

<module 'gc' (built-in)>

In [25]:
user_log = user_log.sort_values(['user_id', 'time_stamp'])

In [26]:
list_join_func = lambda x: " ".join([str(i) for i in x])

agg_dict = {
    'item_id': list_join_func,
    'cat_id': list_join_func,
    'seller_id': list_join_func,
    'brand_id': list_join_func,
    'time_stamp': list_join_func,
    'action_type': list_join_func
}

rename_dict = {
    'item_id': 'item_path',
    'cat_id': 'cat_path',
    'seller_id': 'seller_path',
    'brand_id': 'brand_path',
    'time_stamp': 'time_stamp_path',
    'action_type': 'action_type_path'
}

def merge_list(df_ID, join_columns, df_data, agg_dict, rename_dict):
    df_data = df_data.\
        groupby(join_columns).\
        agg(agg_dict).\
        reset_index().\
        rename(columns=rename_dict)
    df_ID = df_ID.merge(df_data, on=join_columns, how='left')
    return df_ID

print(all_data.head())
all_data = merge_list(all_data, 'user_id', user_log, agg_dict, rename_dict)
print(all_data.head())

   user_id  merchant_id  label  prob  age_range  gender
0    34176         3906    0.0   NaN        6.0     0.0
1    34176          121    0.0   NaN        6.0     0.0
2    34176         4356    1.0   NaN        6.0     0.0
3    34176         2217    0.0   NaN        6.0     0.0
4   230784         4818    0.0   NaN        0.0     0.0
   user_id  merchant_id  label  prob  age_range  gender  \
0    34176         3906    0.0   NaN        6.0     0.0   
1    34176          121    0.0   NaN        6.0     0.0   
2    34176         4356    1.0   NaN        6.0     0.0   
3    34176         2217    0.0   NaN        6.0     0.0   
4   230784         4818    0.0   NaN        0.0     0.0   

                                           item_path  \
0  581818 879005 581818 581818 1011673 52343 2773...   
1  581818 879005 581818 581818 1011673 52343 2773...   
2  581818 879005 581818 581818 1011673 52343 2773...   
3  581818 879005 581818 581818 1011673 52343 2773...   
4  191923 191923 191923 19192

In [27]:
del user_log
gc.collect()

1569

In [30]:
def cnt_(x):
    try:
        return len(x.split(' '))
    except:
        return -1

def nunique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1

def max_(x):
    try:
        return np.max([float(i) for i in x.split(' ')])
    except:
        return -1

def min_(x):
    try:
        return np.min([float(i) for i in x.split(' ')])
    except:
        return -1

def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        return -1

def most_n(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        return -1

def most_n_cnt(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except:
        return -1

def user_cnt(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(cnt_)
    return df_data

def user_unique(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(nunique_)
    return df_data

def user_max(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(max_)
    return df_data

def user_min(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(min_)
    return df_data

def user_std(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(std_)
    return df_data

def user_most_n(df_data, single_col, name, n=1):
    func = lambda x: most_n(x, n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

def user_most_n_cnt(df_data, single_col, name, n=1):
    func = lambda x: most_n_cnt(x, n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

In [35]:
all_data_test = all_data
all_data_test = user_cnt(all_data_test, 'seller_path', 'user_cnt')
all_data_test = user_unique(all_data_test, 'seller_path', 'seller_nunique')
all_data_test = user_unique(all_data_test, 'cat_path', 'cat_nunique')
all_data_test = user_unique(all_data_test, 'brand_path', 'brand_nunique')
all_data_test = user_unique(all_data_test, 'item_path', 'item_nunique')
all_data_test = user_unique(all_data_test, 'time_stamp_path', 'time_stamp_nunique')
all_data_test = user_unique(all_data_test, 'action_type_path', 'action_type_nunique')

all_data_test = user_max(all_data_test, 'action_type_path', 'time_stamp_max')
all_data_test = user_min(all_data_test, 'action_type_path', 'time_stamp_min')
all_data_test = user_std(all_data_test, 'action_type_path', 'time_stamp_std')
all_data_test['time_stamp_range'] = all_data_test['time_stamp_max'] - all_data_test['time_stamp_min']
all_data_test = user_most_n(all_data_test, 'seller_path', 'seller_most_1', 1)
all_data_test = user_most_n(all_data_test, 'cat_path', 'cat_most_1', 1)
all_data_test = user_most_n(all_data_test, 'brand_path', 'brand_most_1', 1)
all_data_test = user_most_n(all_data_test, 'action_type_path', 'action_type_1', 1)
all_data_test = user_most_n_cnt(all_data_test, 'seller_path', 'seller_most_1_cnt', 1)
all_data_test = user_most_n_cnt(all_data_test, 'cat_path', 'cat_most_1_cnt', 1)
all_data_test = user_most_n_cnt(all_data_test, 'brand_path', 'brand_most_1_cnt', 1)
all_data_test = user_most_n_cnt(all_data_test, 'action_type_path', 'action_type_1_cnt', 1)

In [36]:
def col_cnt_(df_data, columns_list, action_type):
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']

        for col in col_list:
            data_dict[col] = df_data[col].split(' ')

        path_len = len(data_dict[col])

        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)
        return len(data_out)
    except:
        return -1

def col_nuique_(df_data, columns_list, action_type):
    try:
        data_dict = {}

        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']

        for col in col_list:
            data_dict[col] = df_data[col].split(' ')

        path_len = len(data_dict[col])

        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)

        return len(set(data_out))
    except:
        return -1

def user_col_cnt(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_cnt_(x, columns_list, action_type), axis=1)
    return df_data

def user_col_nunique(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_nuique_(x, columns_list, action_type), axis=1)
    return df_data

In [37]:
# 点击次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '0', 'user_cnt_0')
# 加购次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '1', 'user_cnt_1')
# 购买次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '2', 'user_cnt_2')
# 收藏次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '3', 'user_cnt_3')

# 不同店铺个数
all_data_test = user_col_nunique(all_data_test,  ['seller_path'], '0', 'seller_nunique_0')

In [38]:
all_data_test.columns

Index(['user_id', 'merchant_id', 'label', 'prob', 'age_range', 'gender',
       'item_path', 'cat_path', 'seller_path', 'brand_path', 'time_stamp_path',
       'action_type_path', 'user_cnt', 'seller_nunique', 'cat_nunique',
       'brand_nunique', 'item_nunique', 'time_stamp_nunique',
       'action_type_nunique', 'time_stamp_max', 'time_stamp_min',
       'time_stamp_std', 'time_stamp_range', 'seller_most_1', 'cat_most_1',
       'brand_most_1', 'action_type_1', 'seller_most_1_cnt', 'cat_most_1_cnt',
       'brand_most_1_cnt', 'action_type_1_cnt', 'user_cnt_0', 'user_cnt_1',
       'user_cnt_2', 'user_cnt_3', 'seller_nunique_0'],
      dtype='object')

In [39]:
# 利用Countvector和TF-IDF提取特征
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from scipy import sparse

tfidfVec = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,
                           ngram_range=(1, 1),
                           max_features=100)

columns_list = ['seller_path']
for i, col in enumerate(columns_list):
    tfidfVec.fit(all_data_test[col])
    data_ = tfidfVec.transform(all_data_test[col])
    if i == 0:
        data_cat = data_
    else:
        data_cat = sparse.hstack((data_cat, data_))

df_tfidf = pd.DataFrame(data_cat.toarray())
df_tfidf.columns = ['tfidf_' + str(i) for i in df_tfidf.columns]
all_data_test = pd.concat([all_data_test, df_tfidf], axis=1)

In [42]:
import gensim

model = gensim.models.Word2Vec(
    all_data_test["seller_path"].apply(lambda x: x.split(' ')),
    # size = 100,
    window = 5,
    min_count = 5,
    workers = 4
)

def mean_w2v_(x, model, size=100):
    try:
        i = 0
        for word in x.split(' '):
            if word in model.wv.vocab:
                i += 1
                if i == 1:
                    vec = np.zeros(size)
                vec += model.wv[word]
        return vec / i
    except:
        return np.zeros(size)

def get_mean_w2v(df_data, columns, model, size):
    data_array = []
    for index, row in df_data.iterrows():
        w2v = mean_w2v_(row[columns], model, size)
        data_array.append(w2v)
    return pd.DataFrame(data_array)

df_embedding = get_mean_w2v(all_data_test, 'seller_path', model, 100)
df_embedding.columns = ['embedding_' + str(i) for i in df_embedding.columns]

all_data_test = pd.concat([all_data_test, df_embedding], axis=1)

In [43]:
# Stacking 分类特征
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from scipy import sparse
import xgboost
import lightgbm
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss,mean_absolute_error,mean_squared_error
from sklearn.naive_bayes import MultinomialNB,GaussianNB