In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import time

import gc
"""
Python中，主要依靠gc（garbage collector）模块的引用计数技术来进行垃圾回收。
所谓引用计数，就是考虑到Python中变量的本质不是内存中一块存储数据的区域，而是对一块内存数据区域的引用。
所以python可以给所有的对象（内存中的区域）维护一个引用计数的属性，
在一个引用被创建或复制的时候，让python把相关对象的引用计数+1；相反当引用被销毁的时候就把相关对象的引用计数-1。
当对象的引用计数减到0时，自然就可以认为整个python中不会再有变量引用这个对象，
所以就可以把这个对象所占据的内存空间释放出来了。
"""
from collections import Counter
import copy

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# 配置路径

In [2]:
data_path = 'data/'
tmp_res_path = 'tmp_results/'

# 读取数据集

In [3]:
# 读取数据函数
def read_data(file_name):

    df = pd.read_csv(data_path + file_name)
    df.drop_duplicates(inplace=True)
    
    return df

# 对数据进行内存压缩

In [4]:
# 节约内存的一个标配函数
def reduce_memory(df, verbose=True):
    
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           100*(start_mem-end_mem)/start_mem,
                                                                                                           (time.time()-starttime)/60))
    return df

In [5]:
num_of_rows = None
"""
# train & test
test_data_path = data_path + 'test_format1.csv'
train_data_path = data_path + 'train_format1.csv'

# user_info & user_log
user_info_path = data_path + 'user_info_format1.csv'
user_log_path = data_path + 'user_log_format1.csv'
"""
# STEP1. read data from csv
# train & test
test_data = read_data(file_name='test_format1.csv')
train_data = read_data(file_name='train_format1.csv')

# user_info & user_log
user_info = read_data(file_name='user_info_format1.csv')
user_log = read_data(file_name='user_log_format1.csv')

# STEP2. reduce memory
train_data = reduce_memory(train_data)
test_data = reduce_memory(test_data)
user_info = reduce_memory(user_info)
user_log = reduce_memory(user_log)

-- Mem. usage decreased to  3.73 Mb (53.1% reduction),time spend:0.00 min
-- Mem. usage decreased to  5.49 Mb (31.2% reduction),time spend:0.00 min
-- Mem. usage decreased to  6.47 Mb (50.0% reduction),time spend:0.00 min
-- Mem. usage decreased to 981.69 Mb (60.9% reduction),time spend:0.12 min


# 数据处理

In [6]:
# 合并训练&测试数据集
df_all_data = train_data.append(test_data)
df_all_data.head()

Unnamed: 0,label,merchant_id,prob,user_id
0,0.0,3906,,34176
1,0.0,121,,34176
2,1.0,4356,,34176
3,0.0,2217,,34176
4,0.0,4818,,230784


In [7]:
# 合并用户信息&全量数据集
df_all_data = pd.merge(df_all_data, user_info, on='user_id', how='left')
df_all_data.head()

Unnamed: 0,label,merchant_id,prob,user_id,age_range,gender
0,0.0,3906,,34176,6.0,0.0
1,0.0,121,,34176,6.0,0.0
2,1.0,4356,,34176,6.0,0.0
3,0.0,2217,,34176,6.0,0.0
4,0.0,4818,,230784,0.0,0.0


In [8]:
# 清除垃圾
del train_data, test_data, user_info
gc.collect()

107

In [9]:
# 按照时间排序
user_log = user_log.sort_values(by=['user_id', 'time_stamp'])
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
23288890,1,181459,276,2245,4752.0,1009,0
23288891,1,779078,276,2245,4752.0,1009,0
23288893,1,452837,276,2245,4752.0,1009,0
23288894,1,543397,276,2245,4752.0,1009,0
23288886,1,504149,1023,925,7400.0,1011,0


In [13]:
# 对于每个用户合并字段
# cat_id, seller_id, brand_id, time_stamp, action_type
list_join_func = lambda x: " ".join([str(i) for i in x])

agg_dict = {
    'item_id': list_join_func,
    'cat_id': list_join_func,
    'seller_id': list_join_func,
    'brand_id': list_join_func, 
    'time_stamp': list_join_func, 
    'action_type': list_join_func
}

rename_dict = {
    'item_id': 'iten_path',
    'cat_id': 'cat_path',
    'seller_id': 'seller_path',
    'brand_id': 'brand_path', 
    'time_stamp': 'time_stamp_path', 
    'action_type': 'action_type_path'
}

def merga_list(df_id, join_col, df_data, agg_dict, rename_dict):
    
    df_data = df_data.groupby(join_col).agg(agg_dict).reset_index().rename(columns=rename_dict)
    df_id = pd.merge(df_id, df_data, on=join_col, how='left')
    
    return df_id
    

df_all_data = merga_list(df_all_data, 'user_id', user_log, agg_dict, rename_dict)
df_all_data.head()

Unnamed: 0,label,merchant_id,prob,user_id,age_range,gender,iten_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,0.0,3906,,34176,6.0,0.0,581818 879005 1011673 52343 277305 1011348 929...,1505 662 1505 662 1095 1505 662 1095 1505 1505...,416 3606 416 3760 3606 416 1926 3004 416 416 4...,4014.0 33.0 4014.0 3738.0 33.0 4014.0 5472.0 5...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 ...
1,0.0,121,,34176,6.0,0.0,581818 879005 1011673 52343 277305 1011348 929...,1505 662 1505 662 1095 1505 662 1095 1505 1505...,416 3606 416 3760 3606 416 1926 3004 416 416 4...,4014.0 33.0 4014.0 3738.0 33.0 4014.0 5472.0 5...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 ...
2,1.0,4356,,34176,6.0,0.0,581818 879005 1011673 52343 277305 1011348 929...,1505 662 1505 662 1095 1505 662 1095 1505 1505...,416 3606 416 3760 3606 416 1926 3004 416 416 4...,4014.0 33.0 4014.0 3738.0 33.0 4014.0 5472.0 5...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 ...
3,0.0,2217,,34176,6.0,0.0,581818 879005 1011673 52343 277305 1011348 929...,1505 662 1505 662 1095 1505 662 1095 1505 1505...,416 3606 416 3760 3606 416 1926 3004 416 416 4...,4014.0 33.0 4014.0 3738.0 33.0 4014.0 5472.0 5...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 ...
4,0.0,4818,,230784,0.0,0.0,191923 191923 964906 229470 813085 964906 2642...,1023 1023 662 664 1544 662 662 662 662 737 107...,3545 3545 4566 2537 2420 4566 2963 3931 3931 3...,5860.0 5860.0 6320.0 6064.0 8036.0 6320.0 6804...,601 601 614 614 614 618 618 618 618 622 624 62...,0 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 2 0 0 ...


In [14]:
# 删除数据并回收内存
del user_log
gc.collect()

2866

# 定义特征统计函数
## 定义统计函数

In [None]:
# total count of data
def cnt_(x):
    try:
        return len(x.split(' '))
    except:
        return -1

# number of unique element of data
def unique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1
    
# maximum of data
def max_(x):
    try:
        return np.max([float(i) for i in x.split(' ')])
    except:
        -1
        
# minimum of data
def min_(x):
    try:
        return np.min([float(i) for i in x.split(' ')])
    except:
        -1
        
# standard deviation of data
def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        -1
        
# the top k element of data
def most_n(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        -1
        
# total count of the top k element of data
def most_n_cnt(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except:
        -1


In [21]:
Counter('1 2 8 10 13 20 20'.split(' ')).most_common(3)[3-1][0]   

'2'