In [25]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
import time
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


            
def property_count(data):
    """Bucket item properties into four bins by cumulative share of occurrences.

    Every ``;``-separated token in ``data['item_property_list']`` is counted.
    Properties are then walked from most to least frequent while a running
    total of occurrences is kept; the running share decides the bin:

    * ``XL_bin`` -- running share below 25 %
    * ``L_bin``  -- running share in [25 %, 50 %)
    * ``M_bin``  -- running share in [50 %, 75 %)
    * ``S_bin``  -- everything else (the long tail)

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide an iterable ``item_property_list`` entry. Each element
        is passed through ``str`` before splitting.

    Returns
    -------
    tuple of four lists of int
        ``(XL_bin, L_bin, M_bin, S_bin)``; property ids are converted with
        ``int``, so a non-numeric token raises ``ValueError``.
    """
    property_dict = defaultdict(int)
    for item in data['item_property_list']:
        for item2 in str(item).split(';'):
            property_dict[item2] += 1

    # The share must be measured against the total number of property
    # occurrences.  Dividing by the row count (as before) lets the ratio
    # exceed 1 after a handful of properties, pushing almost everything
    # into S_bin.
    total_count = sum(property_dict.values())

    # sorted() is stable, so equally frequent properties keep first-seen order.
    ranked = sorted(property_dict.items(), key=lambda pair: pair[1], reverse=True)

    XL_bin, L_bin, M_bin, S_bin = [], [], [], []
    m_sum = 0
    for key, count in ranked:
        m_sum += count
        share = m_sum / total_count
        if share < 0.25:
            XL_bin.append(int(key))
        elif share < 0.5:
            L_bin.append(int(key))
        elif share < 0.75:
            M_bin.append(int(key))
        else:
            S_bin.append(int(key))
    return XL_bin, L_bin, M_bin, S_bin
      
# data preprocess
def datapreprocess(train_file_path, test_file_path):
    """Load the train/test tables and strip the raw list-valued columns.

    Both files are read as space-separated text with a header row.  Exact
    duplicate rows are removed from the training table and its index is
    rebuilt so that row labels are contiguous again.

    Parameters
    ----------
    train_file_path, test_file_path : str or path-like
        Locations handed to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(train, instance_id, test)`` where ``train`` and ``test`` are the
        loaded frames without the columns listed in ``drop_columns`` and
        ``instance_id`` is the test table's ``instance_id`` column.
    """
    data = pd.read_csv(train_file_path, sep=' ', header=0)
    test_data = pd.read_csv(test_file_path, sep=' ', header=0)
    # drop_duplicates keeps the surviving rows' original labels; rebuild the
    # index so later positional/axis=1 operations cannot misalign on gaps.
    data = data.drop_duplicates().reset_index(drop=True)
    print('befor preprocess')
    print('train shape:', data.shape)
    print('test shape:', test_data.shape)

    def gen_click_feature(data):
        """Return ``data`` with day/hour columns and per-key row counts appended.

        Adds ``day`` and ``hour`` (parsed from ``context_timestamp``) and, for
        each of user/shop/item, a per-day and a per-day-and-hour row count.
        Not called at the moment (see the disabled calls further down).

        NOTE(review): ``time.localtime`` is used, so ``day``/``hour`` depend
        on the time zone of the machine running the notebook.
        """
        def time2str(v):
            return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(v))

        def _gen(data, group_by_cols, new_name):
            # Count rows per key combination and attach that count to each row.
            counts = data.groupby(group_by_cols).size().reset_index(name=new_name)
            return pd.merge(data, counts, how='left', on=group_by_cols)

        # add ['day', 'hour'] without mutating the caller's frame
        stamp = data['context_timestamp'].apply(time2str)
        data = data.assign(
            day=stamp.apply(lambda x: int(x[8:10])),
            hour=stamp.apply(lambda x: int(x[11:13])),
        )

        # add ['<entity>_click_day', '<entity>_click_hour'] for user, shop, item
        for key, prefix in (('user_id', 'user'), ('shop_id', 'shop'), ('item_id', 'item')):
            data = _gen(data, [key, 'day'], prefix + '_click_day')
            data = _gen(data, [key, 'day', 'hour'], prefix + '_click_hour')
        return data

    def gen_category_feature(data, test_data):
        """One-hot encode category levels 1 and 2 over train and test jointly.

        The encoders are fitted on the concatenation of both tables so that
        train and test receive identical ``cate1_*`` / ``cate2_*`` columns.
        ``is_trade`` is re-attached as the last training column.
        Not called at the moment (see the disabled calls further down).
        """
        n_train = data.shape[0]
        con_data = pd.concat([data.drop(columns=['is_trade']), test_data], ignore_index=True)

        def _category_level(value, level):
            # Missing levels are encoded as -1.
            parts = str(value).split(';')
            return int(parts[level]) if level < len(parts) else -1

        def _one_hot(codes, prefix):
            # Label-encode first so the one-hot column order follows the
            # sorted distinct category ids.
            encoded = LabelEncoder().fit_transform(codes).reshape(-1, 1)
            dense = OneHotEncoder().fit_transform(encoded).toarray()
            return pd.DataFrame(dense, columns=[prefix + str(x) for x in range(dense.shape[1])])

        cate_1 = con_data['item_category_list'].apply(lambda x: _category_level(x, 1))
        cate_2 = con_data['item_category_list'].apply(lambda x: _category_level(x, 2))
        con_data = pd.concat(
            [con_data, _one_hot(cate_1, 'cate1_'), _one_hot(cate_2, 'cate2_')], axis=1)

        # Attach the label by position: con_data was re-indexed by
        # ignore_index=True, so an index-aligned concat against the original
        # label column would misalign whenever the train index has gaps.
        train_part = con_data.iloc[:n_train].copy()
        train_part['is_trade'] = data['is_trade'].values
        test_part = con_data.iloc[n_train:]
        return train_part, test_part

    # Feature generators are currently disabled.
    #data = gen_click_feature(data)
    #test_data = gen_click_feature(test_data)
    #data, test_data = gen_category_feature(data, test_data)

    print('after preprocess')
    print('train shape:', data.shape)
    print('test shape:', test_data.shape)
    drop_columns = ['instance_id','item_category_list','item_property_list','predict_category_property']
    return data.drop(columns = drop_columns), test_data['instance_id'], test_data.drop(columns = drop_columns)


def trainAsubmission(file_path, test_path, write_path, model):
    """Fit ``model`` on the training table and write test-set scores to disk.

    Parameters
    ----------
    file_path, test_path : str or path-like
        Train and test files, forwarded to ``datapreprocess``.
    write_path : str or path-like
        Destination of the space-separated result file with the columns
        ``instance_id`` and ``predicted_score``.
    model : estimator
        Object exposing ``fit(x, y)`` and ``predict_proba(x)``; column 1 of
        ``predict_proba`` is written as the score.

    The log loss on the training rows is printed as a sanity check; nothing
    is returned.
    """
    data, instance_id, test_data = datapreprocess(file_path, test_path)

    # Pick the label by name instead of "last column": any feature step that
    # appends columns after `is_trade` would otherwise silently turn a
    # feature into the training target.
    feature_columns = [col for col in data.columns if col != 'is_trade']
    x = data[feature_columns].values
    y = data['is_trade'].values
    model.fit(x, y)

    # train loss
    y_pred = model.predict_proba(x)
    print('train loss ',log_loss(y, y_pred))

    # make_submission -- index test columns by the training feature list so
    # the model sees them in the same order (a missing column raises KeyError).
    result = pd.DataFrame()
    result['instance_id'] = instance_id
    result['predicted_score'] = model.predict_proba(test_data[feature_columns].values)[:,1]
    result.to_csv(write_path, sep=' ', index=False)
    

# def make_submission(file_path, test_path, write_path, trained_model):
#     data, instance_id, test_data = datapreprocess(file_path, test_path)
    

# data,instance_id,test_data = datapreprocess('train.txt', 'test.txt')

    

In [26]:
# Train an XGBoost classifier on train.txt and write its test.txt scores.
trainAsubmission(
    'train.txt',
    'test.txt',
    'res_xgboost_1.txt',
    XGBClassifier(silent=1, objective='binary:logistic', seed=1203),
)

befor preprocess
train shape: (478111, 27)
test shape: (18371, 26)
after preprocess
train shape: (478111, 27)
test shape: (18371, 26)
train loss  0.0891840973511807


In [17]:
# NOTE(review): none of the cells shown above binds a top-level `data` (the
# loader keeps it local), and this cell's execution count (17) is lower than
# that of the definitions cell (25) -- it appears to rely on leftover kernel
# state and would fail on Restart & Run All.  It also displays the whole
# label column rather than a summary.
data['is_trade']

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        0
13        0
14        1
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
478081    0
478082    0
478083    0
478084    0
478085    0
478086    0
478087    0
478088    0
478089    0
478090    0
478091    0
478092    0
478093    0
478094    0
478095    0
478096    0
478097    0
478098    0
478099    0
478100    0
478101    0
478102    0
478103    0
478104    0
478105    1
478106    0
478107    0
478108    0
478109    0
478110    0
Name: is_trade, Length: 478111, dtype: int64

In [None]:
# Load both raw tables, then stack them (train without its label column)
# into one frame with a fresh 0..n-1 index.
data, test_data = (
    pd.read_csv(path, sep=' ', header=0) for path in ('train.txt', 'test.txt')
)
con_data = pd.concat(
    [data.drop(columns=['is_trade']), test_data],
    ignore_index=True,
)

In [None]:
# Preview the first five rows of the combined train + test frame.
con_data.head(n=5)

In [None]:
from collections import defaultdict
import numpy as np
def category_count(data):
    """Collect the distinct category tokens seen at each depth.

    Each entry of ``data['item_category_list']`` is stringified and split on
    ``;``.  The token at position ``i`` is added to the set stored under the
    key ``'cate_<i>'``.

    Returns
    -------
    defaultdict(set)
        Maps ``'cate_0'``, ``'cate_1'``, ... to the set of string tokens
        found at that depth.
    """
    tokens_by_depth = defaultdict(set)
    for raw in data['item_category_list']:
        tokens = str(raw).split(';')
        for depth in range(len(tokens)):
            tokens_by_depth['cate_{}'.format(depth)].add(tokens[depth])
    return tokens_by_depth
            
def property_count(data):
    """Bucket item properties into four bins by cumulative share of occurrences.

    Every ``;``-separated token in ``data['item_property_list']`` is counted.
    Properties are then walked from most to least frequent while a running
    total of occurrences is kept; the running share decides the bin:

    * ``XL_bin`` -- running share below 25 %
    * ``L_bin``  -- running share in [25 %, 50 %)
    * ``M_bin``  -- running share in [50 %, 75 %)
    * ``S_bin``  -- everything else (the long tail)

    NOTE(review): a function of the same name is also defined in the first
    cell of this notebook; whichever cell runs last wins.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide an iterable ``item_property_list`` entry. Each element
        is passed through ``str`` before splitting.

    Returns
    -------
    tuple of four lists of int
        ``(XL_bin, L_bin, M_bin, S_bin)``; property ids are converted with
        ``int``, so a non-numeric token raises ``ValueError``.
    """
    property_dict = defaultdict(int)
    for item in data['item_property_list']:
        for item2 in str(item).split(';'):
            property_dict[item2] += 1

    # The share must be measured against the total number of property
    # occurrences; dividing by the row count lets the ratio exceed 1 after a
    # handful of properties, pushing almost everything into S_bin.
    total_count = sum(property_dict.values())

    # sorted() is stable, so equally frequent properties keep first-seen order.
    ranked = sorted(property_dict.items(), key=lambda pair: pair[1], reverse=True)

    XL_bin, L_bin, M_bin, S_bin = [], [], [], []
    m_sum = 0
    for key, count in ranked:
        m_sum += count
        share = m_sum / total_count
        if share < 0.25:
            XL_bin.append(int(key))
        elif share < 0.5:  # was `else if`, which is not valid Python
            L_bin.append(int(key))
        elif share < 0.75:
            M_bin.append(int(key))
        else:
            S_bin.append(int(key))
    return XL_bin, L_bin, M_bin, S_bin
      

In [None]:
# NOTE(review): `property_set` is not defined in any visible cell, so this
# raises NameError on a fresh kernel -- presumably a leftover from an earlier
# experiment; confirm what it should refer to or remove the cell.
len(property_set)