In [171]:
import import_ipynb
from sklearn.preprocessing import Imputer, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.utils import shuffle
import numpy as np
import itertools
import pandas as pd
import collections
from scipy import sparse
import time
from scipy.sparse import csr_matrix, save_npz, load_npz, hstack
import warnings
import lightgbm as lgb

from loadData import raw_df, test_df
from Util import getAllTypesofCategory, getAllTypesOfProperty

warnings.filterwarnings('ignore')

In [2]:
# Stack the labelled rows (with the 'is_trade' target removed) on top of the
# test rows so that every transform below is applied to both sets at once.
# reset_index(drop=True) rebuilds the index as 0..n-1 over the combined frame.
final_df = pd.concat([raw_df.drop(['is_trade'], axis = 1), test_df], axis = 0).reset_index(drop=True)

## Impute missing values (encoded as -1) with the most-frequent strategy

In [3]:
# Columns in which the value -1 is treated as "missing" by the imputer below.
contain_missing_value_columns = ["item_brand_id","item_city_id","item_sales_level","user_gender_id","user_age_level",
                                "user_occupation_id","user_star_level","shop_review_positive_rate",
                                "shop_score_service","shop_score_delivery","shop_score_description"]
# Replace every -1 with the most frequent value of its column (axis=0 = per column).
# The imputer is fit on the combined train+test frame.
# NOTE(review): `Imputer` is the legacy scikit-learn API (superseded by
# sklearn.impute.SimpleImputer) -- confirm the pinned scikit-learn version still ships it.
data_imputer = Imputer(missing_values=-1 , strategy='most_frequent', axis=0)
final_df[contain_missing_value_columns] = data_imputer.fit_transform(final_df[contain_missing_value_columns])

## Processing predict_category_property

In [4]:
def generate_predict_category_property_feature(df):
    """Parse ``predict_category_property`` into two list-valued columns.

    Each cell is expected to be a string of ``;``-separated segments of the
    form ``category:prop,prop,...``.  Segments without a ``:`` (for example a
    bare ``-1``) are ignored.

    The frame is modified in place; two columns are added:

    * ``predict_categories`` -- list of int category ids, in order of
      first appearance in the string.
    * ``predict_properties`` -- de-duplicated list of int property ids over
      all segments (built through a ``set``, so the order is not meaningful).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``predict_category_property``.

    Returns
    -------
    None
    """
    def process_func(item):
        """Return ``{category_id: [property_id, ...]}`` for one raw string."""
        result = {}
        for segment in item.split(';'):
            if ':' not in segment:
                continue
            fields = segment.split(':')
            # Always materialise a list: a bare ``map`` object is a lazy,
            # single-use iterator in Python 3 and would be empty on any
            # second traversal of the parsed result.
            result[int(fields[0])] = [int(prop) for prop in fields[1].split(',')]
        return result

    # Parse every string once and derive both columns from the same result.
    parsed = df['predict_category_property'].apply(process_func)
    df['predict_categories'] = parsed.apply(lambda mapping: list(mapping.keys()))
    df['predict_properties'] = parsed.apply(
        lambda mapping: list(set(itertools.chain.from_iterable(mapping.values()))))
# Adds the 'predict_categories' and 'predict_properties' columns to final_df in place.
generate_predict_category_property_feature(final_df)

In [5]:
# Collect every distinct predicted category id and fit a label encoder on them.
tmp = final_df['predict_categories'].values
all_categories = list(set(itertools.chain.from_iterable(tmp)))
category_label_encode = LabelEncoder()
category_label_encode.fit(all_categories)
# Integer label of each entry of all_categories, in the same order.
all_categories_labels = category_label_encode.transform(all_categories)

In [6]:
# Manually overwrite the predicted categories of four specific rows.
# NOTE(review): presumably these rows have no parsed categories -- confirm,
# and consider selecting them by condition instead of hard-coded labels.
#
# * `.at` writes straight into final_df; the previous chained form
#   (`final_df[col][row] = ...`) assigns through an intermediate Series and
#   is not guaranteed to reach the frame.
# * The id is written as an int because category_label_encode was fit on the
#   integer ids produced by generate_predict_category_property_feature.
for row_label in (160943, 205272, 214168, 314685):
    final_df.at[row_label, 'predict_categories'] = [7908382889764677758]

In [7]:
# Replace each list of predicted category ids by the array of its encoder labels.
final_df['predict_categories'] = final_df['predict_categories'].apply(category_label_encode.transform)

In [8]:
# Split the ';'-separated category path of each item into a list of ids.
# The ids are converted to int because category_label_encode (applied to this
# column next) was fit on integer ids; feeding it strings would rely on
# implicit str/int coercion inside NumPy / scikit-learn.
final_df['item_category_list'] = final_df['item_category_list'].apply(
    lambda item: [int(category_id) for category_id in item.split(';')])

In [9]:
# Encode each item's category list with the encoder fit on the predicted categories.
final_df['item_category_list'] = final_df['item_category_list'].apply(category_label_encode.transform)

In [10]:
# Spread the first three entries of each encoded list over scalar columns.
# Lists shorter than the requested position are padded with 0.
# (A -1-padded variant for predict_properties was disabled here; those
# columns are built in a later cell.)
for i in range(3):
    # Bind the current position as a default argument of the picker.
    pick_slot = lambda seq, slot=i: seq[slot] if len(seq) > slot else 0
    final_df['predict_categories_%d' % i] = final_df['predict_categories'].apply(pick_slot)
    final_df['item_category_%d' % i] = final_df['item_category_list'].apply(pick_slot)

In [11]:
# Split the ';'-separated property string of every item into a list of ids.
extract_item_property_list_func = lambda item: list(item.split(';'))
final_df['item_property_list_array'] = final_df['item_property_list'].apply(extract_item_property_list_func)

# Count how often each property id occurs over all items.
arrays = list(final_df['item_property_list_array'])
count = collections.Counter(itertools.chain.from_iterable(arrays))
# Keep the 1499 most frequent ids; every other id is mapped to the shared
# placeholder 'o', giving at most 1500 classes.  The comprehension (unlike
# indexing into zip(*...)) also works when the counter is empty.
most_common_property_id_list = [property_id for property_id, _freq in count.most_common(1499)]
most_common_property_id_list.append('o')

le = LabelEncoder()
le.fit(most_common_property_id_list)
most_common_property_id_num_list = le.transform(most_common_property_id_list)

# Replace infrequent ids by 'o' and label-encode each list in one pass.
# A set gives O(1) membership tests; the previous row-by-row loop used chained
# indexing plus a list scan per property and required a 0..n-1 index.
frequent_property_ids = set(most_common_property_id_list)
final_df['item_property_list_array'] = final_df['item_property_list_array'].apply(
    lambda ids: le.transform([_id if _id in frequent_property_ids else 'o' for _id in ids]))

In [12]:
# Expose the first five encoded property labels of each item as scalar
# columns; items with fewer properties are padded with 0.
for i in range(5):
    pick_property = lambda labels, slot=i: labels[slot] if len(labels) > slot else 0
    final_df['item_properties_%d' % i] = final_df['item_property_list_array'].apply(pick_property)

In [13]:
# Collect every distinct predicted property id and fit a label encoder on them.
tmp = final_df['predict_properties'].values
all_properties = list(set(itertools.chain.from_iterable(tmp)))
properties_label_encode = LabelEncoder()
properties_label_encode.fit(all_properties)
# Integer label of each entry of all_properties, in the same order.
all_properties_labels = properties_label_encode.transform(all_properties)

In [14]:
# Replace each list of predicted property ids by the array of its encoder labels.
final_df['predict_properties'] = final_df['predict_properties'].apply(properties_label_encode.transform)

In [15]:
# Expose the first three encoded predicted-property labels as scalar columns;
# shorter lists are padded with 0.
for i in range(3):
    pick_predicted = lambda labels, slot=i: labels[slot] if len(labels) > slot else 0
    final_df['predict_properties_%d' % i] = final_df['predict_properties'].apply(pick_predicted)

In [207]:
# 'item_category_list', predict_categories
def calc_predict_category_accuracy(row_item):
    """Score how well the predicted categories cover the item's categories.

    Counts the entries of ``row_item.item_category_list`` that also occur in
    ``row_item.predict_categories`` and divides that count by the number of
    predicted categories.

    Parameters
    ----------
    row_item : object
        Anything exposing ``item_category_list`` and ``predict_categories``
        as sized containers (lists or 1-D arrays), e.g. a DataFrame row.

    Returns
    -------
    float
        The ratio rounded to two decimals, or ``0.0`` when there are no
        predicted categories (previously a ZeroDivisionError).
    """
    predict_categories_list = row_item.predict_categories
    total_predicted = len(predict_categories_list)
    # len() rather than truthiness: the container may be a NumPy array.
    if total_predicted == 0:
        return 0.0
    num = sum(1 for category in row_item.item_category_list
              if category in predict_categories_list)
    return round(num / total_predicted, 2)
# Row-wise score: share of predicted categories matched by the item's own categories.
category_pair_df = final_df[['item_category_list', 'predict_categories']]
final_df['predict_category_accuracy'] = category_pair_df.apply(calc_predict_category_accuracy, axis=1)

In [None]:
# # 'item_property_list_array', predict_properties
# def calc_predict_category_accuracy(row_item):
#     num = 0
#     predict_properties_list = row_item.predict_categories
#     for i in row_item.item_category_list:
#         if i in predict_categories_list:
#             num += 1
#     return round(num/len(predict_categories_list), 2)
# final_df['predict_category_accuracy'] = final_df[['item_category_list', 'predict_categories']].apply((lambda item: calc_predict_category_accuracy(item)), axis = 1)

In [16]:
# need_onehot_columns = ['item_brand_id','item_city_id','user_gender_id', 'predict_categories_0',
#        'item_category_0', 'predict_categories_1', 'item_category_1',
#        'predict_categories_2', 'item_category_2', 'item_properties_0',
#        'item_properties_1', 'item_properties_2', 'item_properties_3',
#        'item_properties_4', 'predict_properties_0', 'predict_properties_1',
#        'predict_properties_2']

In [17]:
# category_label_oh_enc = OneHotEncoder()
# category_label_oh_enc.fit(all_categories_labels.reshape(-1,1))

In [18]:
# predict_categories_df = final_df['predict_categories'].apply(lambda item: np.sum(category_label_oh_enc.transform(item.reshape(-1,1)),axis=0))

In [19]:
# save_npz('../total_data/predict_categories.npz',  csr_matrix(np.concatenate(predict_categories_df.values).reshape(len(predict_categories_df),-1)))

## processing the object column

In [20]:
# need_onehot_columns = ['item_brand_id','item_city_id','user_gender_id','item_one_level_category', 'item_second_level_category']
# special_onehot_columns = ['item_property_list_array', 'item_third_level_category']
# other_columns = ['context_timestamp','predict_category_property','is_trade']
# need_scale_columns = ['item_price_level','item_sales_level','item_collected_level','item_pv_level',
#                      'user_age_level','user_occupation_id','user_star_level','context_page_id',
#                      'shop_review_num_level','shop_review_positive_rate','shop_star_level','shop_score_service',
#                     'shop_score_description', 'day', 'hour']

In [21]:
# enc = OneHotEncoder()
# enc.fit(most_common_property_id_num_list.reshape(-1,1))

In [22]:
# _tmp_df = raw_df['item_property_list_array'].apply(lambda item: np.sum(enc.transform(item.reshape(-1,1)),axis=0).reshape(1,-1)[0])
# save_npz('../total_data/item_property.npz', csr_matrix(np.concatenate(_tmp_df.values).reshape(len(_tmp_df),-1)))

In [209]:
# Keep the row identifiers aside; they are not a model feature.
instance_id_df = final_df['instance_id']
# Drop identifiers, raw strings / list-valued columns and the three id columns
# that are one-hot encoded separately further down.
_non_feature_columns = [
    'instance_id', 'item_category_list', 'item_brand_id', 'item_city_id', 'user_gender_id',
    'item_property_list', 'context_timestamp', 'time', 'predict_properties',
    'predict_categories', 'predict_category_property', 'item_property_list_array',
]
data_df = final_df.drop(_non_feature_columns, axis=1)

In [210]:
# standard level data
# Reduce each level/id column modulo k*1000, where k is the column's 1-based
# position in the list below.
# NOTE(review): presumably the raw codes carry a per-column offset of k*1000 -- confirm.
_level_columns = ['user_age_level', 'user_occupation_id', 'user_star_level', 'context_page_id', 'shop_star_level']
for position, column in enumerate(_level_columns, start=1):
    modulus = position * 1000
    data_df[column] = data_df[column].apply(lambda item: item % modulus)

### One-hot encode item_brand_id, item_city_id, user_gender_id

In [211]:
# Build the sparse design matrix: one-hot blocks for the three id columns,
# followed by the remaining columns of data_df.
# Resulting column order (same as before):
# [user_gender_id | item_city_id | item_brand_id | data_df].
encoded_blocks = []
for column in ['item_brand_id','item_city_id','user_gender_id']:
    lb = LabelEncoder()
    ohenc = OneHotEncoder()
    # Map raw ids to 0..k-1 first, then one-hot encode the resulting labels.
    labels = lb.fit_transform(final_df[column]).reshape(-1,1)
    # Prepend, so that later columns end up further to the left.
    encoded_blocks.insert(0, ohenc.fit_transform(labels))
# `.values` replaces the deprecated DataFrame.as_matrix(); a single hstack
# avoids re-copying the growing matrix on every iteration.
result_data = hstack(encoded_blocks + [csr_matrix(data_df.values)])

# Split data into train, validation and test sets, then shuffle the training set

In [212]:
# Exclusive end position of each day range, taken as (largest index label + 1).
# This equals a row position only because the index is 0..n-1 and, presumably,
# the rows are ordered by day -- TODO confirm.
train_index = max(data_df[data_df.day < 24].index) + 1
valid_index = max(data_df[data_df.day == 24].index) + 1
test_index = max(data_df[data_df.day == 25].index) + 1

In [213]:
# CSR supports efficient row selection.
_data = result_data.tocsr()

# final_df was built as [raw_df rows ; test_df rows], so the first
# len(raw_df) rows of _data line up positionally with raw_df.
# Features and labels are selected with the SAME row positions, so they stay
# aligned even if raw_df is not sorted by day (previously X was sliced by
# position while Y was selected by a day mask).
n_labelled = len(raw_df)
train_rows = np.flatnonzero((raw_df.day < 24).values)
valid_rows = np.flatnonzero((raw_df.day == 24).values)

train_X, valid_X, test_X = _data[train_rows, :], _data[valid_rows, :], _data[n_labelled:, :]
# `.values` replaces the deprecated Series.as_matrix().
is_trade = raw_df['is_trade'].values
train_Y, valid_Y  = is_trade[train_rows], is_trade[valid_rows]

In [214]:
# Shuffle the training rows and their labels together; the fixed
# random_state makes the resulting order reproducible.
train_X, train_Y = shuffle(train_X, train_Y, random_state=0)

#  LGBM

In [176]:
# param_test7 = {
# 'n_estimators':[20]
# }
# clf = GridSearchCV(lgb.LGBMClassifier(objective='binary',
#      num_leaves=64,
#      learning_rate=0.01,
#      n_estimators=2000,
#      max_depth=7,
#      min_samples_split=100),
#      param_test7,
#      verbose=1,
#      cv=5,
#      scoring='log_loss',
#      n_jobs=4)
# clf.fit(train_X,train_Y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   11.4s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.01, max_depth=7, min_child_samples=20,
        min_child_weight=0.001, min_samples_split=100, min_split_gain=0.0,
        n_estimators=2000, n_jobs=-1, num_leaves=64, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=1),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [20]}, pre_dispatch='2*n_jobs',
       refit=True, scoring='log_loss', verbose=1)

In [215]:
# gbm = lgb.LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
#          learning_rate=0.01, max_depth=7, min_child_samples=20,
#          min_child_weight=0.001, min_samples_split=100, min_split_gain=0.0,
#          n_estimators=2000, n_jobs=-1, num_leaves=64, objective='binary',
#          random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
#          subsample=1.0, subsample_for_bin=200000, subsample_freq=1)
# evals_result = {}
# gbm.fit(train_X, train_Y,
# #         feature_name = list(trainX.columns.values),
#         eval_set=[(valid_X, valid_Y)],
#         eval_metric='binary_logloss',
#         early_stopping_rounds=100
#        )

[1]	valid_0's binary_logloss: 0.683887
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.674811
[3]	valid_0's binary_logloss: 0.665912
[4]	valid_0's binary_logloss: 0.657187
[5]	valid_0's binary_logloss: 0.648629
[6]	valid_0's binary_logloss: 0.640234
[7]	valid_0's binary_logloss: 0.631998
[8]	valid_0's binary_logloss: 0.623917
[9]	valid_0's binary_logloss: 0.615985
[10]	valid_0's binary_logloss: 0.608202
[11]	valid_0's binary_logloss: 0.60056
[12]	valid_0's binary_logloss: 0.593057
[13]	valid_0's binary_logloss: 0.585688
[14]	valid_0's binary_logloss: 0.578453
[15]	valid_0's binary_logloss: 0.571344
[16]	valid_0's binary_logloss: 0.564364
[17]	valid_0's binary_logloss: 0.557503
[18]	valid_0's binary_logloss: 0.550765
[19]	valid_0's binary_logloss: 0.544143
[20]	valid_0's binary_logloss: 0.537631
[21]	valid_0's binary_logloss: 0.531234
[22]	valid_0's binary_logloss: 0.524943
[23]	valid_0's binary_logloss: 0.51876
[24]	valid_0's binary_loglos

[203]	valid_0's binary_logloss: 0.124781
[204]	valid_0's binary_logloss: 0.124232
[205]	valid_0's binary_logloss: 0.12369
[206]	valid_0's binary_logloss: 0.123153
[207]	valid_0's binary_logloss: 0.122624
[208]	valid_0's binary_logloss: 0.122101
[209]	valid_0's binary_logloss: 0.121586
[210]	valid_0's binary_logloss: 0.121076
[211]	valid_0's binary_logloss: 0.120573
[212]	valid_0's binary_logloss: 0.120077
[213]	valid_0's binary_logloss: 0.119586
[214]	valid_0's binary_logloss: 0.119101
[215]	valid_0's binary_logloss: 0.118621
[216]	valid_0's binary_logloss: 0.118147
[217]	valid_0's binary_logloss: 0.117679
[218]	valid_0's binary_logloss: 0.117217
[219]	valid_0's binary_logloss: 0.116761
[220]	valid_0's binary_logloss: 0.11631
[221]	valid_0's binary_logloss: 0.115864
[222]	valid_0's binary_logloss: 0.115424
[223]	valid_0's binary_logloss: 0.11499
[224]	valid_0's binary_logloss: 0.114562
[225]	valid_0's binary_logloss: 0.114139
[226]	valid_0's binary_logloss: 0.113722
[227]	valid_0's bin

[405]	valid_0's binary_logloss: 0.0849534
[406]	valid_0's binary_logloss: 0.0849166
[407]	valid_0's binary_logloss: 0.0848812
[408]	valid_0's binary_logloss: 0.0848457
[409]	valid_0's binary_logloss: 0.0848079
[410]	valid_0's binary_logloss: 0.0847735
[411]	valid_0's binary_logloss: 0.0847379
[412]	valid_0's binary_logloss: 0.0847054
[413]	valid_0's binary_logloss: 0.0846722
[414]	valid_0's binary_logloss: 0.0846374
[415]	valid_0's binary_logloss: 0.0846003
[416]	valid_0's binary_logloss: 0.0845689
[417]	valid_0's binary_logloss: 0.0845379
[418]	valid_0's binary_logloss: 0.0845061
[419]	valid_0's binary_logloss: 0.0844768
[420]	valid_0's binary_logloss: 0.0844456
[421]	valid_0's binary_logloss: 0.0844145
[422]	valid_0's binary_logloss: 0.0843846
[423]	valid_0's binary_logloss: 0.0843555
[424]	valid_0's binary_logloss: 0.0843278
[425]	valid_0's binary_logloss: 0.0842975
[426]	valid_0's binary_logloss: 0.0842686
[427]	valid_0's binary_logloss: 0.0842406
[428]	valid_0's binary_logloss: 0.

[603]	valid_0's binary_logloss: 0.0824091
[604]	valid_0's binary_logloss: 0.082406
[605]	valid_0's binary_logloss: 0.0824044
[606]	valid_0's binary_logloss: 0.0824019
[607]	valid_0's binary_logloss: 0.0823999
[608]	valid_0's binary_logloss: 0.0823958
[609]	valid_0's binary_logloss: 0.0823921
[610]	valid_0's binary_logloss: 0.0823892
[611]	valid_0's binary_logloss: 0.0823849
[612]	valid_0's binary_logloss: 0.0823837
[613]	valid_0's binary_logloss: 0.082381
[614]	valid_0's binary_logloss: 0.0823782
[615]	valid_0's binary_logloss: 0.0823764
[616]	valid_0's binary_logloss: 0.0823751
[617]	valid_0's binary_logloss: 0.0823723
[618]	valid_0's binary_logloss: 0.0823707
[619]	valid_0's binary_logloss: 0.0823671
[620]	valid_0's binary_logloss: 0.0823625
[621]	valid_0's binary_logloss: 0.0823602
[622]	valid_0's binary_logloss: 0.0823571
[623]	valid_0's binary_logloss: 0.0823545
[624]	valid_0's binary_logloss: 0.0823544
[625]	valid_0's binary_logloss: 0.0823518
[626]	valid_0's binary_logloss: 0.08

[802]	valid_0's binary_logloss: 0.0821867
[803]	valid_0's binary_logloss: 0.0821858
[804]	valid_0's binary_logloss: 0.0821859
[805]	valid_0's binary_logloss: 0.0821879
[806]	valid_0's binary_logloss: 0.082189
[807]	valid_0's binary_logloss: 0.0821911
[808]	valid_0's binary_logloss: 0.0821907
[809]	valid_0's binary_logloss: 0.0821913
[810]	valid_0's binary_logloss: 0.0821885
[811]	valid_0's binary_logloss: 0.0821884
[812]	valid_0's binary_logloss: 0.0821889
[813]	valid_0's binary_logloss: 0.0821901
[814]	valid_0's binary_logloss: 0.0821907
[815]	valid_0's binary_logloss: 0.0821896
[816]	valid_0's binary_logloss: 0.0821895
[817]	valid_0's binary_logloss: 0.0821899
[818]	valid_0's binary_logloss: 0.0821903
[819]	valid_0's binary_logloss: 0.0821916
[820]	valid_0's binary_logloss: 0.0821895
[821]	valid_0's binary_logloss: 0.0821896
[822]	valid_0's binary_logloss: 0.0821893
[823]	valid_0's binary_logloss: 0.0821886
[824]	valid_0's binary_logloss: 0.0821878
[825]	valid_0's binary_logloss: 0.0

[1002]	valid_0's binary_logloss: 0.0821618
[1003]	valid_0's binary_logloss: 0.0821615
[1004]	valid_0's binary_logloss: 0.0821589
[1005]	valid_0's binary_logloss: 0.0821592
[1006]	valid_0's binary_logloss: 0.0821596
[1007]	valid_0's binary_logloss: 0.0821598
[1008]	valid_0's binary_logloss: 0.0821591
[1009]	valid_0's binary_logloss: 0.0821596
[1010]	valid_0's binary_logloss: 0.0821598
[1011]	valid_0's binary_logloss: 0.0821604
[1012]	valid_0's binary_logloss: 0.0821604
[1013]	valid_0's binary_logloss: 0.0821604
[1014]	valid_0's binary_logloss: 0.0821596
[1015]	valid_0's binary_logloss: 0.0821597
[1016]	valid_0's binary_logloss: 0.0821606
[1017]	valid_0's binary_logloss: 0.0821616
[1018]	valid_0's binary_logloss: 0.0821619
[1019]	valid_0's binary_logloss: 0.0821609
[1020]	valid_0's binary_logloss: 0.0821607
[1021]	valid_0's binary_logloss: 0.0821613
[1022]	valid_0's binary_logloss: 0.0821622
[1023]	valid_0's binary_logloss: 0.0821626
[1024]	valid_0's binary_logloss: 0.0821627
[1025]	vali

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.01, max_depth=7, min_child_samples=20,
        min_child_weight=0.001, min_samples_split=100, min_split_gain=0.0,
        n_estimators=2000, n_jobs=-1, num_leaves=64, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=1)

In [177]:
# result_properties = gbm.predict_proba(test_X)[:, 1]
# result_df = pd.concat([test_df['instance_id'], pd.DataFrame(data=result_properties, columns=['predicted_score'])], axis=1)

In [178]:
# result_df.to_csv('../total_data/baseline_20180327_0821885.txt', index=False, sep=' ')

# Logistic Regression

In [135]:
# from sklearn.linear_model import LogisticRegression

In [136]:
# from sklearn.metrics import log_loss

In [137]:
# lr = LogisticRegression()

In [184]:
# lr.fit(train_X, train_Y)
# print(log_loss(valid_Y, lr.predict_proba(valid_X)[:, 1]))

0.08258274015599797
