In [2]:
import import_ipynb
from sklearn.preprocessing import Imputer, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.utils import shuffle
import numpy as np
import itertools
import pandas as pd
import collections
from scipy import sparse
import time
from scipy.sparse import csr_matrix, save_npz, load_npz, hstack
import warnings
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

from loadData import raw_df, test_df
from Util import getAllTypesofCategory, getAllTypesOfProperty

warnings.filterwarnings('ignore')

In [3]:
final_df = pd.concat([raw_df.drop(['is_trade'], axis = 1), test_df], axis = 0).reset_index(drop=True)

# impute missing value

In [4]:
contain_missing_value_columns = ["item_brand_id","item_city_id","item_sales_level","user_gender_id","user_age_level",
                                "user_occupation_id","user_star_level","shop_review_positive_rate",
                                "shop_score_service","shop_score_delivery","shop_score_description"]
data_imputer = Imputer(missing_values=-1 , strategy='most_frequent', axis=0)
final_df[contain_missing_value_columns] = data_imputer.fit_transform(final_df[contain_missing_value_columns])

# preporcessing

In [5]:
lbl = LabelEncoder()

## Add features

### item

In [6]:
# len_item_category, len_item_property features
final_df['len_item_category'] = final_df['item_category_list'].map(lambda x: len(str(x).split(';')))
final_df['len_item_property'] = final_df['item_property_list'].map(lambda x: len(str(x).split(';')))

In [7]:
# item_category_1, item_category_2 feature
for i in range(1, 3):
    final_df['item_category_%d'%(i)] = lbl.fit_transform(final_df['item_category_list'].map(lambda x: str(str(x).split(';')[i]) if len(str(x).split(';')) > i else ''))

In [8]:
# item_properties_0 - 9
for i in range(10):
    final_df['item_properties_%d'%(i)] = lbl.fit_transform(final_df['item_property_list'].map(lambda x: str(str(x).split(';')[i]) if len(str(x).split(';')) > i else ''))

In [9]:
# item_id, item_brand_id, item_city_id
for col in ['item_id', 'item_brand_id', 'item_city_id']:
    final_df[col] = lbl.fit_transform(final_df[col])

### user

In [10]:
for col in ['user_id']:
    final_df[col] = lbl.fit_transform(final_df[col])

In [11]:
for i, column in zip(range(1, 6), ['user_age_level', 'user_occupation_id', 'user_star_level', 'context_page_id', 'shop_star_level']):
    final_df[column] = final_df[column].apply(lambda item: item%(i*1000))

### context

In [12]:
final_df['len_predict_category_property'] = final_df['predict_category_property'].map(lambda x: len(str(x).split(';')))

In [13]:
for i in range(5):
    final_df['predict_category_property' + str(i)] = lbl.fit_transform(final_df['predict_category_property'].map(
        lambda x: str(str(x).split(';')[i]) if len(str(x).split(';')) > i else ''))

### shop

In [14]:
for col in ['shop_id']:
    final_df[col] = lbl.fit_transform(final_df[col])

In [15]:
final_df['shop_score_delivery0'] = final_df['shop_score_delivery'].apply(lambda x: 0 if x <= 0.98 and x >= 0.96  else 1)

### others

In [None]:
# final_df[['item_category_list', 'predict_categories']]

In [None]:
# # 'item_category_list', predict_categories
# def calc_predict_category_accuracy(row_item):
#     num = 0
#     predict_categories_list = row_item.predict_categories
#     for i in row_item.item_category_list:
#         if i in predict_categories_list:
#             num += 1
#     return round(num/len(predict_categories_list), 2)
# final_df['predict_category_accuracy'] = final_df[['item_category_list', 'predict_categories']].apply((lambda item: calc_predict_category_accuracy(item)), axis = 1)

In [23]:
# final_df.columns

Index(['instance_id', 'item_id', 'item_category_list', 'item_property_list',
       'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level',
       'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id',
       'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id',
       'context_timestamp', 'context_page_id', 'predict_category_property',
       'shop_id', 'shop_review_num_level', 'shop_review_positive_rate',
       'shop_star_level', 'shop_score_service', 'shop_score_delivery',
       'shop_score_description', 'time', 'day', 'hour', 'len_item_category',
       'len_item_property', 'item_category_1', 'item_category_2',
       'item_properties_0', 'item_properties_1', 'item_properties_2',
       'item_properties_3', 'item_properties_4', 'item_properties_5',
       'item_properties_6', 'item_properties_7', 'item_properties_8',
       'item_properties_9', 'len_predict_category_property',
       'predict_category_property0', 'predict_categor

## drop features

In [16]:
final_df_ = final_df.drop(['instance_id', 'item_category_list', 'item_property_list', 'context_id', 'predict_category_property', 'time', 'context_timestamp'], axis=1)

# split data into train, valid and test set, shuffle it

In [17]:
train_X, valid_X, test_X = final_df_[final_df.day < 24].as_matrix(), final_df_[final_df.day == 24].as_matrix(), final_df_[final_df.day == 25].as_matrix()
train_Y, valid_Y  = raw_df[raw_df.day < 24]['is_trade'].as_matrix(), raw_df[raw_df.day == 24]['is_trade'].as_matrix()

In [18]:
train_X, train_Y = shuffle(train_X, train_Y, random_state=0)

# Train by LGBM

In [19]:
# param_test7 = {
# 'n_estimators':[20]
# }
# clf = GridSearchCV(lgb.LGBMClassifier(objective='binary',
#      num_leaves=64,
#      learning_rate=0.01,
#      n_estimators=2000,
#      max_depth=7,
#      min_samples_split=100),
#      param_test7,
#      verbose=1,
#      cv=5,
#      scoring='log_loss',
#      n_jobs=4)
# clf.fit(train_X,train_Y)

In [20]:
gbm = lgb.LGBMClassifier(
        objective='binary',
        # metric='binary_error',
        num_leaves=35,
        depth=8,
        learning_rate=0.05,
        seed=2018,
        colsample_bytree=0.8,
        # min_child_samples=8,
        subsample=0.9,
        n_estimators=20000)
gbm.fit(train_X, train_Y,
#         feature_name = list(trainX.columns.values),
        eval_set=[(valid_X, valid_Y)],
        eval_metric='binary_logloss',
        early_stopping_rounds=100
       )

[1]	valid_0's binary_logloss: 0.647778
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.606733
[3]	valid_0's binary_logloss: 0.569422
[4]	valid_0's binary_logloss: 0.53537
[5]	valid_0's binary_logloss: 0.504199
[6]	valid_0's binary_logloss: 0.4756
[7]	valid_0's binary_logloss: 0.449266
[8]	valid_0's binary_logloss: 0.424959
[9]	valid_0's binary_logloss: 0.4025
[10]	valid_0's binary_logloss: 0.381692
[11]	valid_0's binary_logloss: 0.362412
[12]	valid_0's binary_logloss: 0.344469
[13]	valid_0's binary_logloss: 0.327799
[14]	valid_0's binary_logloss: 0.312267
[15]	valid_0's binary_logloss: 0.297792
[16]	valid_0's binary_logloss: 0.284276
[17]	valid_0's binary_logloss: 0.271652
[18]	valid_0's binary_logloss: 0.259858
[19]	valid_0's binary_logloss: 0.248824
[20]	valid_0's binary_logloss: 0.238495
[21]	valid_0's binary_logloss: 0.228833
[22]	valid_0's binary_logloss: 0.219779
[23]	valid_0's binary_logloss: 0.211291
[24]	valid_0's binary_logloss: 

[204]	valid_0's binary_logloss: 0.0821324
[205]	valid_0's binary_logloss: 0.0821407
[206]	valid_0's binary_logloss: 0.0821508
[207]	valid_0's binary_logloss: 0.0821511
[208]	valid_0's binary_logloss: 0.0821538
[209]	valid_0's binary_logloss: 0.0821528
[210]	valid_0's binary_logloss: 0.082146
[211]	valid_0's binary_logloss: 0.0821497
[212]	valid_0's binary_logloss: 0.0821448
[213]	valid_0's binary_logloss: 0.0821482
[214]	valid_0's binary_logloss: 0.0821516
[215]	valid_0's binary_logloss: 0.0821613
[216]	valid_0's binary_logloss: 0.0821686
[217]	valid_0's binary_logloss: 0.0821704
[218]	valid_0's binary_logloss: 0.0821756
[219]	valid_0's binary_logloss: 0.0821843
[220]	valid_0's binary_logloss: 0.0821872
[221]	valid_0's binary_logloss: 0.0821826
[222]	valid_0's binary_logloss: 0.0821738
[223]	valid_0's binary_logloss: 0.0821797
[224]	valid_0's binary_logloss: 0.0821835
[225]	valid_0's binary_logloss: 0.0821969
[226]	valid_0's binary_logloss: 0.0822006
[227]	valid_0's binary_logloss: 0.0

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
        depth=8, learning_rate=0.05, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=20000,
        n_jobs=-1, num_leaves=35, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, seed=2018, silent=True,
        subsample=0.9, subsample_for_bin=200000, subsample_freq=1)

# predict score

In [21]:
# result_properties = gbm.predict_proba(test_X)[:, 1]
# result_df = pd.concat([test_df['instance_id'], pd.DataFrame(data=result_properties, columns=['predicted_score'])], axis=1)

In [22]:
# result_df.to_csv('../total_data/baseline_20180329_0820879.txt', index=False, sep=' ')