In [None]:
import numpy as np
import pandas as pd
import statsmodels as sm
import matplotlib.pylab as plt
import config as cf
import import_ipynb
import os
import sys
import lightgbm as lgb
import time
import itertools

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error, log_loss
from chinese_calendar import is_workday, is_holiday
from jupyterthemes import jtplot
from IPython.core.display import clear_output

# Apply the jupyterthemes plotting style to matplotlib output.
jtplot.style()
# Raise the pandas display limits so long/wide frames are not truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 200

# Inject a CSS rule that widens the notebook container to 96% of the page.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:96% !important; }</style>"))

In [None]:
# Minute-resolution local timestamp identifying this run; it is appended to
# cf.model_path to form the directory that receives this run's outputs.
exec_time = time.strftime("%Y%m%d%H%M", time.localtime())
run_dir = '{0}_{1}'.format(cf.model_path, exec_time)
# os.mkdir raised OSError when this cell was re-run within the same minute
# (directory already present) or when a parent directory was missing.
# Create intermediate directories and tolerate an existing run directory.
if not os.path.isdir(run_dir):
    os.makedirs(run_dir)

In [None]:
# Feature tables: the first CSV column is read as the index and then moved
# straight back into a regular column by reset_index().
train_df, test_df = (
    pd.read_csv(path, index_col=0).reset_index()
    for path in (cf.train_data_features_file_path, cf.test_data_features_file_path)
)
# Raw round-1 test files (space separated).
test_a_df, test_b_df = (
    pd.read_csv(path, sep=' ')
    for path in (cf.round1_test_a_file_path, cf.round1_test_b_file_path)
)

# Hold out the rows whose context_day equals 24 as the validation split.
is_holdout_day = train_df['context_day'] == 24
trainV_df = train_df.loc[~is_holdout_day]
testV_df = train_df.loc[is_holdout_day]

# The submission is built against the instances listed in test set B.
ret_test_df = test_b_df

# Disabled experiment: restrict training rows by calendar date.
# train_df.loc[:,'context_datetime'] = pd.to_datetime(train_df.loc[:,'context_timestamp'] + time_offset, unit='s')
# train_df = train_df.loc[train_df['context_datetime'] < '2017-09-23']
# train_df = train_df.drop(columns=['context_datetime'])

In [None]:
# Shapes: the two validation splits first, then the full test/train frames.
for frame in (testV_df, trainV_df, test_df, train_df):
    print(frame.shape)

# Summary statistics of the is_trade column for each train-derived frame.
for frame in (trainV_df, testV_df, train_df):
    print(frame[['is_trade']].describe())

In [None]:
# Column offset of the first feature in test_df. Train-derived frames are
# sliced one column later (presumably because they additionally carry the
# label column -- confirm against the feature files). An earlier value of 10
# was superseded by 6.
feature_start = 6
exclude_columns = ['user_id_is', 'item_id_is', 'shop_id_is', 'item_brand_id_is']
# Disabled: additional exclusions selected by column-name pattern.
# exclude_columns.extend(filter(lambda x:x.startswith('ui'), train_df.columns.values))
# exclude_columns.extend(filter(lambda x:x.endswith('user_cnt'), train_df.columns.values))
train_feature_start = feature_start + 1

# Validation split: targets come from column position 1.
trainV_y = trainV_df.iloc[:, 1]
trainV_X = trainV_df.iloc[:, train_feature_start:].drop(exclude_columns, axis=1)
testV_y = testV_df.iloc[:, 1]
testV_X = testV_df.iloc[:, train_feature_start:].drop(exclude_columns, axis=1)

# Full training set and the test set to be scored.
train_y = train_df.iloc[:, 1]
train_X = train_df.iloc[:, train_feature_start:].drop(exclude_columns, axis=1)
test_X = test_df.iloc[:, feature_start:].drop(exclude_columns, axis=1)

for matrix in (trainV_X, testV_X, train_X, test_X):
    print(matrix.shape)
print(train_X.columns.values)

In [None]:
# Bring the count-style features of the test set onto the training scale:
# every column whose name ends in '0_cnt' or 'query_cnt' is multiplied in
# test_X by (train mean / test mean), so both sets end up with equal means.
columns = [name for name in train_X.columns.values
           if name.endswith('0_cnt') or name.endswith('query_cnt')]
# columns = train_X.columns.values
# Disabled: the same rescaling of testV_X against trainV_X.
for column in columns:
    train_mean = train_X[column].mean()
    test_mean = test_X[column].mean()
    # A zero or NaN test mean (or a NaN train mean) used to turn the whole
    # test column into NaN/inf; leave such degenerate columns untouched.
    if test_mean == 0 or not (np.isfinite(train_mean) and np.isfinite(test_mean)):
        continue
    # Same operand order as before so the scaled values are bit-identical.
    test_X.loc[:, column] = test_X[column] * train_mean / test_mean

In [None]:
# args
# Keyword arguments forwarded to lgb.LGBMClassifier.
lgb_args = dict(
    num_leaves=158,
    max_depth=8,
    learning_rate=0.05,
    seed=42,
    # min_child_samples=8,
    colsample_bytree=0.8,
    subsample=0.9
)

# Settings for the validation fit; best_iter is only a starting value and is
# replaced by the model's best_iteration_ once the validation cell has run.
early_stopping_rounds = 200
valid_n_estimators = 5000
best_iter = 112

# Manual categorical list kept for reference; 'auto' is what is actually used.
# categorical_feature=['user_gender_id', 'user_occupation_id', 'item_category_id',
#                      'price_sale', 'collect_sale', 'collect_price', 'collect_pv', 'sale_pv',
#                      'gender_age', 'gender_occ', 'gender_star', 'review_star',
#                      'price_gender', 'price_occ', 'price_star', 'context_week',
#                      'user_id_is', 'item_id_is', 'shop_id_is', 'item_brand_id_is']
categorical_feature = 'auto'

In [None]:
# test
# Fit on the validation-training split with early stopping against the
# held-out split, then record the log loss and the best iteration.
print('Training LGBM model...')
clf = lgb.LGBMClassifier(objective='binary', n_estimators=valid_n_estimators, **lgb_args)

save_stdout = sys.stdout
valid_log_path = '{0}_{1}/{2}'.format(cf.model_path, exec_time, cf.model_valid_log)
with open(valid_log_path, 'w+') as outf:
    # Disabled: redirect the training output into the log file.
    # sys.stdout = outf
    lgb_model = clf.fit(
        trainV_X, trainV_y,
        eval_set=[(testV_X, testV_y)],
        early_stopping_rounds=early_stopping_rounds,
        categorical_feature=categorical_feature)
    sys.stdout = save_stdout

    best_iter = lgb_model.best_iteration_
    # Probability of the positive class for each held-out row.
    pred_y = lgb_model.predict_proba(testV_X)[:, 1]
    # print(test[['is_trade','pred']])
    best_eval = log_loss(testV_y.values, pred_y)
    summary = "best %f:%s" % (best_eval, {'n_estimators': best_iter})
    outf.write(summary)

print(summary)
print('pred mean: %s  valid mean: %s' % (pred_y.mean(), testV_y.mean()))

In [None]:
# pred
# Score the test set with the classifier fitted in the validation cell.
# NOTE(review): that classifier was fitted on trainV_X only; the refit on the
# full training set below is disabled.
clf1 = clf
# clf1 = lgb.LGBMClassifier(objective='binary', n_estimators=best_iter, **lgb_args)
# clf1 = lgb.LGBMClassifier(objective='binary', n_estimators=247, **lgb_args)
# clf1.fit(train_X, train_y, categorical_feature=categorical_feature)
positive_proba = clf1.predict_proba(test_X)[:, 1]
test_pred = pd.Series(positive_proba, name='predicted_score')

# Quick look at the score distribution.
test_pred.hist(bins=100)
print(test_pred.nunique())
print(test_pred.value_counts().head())

# Attach scores to instance ids by row position, then keep only the
# instances requested by the submission template.
pred_df = test_df[['instance_id']].join(test_pred)
ret_df = ret_test_df[['instance_id']].merge(pred_df, on='instance_id')

print(ret_df.describe())

result_path = '{0}_{1}/{1}_{2}'.format(cf.model_path, exec_time, cf.result_filename)
ret_df.to_csv(result_path, sep=' ', index=False)

In [None]:
#analysis
# Pair each feature name with the fitted model's importance, highest first.
importance_table = pd.DataFrame({
    'feature_name': train_X.columns.values,
    'importance': clf.feature_importances_,
})
imp_df = importance_table.sort_values(by='importance', ascending=False)

# Persist the ranking next to the other outputs of this run.
importance_path = '{0}_{1}/{1}_{2}'.format(cf.model_path, exec_time, 'importance.csv')
imp_df.to_csv(importance_path, index=False)
lgb.plot_importance(clf)
imp_df

In [None]:
# Draw a tree of the fitted model with lgb.plot_tree and save it as SVG.
# figsize is expressed in inches, so the former (2048, 768) asked for a
# 2048 x 768 inch canvas; 20.48 x 7.68 in at 100 dpi yields 2048 x 768 px.
fig = plt.figure(figsize=(20.48, 7.68), dpi=100)
ax = fig.add_subplot(111)

# plot_tree only uses figsize when it has to create its own figure; with an
# explicit ax the argument is ignored, so it is not passed.
lgb.plot_tree(clf, ax=ax)

fig.savefig("foo.svg")