In [None]:
import numpy as np
import pandas as pd
import statsmodels as sm
import matplotlib.pylab as plt
import config as cf
import import_ipynb
import os
import sys
import xgboost
import time

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error, log_loss
from chinese_calendar import is_workday, is_holiday
from jupyterthemes import jtplot
from IPython.core.display import clear_output

# Apply the jupyterthemes plotting style to matplotlib figures.
jtplot.style()

# Let pandas render large frames without truncating rows or columns.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)

from IPython.core.display import display, HTML
# Widen the notebook container so wide tables fit on screen.
_container_css = "<style>.container { width:96% !important; }</style>"
display(HTML(_container_css))

In [None]:
# Minute-resolution timestamp identifying this run; later cells combine it
# with cf.model_path to locate the run's output directory.
exec_time = time.strftime("%Y%m%d%H%M", time.localtime())

# Create the per-run output directory. The existence check keeps the cell
# re-runnable within the same minute (os.mkdir would raise on the second run),
# and makedirs also creates any missing parent directories.
_run_dir = '{0}_{1}'.format(cf.model_path, exec_time)
if not os.path.isdir(_run_dir):
    os.makedirs(_run_dir)

In [None]:
def _read_feature_table(path):
    """Read a feature CSV using its first column as index, then turn that index back into a column."""
    table = pd.read_csv(path, index_col=0)
    return table.reset_index()


def _read_raw_table(path):
    """Read a space-separated raw competition file."""
    return pd.read_csv(path, sep = ' ')


train_df = _read_feature_table(cf.train_data_features_file_path)
test_df = _read_feature_table(cf.test_data_features_file_path)
test_a_df = _read_raw_table(cf.round1_test_a_file_path)
test_b_df = _read_raw_table(cf.round1_test_b_file_path)

# Hold out the rows with context_day == 24 as the validation set; every other
# row of train_df is used for fitting.
_is_holdout_day = train_df['context_day'] == 24
trainV_df = train_df.loc[~_is_holdout_day]
testV_df = train_df.loc[_is_holdout_day]

# The submission is built against the instance ids of the "b" test file.
ret_test_df = test_b_df

# train_df.loc[:,'context_datetime'] = pd.to_datetime(train_df.loc[:,'context_timestamp'] + time_offset, unit='s')
# train_df = train_df.loc[train_df['context_datetime'] < '2017-09-23']
# train_df = train_df.drop(columns=['context_datetime'])

In [None]:
# Shapes of the validation split (hold-out first, then the fitting part).
for _frame in (testV_df, trainV_df):
    print(_frame.shape)

# Shapes of the full test and train feature tables.
for _frame in (test_df, train_df):
    print(_frame.shape)

# Summary statistics of the is_trade column in each labelled frame.
for _frame in (trainV_df, testV_df, train_df):
    print(_frame[['is_trade']].describe())

In [None]:
# Positional index of the first feature column in test_df. The labelled
# frames are sliced from feature_start + 1 instead.
# NOTE(review): presumably the labelled frames carry one extra leading column
# (the label) compared to test_df -- confirm against the feature files.
# (A dead `feature_start = 10` assignment that was overwritten on the very
# next line has been removed.)
feature_start = 6

# Feature columns to drop from every design matrix (none by default).
exclude_columns = []
# exclude_columns = ['user_id_is', 'item_id_is', 'shop_id_is', 'item_brand_id_is']
# exclude_columns.extend(filter(lambda x:x.startswith('ui'), train_df.columns.values))
# exclude_columns.extend(filter(lambda x:x.endswith('user_cnt'), train_df.columns.values))

# Labels are read positionally from column 1 of each labelled frame.
# NOTE(review): presumably this is the is_trade column -- confirm.
trainV_y = trainV_df.iloc[:,1]
trainV_X = trainV_df.iloc[:,feature_start + 1:].drop(columns=exclude_columns)
testV_y = testV_df.iloc[:,1]
testV_X = testV_df.iloc[:,feature_start + 1:].drop(columns=exclude_columns)

train_y = train_df.iloc[:,1]
train_X = train_df.iloc[:,feature_start + 1:].drop(columns=exclude_columns)
test_X = test_df.iloc[:,feature_start:].drop(columns=exclude_columns)

print(trainV_X.shape)
print(testV_X.shape)
print(train_X.shape)
print(test_X.shape)
print(train_X.columns.values)

# Fail fast if the positional slices do not line up: the feature names are
# handed to xgboost.DMatrix, so a mismatch between the fitted and predicted
# matrices would otherwise only surface after the model has been trained.
_expected_features = list(trainV_X.columns)
for _split_name, _split_X in (('testV_X', testV_X), ('train_X', train_X), ('test_X', test_X)):
    assert list(_split_X.columns) == _expected_features, \
        'feature columns of {0} do not match trainV_X'.format(_split_name)

In [None]:
# train
def _as_dmatrix(features, labels=None):
    """Wrap a feature frame (and optional label series) in an xgboost.DMatrix that keeps the column names."""
    label_values = None if labels is None else labels.values
    return xgboost.DMatrix(features.values, label=label_values, feature_names=features.columns)


# Matrices for the validation split (fit on trainV, evaluate on testV).
trainV_matrix = _as_dmatrix(trainV_X, trainV_y)
testV_matrix = _as_dmatrix(testV_X, testV_y)

# Matrices for the full training table and the unlabelled test table.
train_matrix = _as_dmatrix(train_X, train_y)
predict_matrix = _as_dmatrix(test_X)

# Evaluation sets reported during boosting; they are passed to xgboost.train
# together with early_stopping_rounds.
watchlist = [(trainV_matrix, 'train'), (testV_matrix, 'eval')]

# Upper bound on boosting rounds and the early-stopping patience.
num_round = 5000
early_stopping_rounds = 200

# Booster hyper-parameters.
param = {
    'max_depth': 8,
    'eta': 0.05,
    'silent': 1,
    'seed': 42,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
#     'scale_pos_weight': 2,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
#     'min_child_weight': 100,
#     'max_delta_step': 20
}


print('model training')
# Handle to the real stdout, kept for the (currently disabled) redirect below.
save_stdout = sys.stdout
_valid_log_path = '{0}_{1}/{2}'.format(cf.model_path, exec_time, cf.model_valid_log)
with open(_valid_log_path, 'w+') as outf:
#     sys.stdout = outf
    # Fit on the non-hold-out rows, evaluating on the watchlist with early stopping.
    model = xgboost.train(
        param,
        trainV_matrix,
        num_boost_round=num_round,
        evals=watchlist,
        early_stopping_rounds=early_stopping_rounds
    )
    # test
    # Unlabelled copy of the hold-out features, scored with the best tree limit.
    test_matrix = xgboost.DMatrix(testV_X.values, feature_names=testV_X.columns)
    pred_y = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    frame = pd.Series(pred_y, index=testV_df.index, name='predicted_score')

    # Hold-out log loss, recorded in the validation log with the best iteration.
    best_eval = log_loss(testV_y.values, pred_y)
    outf.write("best %f:%s" %(best_eval, {'n_estimators':model.best_iteration}))

# Distribution of the hold-out predictions.
frame.hist(bins=100)
print(frame.value_counts().head())
print(frame.describe())
print("test log loss: %s" % best_eval)
# sys.stdout = save_stdout
_summary_template = 'model.best_score: {0}, model.best_iteration: {1}, model.best_ntree_limit: {2}'
print(_summary_template.format(model.best_score, model.best_iteration, model.best_ntree_limit))

In [None]:
# predict
# Score the test feature table with the model, limited to its best tree count.
pred_y = model.predict(predict_matrix, ntree_limit=model.best_ntree_limit)
frame = pd.Series(pred_y, index=test_df.index, name='predicted_score')
frame.hist(bins=100)
print(frame.value_counts().head())

# Attach each score to its instance_id, then keep the ids present in
# ret_test_df (inner join on the shared instance_id column).
pred_df = test_df[['instance_id']].join(frame)
_submission_ids = ret_test_df[['instance_id']]
ret_df = _submission_ids.merge(pred_df, how='inner', on='instance_id')

print(ret_df.describe())

# Write the space-separated result file into this run's output directory.
_result_path = '{0}_{1}/{1}_{2}'.format(cf.model_path, exec_time, cf.result_filename)
ret_df.to_csv(_result_path, sep=' ', index=False)

In [None]:
#analysis
# Plot the booster's feature importance.
xgboost.plot_importance(model)

# Tabulate the per-feature scores from get_fscore(), highest first, and save
# them next to the other outputs of this run.
_fscore_pairs = list(model.get_fscore().items())
imp_df = pd.DataFrame(_fscore_pairs, columns=['feature','importance'])
imp_df = imp_df.sort_values('importance', ascending=False)

_importance_path = '{0}_{1}/{1}_{2}'.format(cf.model_path, exec_time, 'importance.csv')
imp_df.to_csv(_importance_path, index=False)
imp_df