In [None]:
import numpy as np
import pandas as pd
import statsmodels as sm
import matplotlib.pylab as plt
import config as cf
import import_ipynb
import os
import sys
import lightgbm as lgb
import time
import itertools

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error, log_loss
from chinese_calendar import is_workday, is_holiday
from jupyterthemes import jtplot
from IPython.core.display import clear_output
from feature_extract import *

# Apply the jupyterthemes plotting style to matplotlib figures.
jtplot.style()

# Raise pandas' display limits so wide/long feature tables are not truncated.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)

# Import from the public IPython.display module: importing `display` from
# IPython.core.display is deprecated (DeprecationWarning since IPython 7.14).
from IPython.display import display, HTML
# Widen the notebook container so wide tables fit on screen.
display(HTML("<style>.container { width:96% !important; }</style>"))

In [None]:
# Minute-resolution timestamp used to name this run's output directory and files.
exec_time = time.strftime("%Y%m%d%H%M", time.localtime())

# Directory that later cells write the validation log, predictions and
# importances into ('<cf.model_path>_<exec_time>').
run_dir = '{0}_{1}'.format(cf.model_path, exec_time)
# Re-running this cell within the same minute produces the same timestamp;
# a bare os.mkdir would then raise because the directory already exists.
if not os.path.isdir(run_dir):
    os.makedirs(run_dir)

In [None]:
def _read_feature_table(csv_path):
    """Read a feature CSV, using its first column as the row index."""
    return pd.read_csv(csv_path, index_col=0)


# Feature tables at the "*_valid_*" config paths (presumably the hold-out
# validation split -- confirm against the feature extraction step).
trainV_df = _read_feature_table(cf.train_valid_features_file_path)
testV_df = _read_feature_table(cf.test_valid_features_file_path)

# Feature tables at the "*_data_*" config paths, used for the final fit/predict.
train_df = _read_feature_table(cf.train_data_features_file_path)
test_df = _read_feature_table(cf.test_data_features_file_path)

# Space-separated round-1 test file; only its instance_id column is used later.
ret_test_df = pd.read_csv(cf.round1_test_file_path, sep=' ')

In [None]:
# Row/column counts of every loaded feature table.
for frame in (testV_df, trainV_df, test_df, train_df):
    print(frame.shape)

# Summary statistics of the is_trade column for the tables that are
# described here (the final test table is not).
for frame in (trainV_df, testV_df, train_df):
    print(frame[['is_trade']].describe())

In [None]:
# Positional offset used to slice the feature columns out of each table.
feature_start = 8


def _label_and_features(frame):
    """Return (column 0, columns after position feature_start) of `frame`."""
    return frame.iloc[:, 0], frame.iloc[:, feature_start + 1:]


trainV_y, trainV_X = _label_and_features(trainV_df)
testV_y, testV_X = _label_and_features(testV_df)
train_y, train_X = _label_and_features(train_df)

# The final test table is sliced one column earlier than the others --
# presumably because it carries no label column; TODO confirm.
test_X = test_df.iloc[:, feature_start:]

for matrix in (trainV_X, testV_X, train_X, test_X):
    print(matrix.shape)

In [None]:
# Display the names of the columns selected as validation-training features.
feature_columns = trainV_X.columns
feature_columns.values

In [None]:
# test
def test_lgbm(leaves, depth, est):
    """Fit an LGBMClassifier on trainV_X/trainV_y and score it on testV_X/testV_y.

    Args:
        leaves: value passed as ``num_leaves``.
        depth: value passed as ``max_depth``.
        est: value passed as ``n_estimators``.

    Returns:
        Log loss of the second-class probabilities (``predict_proba`` column 1)
        against ``testV_y``.
    """
    model = lgb.LGBMClassifier(
        num_leaves=leaves,
        max_depth=depth,
        n_estimators=est,
        n_jobs=20,
    )
    model.fit(trainV_X, trainV_y)

    positive_proba = model.predict_proba(testV_X)[:, 1]
    return log_loss(testV_y.values, positive_proba)

# Exhaustive grid search over (max_depth, num_leaves, n_estimators), scored by
# validation log loss via test_lgbm. Every result is appended to the run's
# validation log; the best combination is kept in best_params / best_eval.

# Initialise under the name that is actually read below (the original
# initialised a misspelled `best_pramas`, leaving `best_params` undefined
# whenever no candidate beat the starting threshold).
best_params = None
# Start from +inf so the first evaluated candidate is always recorded,
# instead of relying on some log loss happening to fall below 1.
best_eval = float('inf')
all_depth = range(7, 10)
all_leaves = range(70, 85, 1)
all_ests = [80, 100]

valid_log_path = '{0}_{1}/{2}'.format(cf.model_path, exec_time, cf.model_valid_log)
with open(valid_log_path, 'w') as outf:
    for d, l, e in itertools.product(all_depth, all_leaves, all_ests):
        params = {'leaves': l, 'depth': d, 'estimators': e}
        logloss_eval = test_lgbm(params['leaves'], params['depth'], params['estimators'])
        outf.write("%f:%s\n" % (logloss_eval, str(params)))
        # Flush after each candidate so partial results survive an interrupted run.
        outf.flush()
        if logloss_eval < best_eval:
            print("%f:%s" % (logloss_eval, str(params)))
            best_eval = logloss_eval
            best_params = params
    outf.write("best %f:%s\n" % (best_eval, str(best_params)))
    print("best %f:%s" % (best_eval, str(best_params)))

In [None]:
# pred
# Refit on the full training table with the best grid-search parameters,
# score the test table and write the submission file.
clf = lgb.LGBMClassifier(num_leaves=best_params['leaves'], max_depth=best_params['depth'], n_estimators=best_params['estimators'], n_jobs=20)
# Manual override kept for reference:
# clf = lgb.LGBMClassifier(num_leaves=72, max_depth=7, n_estimators=100, n_jobs=20)
clf.fit(train_X, train_y)

# Column 1 of predict_proba is the probability of the second class
# (clf.classes_[1]). Rows are in the same order as test_X / test_df.
test_pred = pd.Series(clf.predict_proba(test_X)[:, 1], index=test_df.index, name='predicted_score')

test_pred.hist(bins=100)
print(test_pred.nunique())
print(test_pred.value_counts().head())

# Attach scores positionally. test_df is indexed by the CSV's first column, so
# an index-label join against a freshly numbered Series could silently produce
# NaN or mismatched scores whenever that index is not exactly 0..n-1.
pred_df = test_df[['instance_id']].assign(predicted_score=test_pred.values)
# Keep the instance_id rows/order of the official test file.
ret_df = ret_test_df[['instance_id']].merge(pred_df, on='instance_id')

print(ret_df.describe())

ret_df.to_csv('{0}_{1}/{1}_{2}'.format(cf.model_path, exec_time, cf.result_filename), sep=' ', index=False)

In [None]:
#analysis
# Pair every training feature column with the fitted model's importance value,
# most important first.
importance_by_feature = {
    'feature_name': train_X.columns.values,
    'importance': clf.feature_importances_,
}
imp_df = pd.DataFrame(importance_by_feature)
imp_df = imp_df.sort_values('importance', ascending=False)

# Persist the ranking next to the other outputs of this run.
importance_csv = '{0}_{1}/{1}_{2}'.format(cf.model_path, exec_time, 'importance.csv')
imp_df.to_csv(importance_csv, index=False)

lgb.plot_importance(clf)
imp_df