In [1]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import scipy
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tqdm import tqdm
%matplotlib inline
import pickle

In [2]:
# Run-wide switches.
# dense_text: False -> the text features are loaded and stacked as sparse
# matrices; True -> the dense pickles and NumPy stacking are used instead.
dense_text = False

# Seed for the cross-validation shuffle.
SEED = 411

# Shuffled 5-fold splitter consumed by the training loop.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

In [3]:
# Load the train/test feature tables stored in the `*_selected_features` pickles.
# Context managers close the file handles deterministically instead of leaving
# them open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code -- only load artifacts
# produced by this project's own pipeline.
with open('train_selected_features', 'rb') as train_file:
    train = pickle.load(train_file)
with open('test_selected_features', 'rb') as test_file:
    test = pickle.load(test_file)

# Last expression of the cell: displays both shapes.
train.shape, test.shape

((1503424, 145), (508438, 145))

In [4]:
# Load the text-feature matrices in the representation selected by `dense_text`.
# The branch only chooses the file names; the actual loading happens once, via
# context managers, so the file handles are always closed.
if dense_text is False:
    train_text_path, test_text_path = 'train_text_sparse_new', 'test_text_sparse_new'
else:
    train_text_path, test_text_path = 'train_text_dense_new', 'test_text_dense_new'

# NOTE(review): pickle.load can execute arbitrary code -- only load artifacts
# produced by this project's own pipeline.
with open(train_text_path, 'rb') as train_text_file:
    train_text = pickle.load(train_text_file)
with open(test_text_path, 'rb') as test_text_file:
    test_text = pickle.load(test_text_file)

# Last expression of the cell: displays both shapes.
train_text.shape, test_text.shape

((1503424, 44161), (508438, 44161))

In [5]:
# Join the text features (left) with the selected features (right) column-wise.
# The same left-to-right order is used when the feature-name list is assembled.
if dense_text is False:
    # Sparse text matrices: stack with scipy and keep the result in CSR form.
    x_train = scipy.sparse.hstack([train_text, train], format='csr')
    x_test = scipy.sparse.hstack([test_text, test], format='csr')
else:
    # Dense text matrices: plain NumPy horizontal stack.
    x_train = np.hstack([train_text, train])
    x_test = np.hstack([test_text, test])

In [6]:
# Synthetic names ('text_0', 'text_1', ...), one per column of the text matrix.
text_feature_names = ['text_{}'.format(col_idx) for col_idx in range(train_text.shape[1])]

In [7]:
# Feature names in the same column order as the stacked matrices:
# text columns first, then the columns of the selected-feature table.
feature_names = np.hstack([
    text_feature_names,
    train.columns.tolist(),
])

# Presumably the names of the categorical columns (judging by the file name);
# read through a context manager so the file handle is closed right away.
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
with open('selected_cat_feature_names', 'rb') as cat_file:
    categorical = pickle.load(cat_file)

print('Number of features:', len(feature_names))

Number of features: 44306


In [8]:
# The inputs have already been stacked into x_train / x_test, so drop the
# individual objects and run a collection pass to release their memory.
del train, test, train_text, test_text
gc.collect()

0

In [9]:
# Name of the regression target column.
target = 'deal_probability'

# Target values read from the raw training file, ordered by activation date.
# NOTE(review): this ordering presumably has to match the row order of the
# pickled feature matrices -- confirm against the feature-building code.
y_train = (
    pd.read_csv(
        'data/train.csv',
        usecols=['activation_date', 'deal_probability'],
        parse_dates=['activation_date'],
    )
    .sort_values('activation_date')
    .reset_index(drop=True)['deal_probability']
    .values
)

In [10]:
# Project-local helpers from the GridSearcher module.
from GridSearcher import (
    data_loader,
    model_loader,
    fit_params,
    get_oof_predictions,
)

# Whatever model_loader returns for the 'lgb' key.
# NOTE(review): `ml` is not referenced by any other cell visible in this
# notebook section -- confirm it is still needed.
ml = model_loader('lgb')

In [11]:
# Upper bound on boosting rounds; early stopping is expected to end training sooner.
rounds = 24000
# Stop once the validation metric has not improved for this many rounds.
early_stop_rounds = 200

# Earlier parameter sets, kept for reference only (they used to live in an
# unused triple-quoted string that was evaluated and discarded on every run):
#   1) objective=regression, metric=rmse, num_leaves=270, max_depth=-1,
#      learning_rate=0.02, feature_fraction=0.6, feature_fraction_seed=SEED,
#      bagging_fraction=0.75, bagging_freq=4, verbosity=-1, nthread=4
#   2) objective=regression, metric=rmse, num_leaves=48, max_depth=15,
#      learning_rate=0.02, feature_fraction=0.6, verbosity=-1, n_jobs=4
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': 'rmse',
    'min_child_weight': 1.5,
    'num_leaves': 2 ** 5,
    'lambda_l2': 10,
    # NOTE(review): per the LightGBM parameter docs, row subsampling only takes
    # effect when bagging_freq (alias subsample_freq) is non-zero; it is not
    # set here, so this value is likely inert. Confirm the intent.
    'subsample': 0.7,
    'colsample_bytree': 0.5,
    # NOTE(review): 'colsample_bylevel' (here) and 'silent' (below) do not seem
    # to be documented LightGBM core parameters or aliases -- they look like
    # carry-overs from other APIs and are probably ignored. Verify.
    'colsample_bylevel': 0.5,
    'learning_rate': 0.1,
    'seed': 2018,
    'nthread': 4,
    'silent': True,
}

# Take the fold count from the splitter itself so the test-set average below
# stays correct if `kf` is ever configured with a different number of folds.
n_folds = kf.get_n_splits()

# ret:        out-of-fold predictions, one per training row.
# ret_test:   running sum of per-fold test predictions (averaged after the loop).
# ret_models: the fitted booster of every fold.
ret = np.zeros((x_train.shape[0],))
ret_test = np.zeros((x_test.shape[0],))
ret_models = []

for train_ix, val_ix in kf.split(x_train):
    # Feature names and categorical features are currently not passed to the
    # datasets (disabled arguments: feature_name=list(feature_names),
    # categorical_feature=categorical).
    dtrain = lgb.Dataset(x_train[train_ix, :], label=y_train[train_ix])
    dvalid = lgb.Dataset(x_train[val_ix, :], label=y_train[val_ix])

    model = lgb.train(params, dtrain,
                      valid_sets=[dtrain, dvalid],
                      valid_names=['train', 'valid'],
                      num_boost_round=rounds,
                      early_stopping_rounds=early_stop_rounds,
                      verbose_eval=100)

    # NOTE(review): these calls rely on predict() defaulting to the best
    # iteration after early stopping -- confirm for the installed LightGBM.
    ret[val_ix] = model.predict(x_train[val_ix, :])
    ret_test += model.predict(x_test)
    ret_models.append(model)

    # Release the per-fold datasets before the next fold builds its own.
    del dtrain, dvalid

# Average the accumulated test predictions over the number of folds.
ret_test = ret_test / n_folds

Training until validation scores don't improve for 200 rounds.
[100]	train's rmse: 0.221273	valid's rmse: 0.221939
[200]	train's rmse: 0.218702	valid's rmse: 0.220117
[300]	train's rmse: 0.217282	valid's rmse: 0.219443
[400]	train's rmse: 0.216102	valid's rmse: 0.219045
[500]	train's rmse: 0.215129	valid's rmse: 0.21883
[600]	train's rmse: 0.214199	valid's rmse: 0.218605


KeyboardInterrupt: 

In [None]:
# One feature-importance chart per fold model, capped at 100 features each.
for fold_model in ret_models:
    fig, ax = plt.subplots(figsize=(10, 14))
    lgb.plot_importance(fold_model, ax=ax, max_num_features=100)
    ax.set_title("Light GBM Feature Importance")
    plt.show()

In [None]:
# Shared prefix for the prediction column name and the output file names
# written by the following cells.
prefix = 'select_features_label_price_lgb'

In [None]:
# Write the out-of-fold (train) and test predictions to CSV, each as a single
# column named '<prefix>_pred'. The column name is built once so both files
# agree (the original first line was missing the '+' and did not parse).
pred_col = prefix + '_pred'
pd.DataFrame(data=ret, columns=[pred_col]).to_csv(prefix + '_oof_val_pred.csv', index=False)
pd.DataFrame(data=ret_test, columns=[pred_col]).to_csv(prefix + '_oof_test_pred.csv', index=False)

In [None]:
# Fill the sample submission with the test predictions, clipped to [0, 1],
# and save it under the run prefix.
subm = pd.read_csv('sample_submission.csv')
subm['deal_probability'] = ret_test.clip(0, 1)
subm.to_csv('{}_submission.csv'.format(prefix), index=False)