In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import time

In [2]:
# datatypeの変換
def change_datatype(df):
    for col in list(df.select_dtypes(include=['int']).columns):
        if np.max(df[col]) <= 127 and np.min(df[col]) >= -128:
            df[col] = df[col].astype(np.int8)
        elif np.max(df[col]) <= 255 and np.min(df[col]) >= 0:
            df[col] = df[col].astype(np.uint8)
        elif np.max(df[col]) <= 32767 and np.min(df[col]) >= -32768:
            df[col] = df[col].astype(np.int16)
        elif np.max(df[col]) <= 65535 and np.min(df[col]) >= 0:
            df[col] = df[col].astype(np.uint16)
        elif np.max(df[col]) <= 2147483647 and np.min(df[col]) >= -2147483648:
            df[col] = df[col].astype(np.int32)
        elif np.max(df[col]) <= 4294967296 and np.min(df[col]) >= 0:
            df[col] = df[col].astype(np.uint32)
    for col in list(df.select_dtypes(include=['float']).columns):
        df[col] = df[col].astype(np.float32)

In [3]:
# 文字数カウント
def count_words(key):
    return len(str(key).split())

In [4]:
# 数字のカウント
# isalphaで英字判定
def count_numbers(key):
    return sum(c.isalpha() for c in key)

In [5]:
# 小文字判定
def count_upper(key):
    return sum(c.isupper() for c in key)

In [6]:
# 
# pandasのagg()は、関数を渡して処理することができる。
# numpyのmean関数は、平均の取得
# to_frameは、seriesをdataframeに変換
def get_mean(df, name, target, alpha=0):
    group = df.groupby(name)[target].agg([np.sum, np.size])
    mean = train[target].mean()
    series = (group['sum'] + mean*alpha)/(group['size']+alpha)
    series.name = name + '_mean'
    return series.to_frame().reset_index()

In [7]:
# numpyのonesは、引数分の配列を生成する
def add_words(df, name, length):
    x_data = []
    for x in df[name].values:
        x_row = np.ones(length, dtype=np.uint16)*0
        for xi, i in zip(list(str(x)), np.arange(length)):
            x_row[i] = ord(xi)
        x_data.append(x_row)
    return pd.concat([df, pd.DataFrame(x_data, columns=[name+str(c) for c in range(length)]).astype(np.uint16)], axis=1)

In [8]:
start_time = time.time()
c_categories = ['name', 'category_name', 'brand_name', 'item_description']
c_means = ['category_name', 'item_condition_id', 'brand_name']
c_texts = ['name', 'item_description']
c_ignors = ['name', 'item_description', 'brand_name', 'category_name', 'train_id', 'test_id', 'price']

In [9]:
train = pd.read_csv('../input/train.tsv', sep='\t')
test = pd.read_csv('../input/test.tsv', sep='\t')
test['price'] = -1

train_id               int64
name                  object
item_condition_id      int64
category_name         object
brand_name            object
price                float64
shipping               int64
item_description      object
dtype: object

In [13]:
df = pd.concat([train, test]).reset_index()
change_datatype(df)
df = df.fillna('')
df = add_words(df, 'name', 43) 
df = add_words(df, 'item_description', 60)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [14]:
for c in c_categories:
     df[c+'_cat'] = pd.factorize(df[c])[0]

In [15]:
for c in c_texts:
    df[c + '_c_words'] = df[c].apply(count_words)
    df[c + '_c_upper'] = df[c].apply(count_upper)
    df[c + '_c_numbers'] = df[c].apply(count_numbers)
    df[c + '_len'] = df[c].str.len()
    df[c + '_mean_len_words'] = df[c + '_len']/df[c + '_c_words']
    df[c + '_mean_upper'] = df[c + '_len']/df[c + '_c_upper']
    df[c + '_mean_numbers'] = df[c + '_len']/df[c + '_c_numbers']

In [16]:
# numpyのsquare()は、各要素の2乗を返す。
# numpy.tanh(x)	双曲線関数tanh(x)を求めます
#------- begin feature engineering (Leandro dos Santos Coelho)
df['fe001'] = np.square(df["name_mean_len_words"])
df['fe002'] = np.square(df["item_description_mean_len_words"])
df['fe003'] = np.tanh(df["name_mean_len_words"])
df['fe004'] = np.tanh(df["item_description_mean_len_words"])
df['fe005'] = df["name_mean_len_words"]**2.37
df['fe006'] = df["item_description_mean_len_words"]**2.15
#------- end feature engineering (Leandro dos Santos Coelho)

In [17]:
df.dtypes

index                                int32
brand_name                          object
category_name                       object
item_condition_id                     int8
item_description                    object
name                                object
price                              float32
shipping                              int8
test_id                             object
train_id                            object
name0                               uint16
name1                               uint16
name2                               uint16
name3                               uint16
name4                               uint16
name5                               uint16
name6                               uint16
name7                               uint16
name8                               uint16
name9                               uint16
name10                              uint16
name11                              uint16
name12                              uint16
name13     

In [18]:
test = df[df['price'] == -1]
train = df[df['price'] != -1]
del df

In [19]:
train, valid = np.split(train.sample(frac=1), [int(.75*train.shape[0])])

In [20]:
for c in c_means:
    mean = get_mean(train, c, 'price')
    test = test.merge(mean, on=[c], how='left')
    train = train.merge(mean, on=[c], how='left')
    valid = valid.merge(mean, on=[c], how='left')

In [21]:
col = [c for c in train.columns if c not in c_ignors]

In [22]:
dtrain = xgb.DMatrix(train[col], train['price'])
dvalid  = xgb.DMatrix(valid[col],  valid['price'])
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
params = {'min_child_weight': 20, 'eta': 0.015, 'colsample_bytree': 0.48, 'max_depth': 14,
            'subsample': 0.91, 'lambda': 2.01, 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear','tree_method': 'hist'}
model = xgb.train(params, dtrain, 1000, watchlist, verbose_eval=10, early_stopping_rounds=20)
test['price'] = model.predict(xgb.DMatrix(test[col]), ntree_limit=model.best_ntree_limit)
test.loc[test['price'] < 0, 'price'] = 0
test['test_id'] = test['test_id'].astype(int)

[01:17:13] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-rmse:46.2835	valid-rmse:46.1363
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 20 rounds.
[10]	train-rmse:42.9263	valid-rmse:43.0347
[20]	train-rmse:39.9763	valid-rmse:40.39
[30]	train-rmse:37.5772	valid-rmse:38.2813
[40]	train-rmse:35.668	valid-rmse:36.6721
[50]	train-rmse:33.8473	valid-rmse:35.1736
[60]	train-rmse:32.4532	valid-rmse:34.0852
[70]	train-rmse:31.0364	valid-rmse:33.0061
[80]	train-rmse:30.0168	valid-rmse:32.2845
[90]	train-rmse:29.1387	valid-rmse:31.7041
[100]	train-rmse:28.3743	valid-rmse:31.2279
[110]	train-rmse:27.6612	valid-rmse:30.7957
[120]	train-rmse:27.0308	valid-rmse:30.4417
[130]	train-rmse:26.5545	valid-rmse:30.2017
[140]	train-rmse:26.0956	valid-rmse:29.9867
[150]	train-rmse:25.6631	valid-rmse:29.7858
[160]	train-rmse:25.3018	valid-rmse:29.6432
[170]	train-rmse:24.961

FileNotFoundError: [Errno 2] No such file or directory: 'output/20180827_xgboost.csv'

In [23]:
test[['test_id', 'price']].to_csv("../output/20180827_xgboost.csv", index = False)
print("Finished ...")
tt = (time.time() - start_time)/60
print("Total time %s min" % tt)

Finished ...
Total time 557.2454558491706 min
