In [75]:
import sys
import pickle
import datetime, time
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

pd.set_option("display.max_columns", 100)

# 回帰用

# time count
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
print(time.ctime(time.time()))

logger = getLogger(__name__)

# path
input_path = "../input/tabelog_train_saitama.csv"
pred_path  = "../input/tabelog_pred_minato.csv"
origin_path  = "../input/tabelog_store_data_minato.csv"
log_path   = "../log/train.py.log"
output_path   = "../output/"
model_path = "../output/model/"
model_f    = "20180128_224103tb_xgb.pkl"

# hyper parameter tune
all_params = {
    'max_depth':[3,5,7,9],
    'learning_rate':[0.1],
    'min_child_weight':[3,5,10],
    'n_estimators':[1, 10, 100, 1000, 10000],
    'colsample_bytree':[0.8, 0.9],
    'colsample_bylevel':[0.8, 0.9],
    'reg_alpha':[0,0.1],
    'max_delta_step':[0.1],
    'seed':[0]
}


def dataset_split(df):
    N = df.shape[0]
    N_train = int(df.shape[0] * 0.8)
    print('N:{}, N_train:{}, N_test:{}'.format(N, N_train, N-N_train))

    df_sample = df.sample(n=N, random_state=0)
    train = df_sample[:N_train]
    test  = df_sample[N_train:]
    
    print(train.head())
    print(test.head())

    return train, test
    

def train_phase():
    
    # target score = mse
    min_score = 1000
    min_params = None
#     kfold = KFold(n_splits=5, shuffle=True, random_state=0)

#     # パラメータ総当たり
#     for params in tqdm(list(ParameterGrid(all_params))):
#         logger.info('params: {}'.format(params))

#         reg = xgb.XGBRegressor(**params)

#         list_mse = cross_val_score(reg, x_train, y_train, scoring='neg_mean_squared_error', cv=kfold, n_jobs=-1)

#         mse = -np.mean(list_mse)

#         logger.info('cv mse: {}'.format(mse))
#         logger.debug('cv mse: {}'.format(mse))

#         if min_score > mse:
#             min_score = mse
#             min_params = params
#         logger.info('current min mse: {}, params: {}'.format(min_score, min_params))
#         break # 動作確認用

    reg = GridSearchCV(
        xgb.XGBRegressor(),
        all_params,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    reg.fit(x_train, y_train)

    min_params = reg.best_params_

    logger.info('min params: {}'.format(min_params))
#     logger.info('min mse: {}'.format(min_score))
    logger.debug('best params: {}'.format(min_params))

    reg = xgb.XGBRegressor(**min_params)
    reg.fit(x_train, y_train)

    pickle.dump(reg, open(model_path + start_time + "tb_xgb.pkl", "wb"))
    reg = pickle.load(open(model_path + start_time + "tb_xgb.pkl", "rb"))
    
    logger.info('train end')
    logger.debug('train end')
    
    return reg


def test_phase(model):
    
    x_test = df_test.drop('rate', axis=1)
    y_test = df_test['rate'].values

    logger.info('test data load end {}'.format(x_test.shape))

    pred_test = model.predict(x_test)

    mse = mean_squared_error(y_test, pred_test)

    logger.info('test mse : {}'.format(mse))
    logger.debug('test mse : {}'.format(mse))

    importances = pd.Series(model.booster().get_score(importance_type='weight'), index = use_cols)
    importances = importances.sort_values(ascending=False)
    logger.info("imporance in the xgboost Model")
    logger.debug("imporance in the xgboost Model")
    logger.info('{}'.format(importances))
    logger.debug('{}'.format(importances))

    logger.info('test end')
    

# 予測と実データの比較
def prediction(model):
    
    pred_data = pd.read_csv(pred_path)
    pred_data = pred_data[pred_data.rate!=0.0]
    x_pred    = pred_data[use_cols]
    y_pred    = pred_data['rate'].values
    
    logger.info('pred data load end {}'.format(pred_data.shape))

    result_pred = model.predict(x_pred)

    mse = mean_squared_error(y_pred, result_pred)

    logger.info('result mse : {}'.format(mse))
    logger.debug('result mse : {}'.format(mse))
    
    origin = pd.read_csv(origin_path)
    origin = origin[origin.seat!='seat']
    rate = origin['rate'].astype('float').values
    origin = origin[origin.rate!=0.0]
    mu = rate.mean()
    se = rate.std()
    origin_rate = (y_pred * se) + mu
    pred_rate = (result_pred * se) + mu

    x_pred['origin']  = origin_rate
    x_pred['predict'] = pred_rate
    x_pred['diff']    = pred_rate - origin_rate

    x_pred.to_csv(output_path + start_time + "tabelog_predict_result.csv")

    
def main():
    
    model = train_phase()
    
#     model = pickle.load(open(model_path + model_f, 'rb'))

    test_phase(model)

#     prediction(model)
    

if __name__ == '__main__':

    # get log
    log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s]\
    [%(funcName)s] %(message)s ')
    handler = StreamHandler()
    handler.setLevel('INFO')
    handler.setFormatter(log_fmt)
    logger.addHandler(handler)

    handler = FileHandler(log_path, 'a')
    handler.setLevel(DEBUG)
    handler.setFormatter(log_fmt)
    logger.setLevel(DEBUG)
    logger.addHandler(handler)

    logger.info('start')

    data = pd.read_csv(input_path)
    df_train, df_test = dataset_split(data)

    x_train = df_train.drop('rate', axis=1)
    y_train = df_train['rate'].values
    use_cols = x_train.columns.values

    logger.debug('train columns: {} {}'.format(use_cols.shape, use_cols))
    logger.info('data preparation end {}'.format(x_train.shape))

    main()

2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 __main__ 199 [INFO]    [<module>] start 
2018-01-29 07:52:03,221 _

Mon Jan 29 07:52:03 2018
N:6002, N_train:4801, N_test:1201
      couple_flg  coupon_flg  dinner_budget  dinner_flg  hideout_flg  \
4322         0.0         0.0            NaN         0.0          0.0   
4780         0.0         0.0      -0.245750         1.0          0.0   
4846         0.0         0.0            NaN         0.0          0.0   
472          0.0         0.0            NaN         0.0          0.0   
1370         0.0         0.0      -0.529713         1.0          0.0   

      kodawari_flg  lunch_budget  lunch_flg  net_reserve_flg  night_view_flg  \
4322           0.0           NaN        0.0              0.0             0.0   
4780           1.0     -0.343912        1.0              1.0             1.0   
4846           1.0           NaN        0.0              0.0             0.0   
472            0.0           NaN        0.0              0.0             0.0   
1370           1.0           NaN        0.0              0.0             0.0   

      nomiho_flg  open_date

DEBUG:__main__:train columns: (39,) ['couple_flg' 'coupon_flg' 'dinner_budget' 'dinner_flg' 'hideout_flg'
 'kodawari_flg' 'lunch_budget' 'lunch_flg' 'net_reserve_flg'
 'night_view_flg' 'nomiho_flg' 'open_date' 'osya_flg' 'private_flg'
 'relax_flg' 'review' 'sake_flg' 'seat' 'sommelier_flg' 'tabeho_flg'
 'toll_flg' 'vegetable_flg' 'wine_flg' 'distance_station' 'genre_cnt' 'bal'
 'china' 'cook' 'countries' 'dinning' 'ita_fre' 'izakaya' 'japan' 'light'
 'meat' 'nabe' 'other' 'ramen' 'sushi']
2018-01-29 07:52:03,426 __main__ 209 [INFO]    [<module>] data preparation end (4801, 39) 
2018-01-29 07:52:03,426 __main__ 209 [INFO]    [<module>] data preparation end (4801, 39) 
2018-01-29 07:52:03,426 __main__ 209 [INFO]    [<module>] data preparation end (4801, 39) 
2018-01-29 07:52:03,426 __main__ 209 [INFO]    [<module>] data preparation end (4801, 39) 
2018-01-29 07:52:03,426 __main__ 209 [INFO]    [<module>] data preparation end (4801, 39) 
2018-01-29 07:52:03,426 __main__ 209 [INFO]    [<mo

KeyboardInterrupt: 

Process ForkPoolWorker-2053:
Process ForkPoolWorker-2054:
Process ForkPoolWorker-2056:
Process ForkPoolWorker-2055:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/mnt/c/Users/gixo/go/anaconda3/envs/stan_env/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/mnt/c/Users/gixo/go/anaconda3/envs/stan_env/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/mnt/c/Users/gixo/go/anaconda3/envs/stan_env/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/mnt/c/Users/gixo/go/anaconda3/envs/stan_env/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/mnt/c/Users/gixo/go/anaconda3/envs/stan_env/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/mnt/c/Users/gixo/go/anaconda3/envs/st

In [165]:
from matplotlib import pyplot as plt
%matplotlib inline

col_result  = ['origin', 'predict', 'diff']
col_vis = ['name', 'genre', 'rate', 'diff', 'review', 'seat', 'lunch_budget', 'dinner_budget', 'relax_flg', 'private_flg', 'night_view_flg', 'couple_flg', 'osya_flg', 'pr_comment' ]
test_result = "../output/20180129_083356tabelog_test_result.csv"
pred_result = "../output/20180129_083356tabelog_predict_result.csv"
origin_roppongi = "../input/tabelog_origin_roppongi.csv"

test = pd.read_csv(test_result)
pred = pd.read_csv(pred_result)
roppongi = pd.read_csv(origin_roppongi)

df = roppongi.merge(pred[['name', 'diff']], on='name', how='inner')

# df['diff'] = df['diff'].map(lambda x:np.power(x, 2))
df.sort_values(by='diff', ascending=False, inplace=True)

display(df[col_vis].head(20))
sys.exit()

N_test = len(test.origin)
N_pred = len(pred.origin)

# rss_test = np.power(test['diff'].values, 2).sum()
# mu_test = test['origin'].values.mean()
# val_test = len(test.drop(['name', 'origin', 'diff', 'predict'], axis=1).columns)
# r2_test = 1 - (rss_test/(N_test-1-val_test)) / (np.power((test.origin - mu_test), 2).sum()/(N_test-1))

rss_pred = np.power(pred['diff'].values, 2).sum()
mu_pred = pred['origin'].values.mean()
val_pred = len(pred.drop(['name', 'origin', 'diff', 'predict'], axis=1).columns)
r2_pred = 1 - (rss_pred/(N_pred-1-val_pred)) / (np.power((pred.origin - mu_pred), 2).sum()/(N_pred-1))

# print(r2_test)
print(r2_pred)
# print(test.diff)

# mse = np.power(test['diff'].values, 2).mean()
# mae = np.absolute(test['diff'].values).mean()
mse = np.power(pred['diff'].values, 2).mean()
mae = np.absolute(pred['diff'].values).mean()
print(mse)
print(mae)
# sys.exit()

test.sort_values(by='origin', inplace=True)
pred.sort_values(by='origin', inplace=True)


x_test = np.linspace(1, N_test, N_test)
x_pred = np.linspace(1, N_pred, N_pred)

fig = plt.figure(figsize=(12, 8))

# plt.plot(x_test, test.origin, label='origin')
# plt.plot(x_test, test.predict, label='prediction')

plt.plot(x_pred, pred.origin,  label='origin')
plt.plot(x_pred, pred.predict, label='prediction')

plt.xlabel('store index')
plt.ylabel('tabelog score')
plt.legend()


Unnamed: 0,name,genre,rate,diff,review,seat,lunch_budget,dinner_budget,relax_flg,private_flg,night_view_flg,couple_flg,osya_flg,pr_comment
374,レストラン リューズ,フレンチ,4.19,0.58574,235.0,５,6000.0,20000.0,1.0,0.0,0.0,0.0,1.0,新潟産網獲り真鴨 松葉ガニ 黒トリュフ ブルターニュ産オマール海老
442,つるとんたん 六本木店,しゃぶしゃぶ、うどん、居酒屋,3.57,0.561974,868.0,200,1000.0,6000.0,1.0,0.0,0.0,0.0,1.0,忘年会・新年会の個室予約をネットで承っております。
560,天鳳,ラーメン,3.58,0.550038,238.0,6,,,0.0,0.0,0.0,0.0,0.0,
447,総本家更科堀井 本店,そば、懐石・会席料理、天ぷら・揚げ物（その他）,3.6,0.521167,419.0,70,1000.0,3000.0,1.0,0.0,0.0,0.0,0.0,伊勢丹立川店でも、伝統の味をお楽しみ頂けます。
690,アマンド 六本木店,ケーキ、洋食,3.08,0.49478,95.0,,,1000.0,0.0,0.0,0.0,0.0,0.0,
433,十番右京,居酒屋、創作料理、和食（その他）,3.59,0.471801,184.0,１２,3000.0,6000.0,1.0,0.0,0.0,0.0,1.0,名物トリュフたまごかけご飯®️！朝4時まで営業！100種の食事、200種のお酒！
618,マルズ・バー,ダイニングバー、ワインバー,3.15,0.464053,21.0,4,8000.0,15000.0,0.0,0.0,0.0,0.0,0.0,
418,ウルフギャング・ステーキハウス 六本木,ステーキ、ダイニングバー、ワインバー,3.64,0.458187,384.0,178,2000.0,10000.0,1.0,0.0,0.0,0.0,1.0,大人気の美食家を唸らすドライエイジング・ステーキ
475,松ちゃん,居酒屋、寿司、韓国料理,3.15,0.436824,131.0,,,2000.0,0.0,0.0,0.0,0.0,0.0,
420,龍吟,懐石・会席料理、かに、ふぐ,4.26,0.434546,244.0,20,30000.0,30000.0,1.0,1.0,0.0,0.0,1.0,「春」は山菜、貝尽くし 「夏」は鮎、鱧、天然大鰻 「秋」は松茸、松葉蟹 「冬」


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
