In [7]:
import sys
# Extend the module search path with a machine-specific checkout directory
# (presumably where the `pyml` package imported below lives -- confirm).
# The commented-out lines are the equivalent locations on other machines;
# enable the one that matches the current host.
# sys.path.append(r'E:/0code')
# sys.path.append(r'/home/wyf/0code')
# sys.path.append(r'/home/wangyf226/0code')
sys.path.append(r'/BIGDATA1/nsccgz_yfdu_1/asc19/wyf/0code')

In [9]:
import pandas as pd
import numpy as np
import datetime
from pyml.ensemble.regression import GradientBoostingRegression
from pyml.feature_extraction.text import CountVectorizer
from pyml.linear_model.regression import LinearRegression
from pyml.neighbors.classification import KNeighborsClassifier
from pyml.metrics.regression import pearson_correlation
from pyml.model_selection import KFold
from pyml.model_selection import ShuffleSplit
from pyml.preprocessing import StandardScaler
from pyml.logger import logger
import logging
import matplotlib.pyplot as plt
import pickle
import seaborn as sns

In [13]:
def save_model(model, name):
    """Pickle ``model`` into the file at path ``name``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(name, mode='wb') as sink:
        pickle.Pickler(sink).dump(model)
def load_model(name):
    """Load and return the object pickled in the file at path ``name``.

    Counterpart of ``save_model``: the file is opened in binary read mode
    and a single pickled object is read from it.

    Raises whatever ``open`` / ``pickle.load`` raise for a missing or
    corrupt file.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files that
    # were produced by a trusted source.
    with open(name,'rb') as f:
        # pickle.load returns the object; the original passed an undefined
        # `model` as a second positional argument and never bound the result.
        model = pickle.load(f)
    return model

In [14]:
# Mirror the log records into a file; mode='a' appends to an existing log.
fl = logging.FileHandler('wyf-GBDT-baseline.log',mode='a')
formatter = logging.Formatter('[%(levelname)8s] - [%(module)10s] - [%(lineno)3d] - [%(funcName)10s] \n%(message)s\n')
# Attach the formatter to the handler. It was previously created but never
# used, so the file received bare messages without level/module/line info.
fl.setFormatter(formatter)
logger.addHandler(fl)

# 读取数据文件

In [10]:
# Load the original training and test spreadsheets.
# Note: `train` and `test` are reassigned by the following cell, which reads
# the *_add_feat_score.xlsx variants instead.
train = pd.read_excel('../data/train.xlsx')
test = pd.read_excel('../data/testStudent.xlsx')

In [15]:
# Load the variants of the data that have the tag-derived feature columns added
# (replaces the `train` / `test` frames loaded from the original spreadsheets).
train = pd.read_excel('../data/train_add_feat_score.xlsx')
test = pd.read_excel('../data/test_add_feat_score.xlsx')

In [16]:
train.dtypes # Check for wrongly typed columns, e.g. an int column read as str, which suggests NaNs or other odd values in the data

Additional_Number_of_Scoring                    int64
Average_Score                                 float64
Review_Total_Negative_Word_Counts               int64
Total_Number_of_Reviews                         int64
Review_Total_Positive_Word_Counts               int64
Total_Number_of_Reviews_Reviewer_Has_Given      int64
Reviewer_Score                                float64
TripType_score                                  int64
traveler_type_score                             int64
order_type_score                                int64
nights_num_score                                int64
with_pet_score                                  int64
room_type_score                                 int64
dtype: object

In [17]:
# Split the training frame into the feature columns and the target column
# 'Reviewer_Score'. The test frame is used as the feature frame as-is.
# The commented-out lines are an earlier variant that also dropped a 'Tags' column.
# train_ori_X = train.drop('Reviewer_Score', axis=1).drop('Tags', axis=1)
# train_ori_Y = train['Reviewer_Score']
# test_ori_X = test.drop('Tags', axis=1)
train_ori_X = train.drop('Reviewer_Score', axis=1)
train_ori_Y = train['Reviewer_Score']
# Plain alias, not a copy: test_ori_X and test refer to the same DataFrame.
test_ori_X = test

# 特征工程

In [19]:
def get_proportion_feature_1(df):
    """
    Return a copy of ``df`` extended with three ratio columns.

    Each of the following columns is divided element-wise by
    'Total_Number_of_Reviews':
      * 'Review_Total_Negative_Word_Counts'
      * 'Review_Total_Positive_Word_Counts'
      * 'Total_Number_of_Reviews_Reviewer_Has_Given'

    The new columns are named '<numerator>_radio_<denominator>' (the
    'radio' spelling is kept because other code may rely on the column
    names). The input frame is not modified and the original columns are
    kept in the result. No extra scaling is applied to the ratios.
    """
    denominators = ['Total_Number_of_Reviews']
    numerators = [
        'Review_Total_Negative_Word_Counts',
        'Review_Total_Positive_Word_Counts',
        'Total_Number_of_Reviews_Reviewer_Has_Given',
    ]

    result = df.copy()
    # Same iteration order as a denominator-outer / numerator-inner nested loop.
    pairs = [(num, den) for den in denominators for num in numerators]
    for num, den in pairs:
        ratio_name = num + '_radio_' + den
        result[ratio_name] = result[num] / result[den]
    return result

# 构造训练集和测试集，并归一化

In [18]:
# Feature scheme 0: no engineered features, use the original columns unchanged
train_X_feat = train_ori_X
test_X_feat = test_ori_X

In [20]:
# Feature scheme 1: add the ratio features while keeping the original columns
train_X_feat = get_proportion_feature_1(train_ori_X)
test_X_feat = get_proportion_feature_1(test_ori_X)

In [10]:
# Inspect the feature columns that go into the model.
# NOTE(review): the saved output of this cell lacks the *_score columns and has
# a lower execution count than its neighbours, so it appears to come from an
# earlier run -- re-run to refresh.
train_X_feat.columns

Index(['Additional_Number_of_Scoring', 'Average_Score',
       'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given',
       'Review_Total_Negative_Word_Counts_radio_Total_Number_of_Reviews',
       'Review_Total_Positive_Word_Counts_radio_Total_Number_of_Reviews',
       'Total_Number_of_Reviews_Reviewer_Has_Given_radio_Total_Number_of_Reviews'],
      dtype='object')

In [21]:
# Print the Pearson correlation between each feature column and the target score
# (iterating a DataFrame yields its column names)
for feat_name in train_X_feat:
    print("{} : {}".format(feat_name, pearson_correlation(train_X_feat[feat_name].values, train_ori_Y.values)))

Additional_Number_of_Scoring : -0.06342829612443443
Average_Score : 0.3622534338191119
Review_Total_Negative_Word_Counts : -0.38232451404246
Total_Number_of_Reviews : -0.0715884558642013
Review_Total_Positive_Word_Counts : 0.22370965918084224
Total_Number_of_Reviews_Reviewer_Has_Given : 0.002496175494613464
TripType_score : 0.11245692676175148
traveler_type_score : 0.08914127698750476
order_type_score : 0.009844120194724183
nights_num_score : 0.027800754327103265
with_pet_score : 0.0008883916547842767
room_type_score : 0.23706243792316262
Review_Total_Negative_Word_Counts_radio_Total_Number_of_Reviews : -0.19866941605811572
Review_Total_Positive_Word_Counts_radio_Total_Number_of_Reviews : 0.1371873627002511
Total_Number_of_Reviews_Reviewer_Has_Given_radio_Total_Number_of_Reviews : 0.02441323402912634


In [11]:
# Scheme 1: standardised features without any extra per-column weights.
# The scaler is fitted on the training features only and then applied to the test features.
ss = StandardScaler()
train_X = ss.fit_transform(train_X_feat.values)
test_X = ss.transform(test_X_feat.values)

In [22]:
# Scheme 2: standardised features with extra weight on selected columns.
# The scaler is fitted on the training features only and then applied to the test features.
ss = StandardScaler()
train_X = ss.fit_transform(train_X_feat.values)
test_X = ss.transform(test_X_feat.values)
# Up-weight columns 1, 2 and 4 (Average_Score and the negative / positive
# review word counts, per the column order shown earlier in the notebook).
# The same factor must be applied to the test matrix: previously only train_X
# was scaled, so a model fitted on train_X saw differently scaled features at
# prediction time.
# Assumes ss.transform returns an array supporting column slicing, as
# fit_transform evidently does -- confirm.
for weighted_col in (1, 2, 4):
    train_X[:, weighted_col] *= 2
    test_X[:, weighted_col] *= 2

In [25]:
# Target values as a plain array (via .values), to be indexed alongside train_X
train_Y = train_ori_Y.values

# 交叉验证

In [26]:
# Let INFO-level records through; the training progress messages below are logged at INFO
logger.setLevel(logging.INFO)

In [27]:
# Cross-validate the gradient-boosting model and log the mean Pearson correlation.
n_splits = 2  # only referenced by the commented-out ShuffleSplit alternative below
k_splits = 5  # fold count handed to KFold; also the divisor for the mean score
# cv = ShuffleSplit(n_splits=n_splits)
cv = KFold(k_splits=k_splits)
score = 0  # running sum of the per-fold Pearson correlations
models= []  # fitted model of every fold, kept for later inspection
for train_indices, test_indices in cv.split(train_X):
    # A fresh model is created for every fold
    lr = GradientBoostingRegression(loss='lad', learning_rate=0.05, n_estimators=100, max_tree_node_size=50)
#     lr.fit(train_X[train_indices], train_Y[train_indices], watch=True)
    # The held-out fold is passed in as validation data; judging by the log
    # output, a "test" value is reported after every training round
    lr.fit_and_valid(train_X[train_indices], train_Y[train_indices],train_X[test_indices],train_Y[test_indices], mini_batch=4000 , watch=True)
    y_pred = lr.predict(train_X[test_indices])
    # Fold score: Pearson correlation between predictions and held-out targets
    this_score = pearson_correlation(y_pred, train_Y[test_indices])
    score += this_score
    logger.info(this_score)
    models.append(lr)
# Mean over folds. NOTE(review): divides by k_splits, which assumes cv.split
# yields exactly k_splits folds; adjust if the ShuffleSplit variant is enabled.
logger.info('score : {}'.format(score/k_splits))

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 0/100  current cost: 3526.7640344571673, test: 0.6177581573313643

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 1/100  current cost: 3529.079980441834, test: 0.6181205912272337

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 2/100  current cost: 3474.948449841613, test: 0.6184409644980834

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 3/100  current cost: 3459.94501751991, test: 0.6187714013445581

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 4/100  current cost: 3553.7545854305044, test: 0.6189211105870754

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 5/100  current cost: 3537.1097075786192, test: 0.6189709795374297

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 6/100  current cost: 3500.8375095537335, test: 0.6190174836524522

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 7/100  current cost: 3384.3396240500383, test: 0.61913992612119

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 65/100  current cost: 3346.9129083875296, test: 0.6190325397364117

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 66/100  current cost: 3459.4097940675865, test: 0.6190346887509505

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 67/100  current cost: 3434.676196380092, test: 0.6188201723628657

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 68/100  current cost: 3384.4400139752192, test: 0.6186353686799532

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 69/100  current cost: 3431.3907772363223, test: 0.6186085163937436

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 70/100  current cost: 3331.475242398904, test: 0.6186079376566268

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 71/100  current cost: 3389.469444469887, test: 0.6183360651642449

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 72/100  current cost: 3393.251511724601, test: 0.618280

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 30/100  current cost: 3454.074455392206, test: 0.6074022848595857

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 31/100  current cost: 3435.2162927636455, test: 0.6075311245374118

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 32/100  current cost: 3483.272311327194, test: 0.6073813870309432

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 33/100  current cost: 3456.390000596518, test: 0.6074210319030604

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 34/100  current cost: 3401.9656666615456, test: 0.6070522928052327

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 35/100  current cost: 3317.1635343106445, test: 0.6069652018678727

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 36/100  current cost: 3526.447465376539, test: 0.606890710516214

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 37/100  current cost: 3455.798090409961, test: 0.60677442

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 95/100  current cost: 3327.632887587368, test: 0.6035051807842917

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 96/100  current cost: 3452.7716363697264, test: 0.6033961099946173

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 97/100  current cost: 3390.296056247634, test: 0.6031413371683534

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 98/100  current cost: 3282.017540364871, test: 0.6031272049761482

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 99/100  current cost: 3418.094890403955, test: 0.6031186281374498

[    INFO] - [<ipython-input-27-92041653e07b>] - [ 14] - [  <module>] 
0.6031186281374498

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 0/100  current cost: 3522.3467901403355, test: 0.6123621961608007

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 1/100  current cost: 3462.3138270659033, test: 0.612541440139352

[    INFO] - [regression] -

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 60/100  current cost: 3419.534374030032, test: 0.613744336835131

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 61/100  current cost: 3412.230759872684, test: 0.613850864760794

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 62/100  current cost: 3321.647183791022, test: 0.6138041397533273

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 63/100  current cost: 3352.1616501237722, test: 0.6139024569423789

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 64/100  current cost: 3463.2389096839333, test: 0.613891375747697

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 65/100  current cost: 3426.3815519062373, test: 0.6139071927821569

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 66/100  current cost: 3343.878825408413, test: 0.6139805211969984

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 67/100  current cost: 3391.6962234725165, test: 0.613971240

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 25/100  current cost: 3450.73926726423, test: 0.6130413019656813

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 26/100  current cost: 3445.4158430325106, test: 0.6131030433633436

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 27/100  current cost: 3412.793866564749, test: 0.6131290981576009

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 28/100  current cost: 3397.444534925263, test: 0.6131730466123603

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 29/100  current cost: 3457.221065649299, test: 0.6129178083502107

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 30/100  current cost: 3482.5565628810364, test: 0.6129423715930405

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 31/100  current cost: 3379.923150057079, test: 0.6129111567019844

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 32/100  current cost: 3392.905927392848, test: 0.612873399

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 90/100  current cost: 3273.861470478698, test: 0.6102895136575764

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 91/100  current cost: 3210.5146013451213, test: 0.6101455759226392

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 92/100  current cost: 3400.1638674841597, test: 0.610140877150073

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 93/100  current cost: 3302.5596581314685, test: 0.6101415686765213

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 94/100  current cost: 3393.4378665119775, test: 0.610093191693735

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 95/100  current cost: 3385.5474573374586, test: 0.6101087111696785

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 96/100  current cost: 3365.2068070957553, test: 0.6101282938512953

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 97/100  current cost: 3330.979432318164, test: 0.610100

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 55/100  current cost: 3472.208967745913, test: 0.6163105218524263

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 56/100  current cost: 3469.748383006411, test: 0.6160511486468113

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 57/100  current cost: 3497.194486636836, test: 0.6160401913774819

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 58/100  current cost: 3456.559058096812, test: 0.6158649258316584

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 59/100  current cost: 3424.037796812493, test: 0.6157979363151928

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 60/100  current cost: 3450.98153467852, test: 0.6156260027219522

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 61/100  current cost: 3437.4720686743512, test: 0.6156581610320674

[    INFO] - [regression] - [132] - [fit_and_valid] 
train 62/100  current cost: 3354.235272970436, test: 0.6155859057

In [None]:
# Plot the 'test_loss' series recorded by each cross-validation model.
# Each curve gets its own label and the legend is drawn once after the loop;
# previously every curve was labelled 'test' and the legend was rebuilt on
# every iteration, so the folds could not be told apart.
for fold_idx, fold_model in enumerate(models):
    test_loss = fold_model.information['test_loss']
    plt.plot(range(len(test_loss)), test_loss, label='fold {}'.format(fold_idx))
# presumably one entry per training round -- confirm against the model code
plt.xlabel('iteration')
plt.ylabel('test_loss')
if models:
    plt.legend()

# 训练模型写入结果

In [28]:
# Final model: fit on the complete training set (no held-out fold).
# NOTE(review): these hyperparameters (learning_rate=0.2, n_estimators=50,
# max_tree_node_size=500, default loss) differ from the ones used in the
# cross-validation cell (loss='lad', learning_rate=0.05, n_estimators=100,
# max_tree_node_size=50), so the CV score does not describe this model -- confirm.
lr = GradientBoostingRegression(learning_rate=0.2, n_estimators=50, max_tree_node_size=500)
lr.fit(train_X, train_Y, watch=True)

[    INFO] - [regression] - [110] - [       fit] 
train 0/50  current cost: 62987.92350962772

[    INFO] - [regression] - [110] - [       fit] 
train 1/50  current cost: 62612.02609113783

[    INFO] - [regression] - [110] - [       fit] 
train 2/50  current cost: 62351.3632071482

[    INFO] - [regression] - [110] - [       fit] 
train 3/50  current cost: 62142.89172507621

[    INFO] - [regression] - [110] - [       fit] 
train 4/50  current cost: 61988.55013363538

[    INFO] - [regression] - [110] - [       fit] 
train 5/50  current cost: 61844.18815845453

[    INFO] - [regression] - [110] - [       fit] 
train 6/50  current cost: 61770.79426437271

[    INFO] - [regression] - [110] - [       fit] 
train 7/50  current cost: 61701.086662226495

[    INFO] - [regression] - [110] - [       fit] 
train 8/50  current cost: 61623.12757961742

[    INFO] - [regression] - [110] - [       fit] 
train 9/50  current cost: 61564.74543432237

[    INFO] - [regression] - [110] - [       fit] 


In [None]:
# Predictions of the final model: on the test features (written to CSV below)
# and on the training features (used for the distribution plots below).
# Note: y_pred reuses the name assigned inside the cross-validation loop.
y_pred = lr.predict(test_X)
y_train_pred = lr.predict(train_X)

In [None]:
# Distribution of the true training targets
sns.distplot(train_Y)

In [None]:
# Distribution of the model's predictions on the training set
sns.distplot(y_train_pred)

In [None]:
# Distribution of the model's predictions on the test set
sns.distplot(y_pred)

In [None]:
# Write the test predictions to a CSV under ./results/, with a file name
# stamped to the current minute. index=0 and header=None are falsy values,
# presumably suppressing the index column and the header row -- confirm.
# NOTE(review): the 'GBDT-0.05-820-' prefix is hard-coded, while the model
# fitted above uses learning_rate=0.2 and n_estimators=50 -- make sure the file
# name matches the run. The ./results/ directory must already exist.
pd.DataFrame(y_pred).to_csv('./results/'+'GBDT-0.05-820-'+ str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + ".csv", index=0, header=None, index_label=None)

In [None]:
# ## Some notes
# (These lines were bare prose in a code cell and raised a SyntaxError when the
# cell was run; they are now comments like the rest of the experiment log.)
# With 5 trees trained, the validation score is roughly 0.625-0.640
# With 10 trees trained, the validation score is roughly 0.633-0.642

# ## 2018.10.17 second ranking submission
# 4:GBDT-0.2-50-2018-10-17-13-34
#     1. GBDT
#     2. Features: none
#     3. Hyperparameters:
#         1. learning_rate=0.2
#         2. n_estimators=50
#         3. max_tree_node_size=500
#     4. Validation set: 0.63
#     5. Test set: 53.9545
#
#

# ## 2018.10.19
# 0:GBDT-0.05-820-2018-10-19-13-54.csv
#     1. GBDT
#     2. Features: none
#     3. Hyperparameters:
#         1. learning_rate=0.05
#         2. n_estimators = 820
#         3. max_tree_node = 500
#         4. mini_batch=4000
#         5. loss = 'huber'
#     4. Validation set: about 0.645
#     5. Test set: TODO