In [1]:
import os
from datetime import datetime
import time

import numpy as np 
import pandas as pd 
import json
from pandas.io.json import json_normalize
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import xgboost as xgb

pd.set_option('display.max_columns', 100)

In [4]:
def load_df(csv_path='input/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path,
                              converters={column: json.loads for column in JSON_COLUMNS}, 
                              dtype={'fullVisitorId': 'str'}, # Important!!
                              nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = ["{0}_{1}".format(column, subcolumn) for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print("Loaded {0}. Shape: {1}".format(os.path.basename(csv_path), df.shape))
    return df

In [3]:
def process_date_time(data_df):
    print("process date time ...")
    data_df['date'] = data_df['date'].astype(str)
    data_df["date"] = data_df["date"].apply(lambda x : x[:4] + "-" + x[4:6] + "-" + x[6:])
    data_df["date"] = pd.to_datetime(data_df["date"])   
    data_df["year"] = data_df['date'].dt.year
    data_df["month"] = data_df['date'].dt.month
    data_df["day"] = data_df['date'].dt.day
    data_df["weekday"] = data_df['date'].dt.weekday
    data_df['weekofyear'] = data_df['date'].dt.weekofyear
    data_df['month_unique_user_count'] = data_df.groupby('month')['fullVisitorId'].transform('nunique')
    data_df['day_unique_user_count'] = data_df.groupby('day')['fullVisitorId'].transform('nunique')
    data_df['weekday_unique_user_count'] = data_df.groupby('weekday')['fullVisitorId'].transform('nunique')
    return data_df

In [5]:
def process_format(data_df):
    print("process format ...")
    for col in ['visitNumber', 'totals_hits', 'totals_pageviews']:
        data_df[col] = data_df[col].astype(float)
    data_df['trafficSource_adwordsClickInfo.isVideoAd'].fillna(True, inplace=True)
    data_df['trafficSource_isTrueDirect'].fillna(False, inplace=True)
    return data_df

In [6]:
def process_device(data_df):
    print("process device ...")
    data_df['browser_category'] = data_df['device_browser'] + '_' + data_df['device_deviceCategory']
    data_df['browser_operatingSystem'] = data_df['device_browser'] + '_' + data_df['device_operatingSystem']
    data_df['source_country'] = data_df['trafficSource_source'] + '_' + data_df['geoNetwork_country']
    return data_df

In [7]:
def process_totals(data_df):
    print("process totals ...")
    data_df['visitNumber'] = np.log1p(data_df['visitNumber'])
    data_df['totals_hits'] = np.log1p(data_df['totals_hits'])
    data_df['totals_pageviews'] = np.log1p(data_df['totals_pageviews'].fillna(0))
    data_df['mean_hits_per_day'] = data_df.groupby(['day'])['totals_hits'].transform('mean')
    data_df['sum_hits_per_day'] = data_df.groupby(['day'])['totals_hits'].transform('sum')
    data_df['max_hits_per_day'] = data_df.groupby(['day'])['totals_hits'].transform('max')
    data_df['min_hits_per_day'] = data_df.groupby(['day'])['totals_hits'].transform('min')
    data_df['var_hits_per_day'] = data_df.groupby(['day'])['totals_hits'].transform('var')
    return data_df

In [8]:
def process_geo_network(data_df):
    print("process geo network ...")
    data_df['sum_pageviews_per_network_domain'] = data_df.groupby('geoNetwork_networkDomain')['totals_pageviews'].transform('sum')
    data_df['count_pageviews_per_network_domain'] = data_df.groupby('geoNetwork_networkDomain')['totals_pageviews'].transform('count')
    data_df['mean_pageviews_per_network_domain'] = data_df.groupby('geoNetwork_networkDomain')['totals_pageviews'].transform('mean')
    data_df['sum_hits_per_network_domain'] = data_df.groupby('geoNetwork_networkDomain')['totals_hits'].transform('sum')
    data_df['count_hits_per_network_domain'] = data_df.groupby('geoNetwork_networkDomain')['totals_hits'].transform('count')
    data_df['mean_hits_per_network_domain'] = data_df.groupby('geoNetwork_networkDomain')['totals_hits'].transform('mean')
    return data_df

In [9]:
# データ読み込み
train_data_path = 'input/train.csv'
test_data_path = 'input/test.csv'
train_df = load_df(train_data_path)
test_df = load_df(test_data_path)

Loaded train.csv. Shape: (903653, 55)
Loaded test.csv. Shape: (804684, 53)


In [10]:
train_df = process_date_time(train_df)
test_df = process_date_time(test_df)

process date time ...
process date time ...


In [11]:
cols_to_drop = [col for col in train_df.columns if train_df[col].nunique(dropna=False) == 1]
train_df.drop(cols_to_drop, axis=1, inplace=True)
test_df.drop([col for col in cols_to_drop if col in test_df.columns], axis=1, inplace=True)
print(cols_to_drop)

['socialEngagementType', 'device_browserSize', 'device_browserVersion', 'device_flashVersion', 'device_language', 'device_mobileDeviceBranding', 'device_mobileDeviceInfo', 'device_mobileDeviceMarketingName', 'device_mobileDeviceModel', 'device_mobileInputSelector', 'device_operatingSystemVersion', 'device_screenColors', 'device_screenResolution', 'geoNetwork_cityId', 'geoNetwork_latitude', 'geoNetwork_longitude', 'geoNetwork_networkLocation', 'totals_visits', 'trafficSource_adwordsClickInfo.criteriaParameters']


In [12]:
train_df.drop(['trafficSource_campaignCode'], axis=1, inplace=True)

In [13]:
train_df['totals_transactionRevenue'] = train_df['totals_transactionRevenue'].astype(float)
train_df['totals_transactionRevenue'] = train_df['totals_transactionRevenue'].fillna(0)
train_df['totals_transactionRevenue'] = np.log1p(train_df['totals_transactionRevenue'])

In [14]:
train_df = process_format(train_df)
train_df = process_device(train_df)
train_df = process_totals(train_df)
train_df = process_geo_network(train_df)

test_df = process_format(test_df)
test_df = process_device(test_df)
test_df = process_totals(test_df)
test_df = process_geo_network(test_df)

process format ...
process device ...
process totals ...
process geo network ...
process format ...
process device ...
process totals ...
process geo network ...


In [15]:
from sklearn.preprocessing import LabelEncoder
print("process categorical columns ...")
num_cols = ['month_unique_user_count', 'day_unique_user_count', 'weekday_unique_user_count',
            'visitNumber', 'totals_hits', 'totals_pageviews', 
            'mean_hits_per_day', 'sum_hits_per_day', 'min_hits_per_day', 'max_hits_per_day', 'var_hits_per_day',
            'sum_pageviews_per_network_domain', 'count_pageviews_per_network_domain', 'mean_pageviews_per_network_domain',
            'sum_hits_per_network_domain', 'count_hits_per_network_domain', 'mean_hits_per_network_domain']
            
not_used_cols = ["visitNumber", "date", "fullVisitorId", "sessionId", 
        "visitId", "visitStartTime", 'totals_transactionRevenue', 'trafficSource_referralPath']
cat_cols = [col for col in train_df.columns if col not in num_cols and col not in not_used_cols]
for col in cat_cols:
    print(col)
    lbl = LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))

process categorical columns ...
channelGrouping
device_browser
device_deviceCategory
device_isMobile
device_operatingSystem
geoNetwork_city
geoNetwork_continent
geoNetwork_country
geoNetwork_metro
geoNetwork_networkDomain
geoNetwork_region
geoNetwork_subContinent
totals_bounces
totals_newVisits
trafficSource_adContent
trafficSource_adwordsClickInfo.adNetworkType
trafficSource_adwordsClickInfo.gclId
trafficSource_adwordsClickInfo.isVideoAd
trafficSource_adwordsClickInfo.page
trafficSource_adwordsClickInfo.slot
trafficSource_campaign
trafficSource_isTrueDirect
trafficSource_keyword
trafficSource_medium
trafficSource_source
year
month
day
weekday
weekofyear
browser_category
browser_operatingSystem
source_country


In [16]:
print("prepare model ...")
train_df = train_df.sort_values('date')
X = train_df.drop(not_used_cols, axis=1)
y = train_df['totals_transactionRevenue']
X_test = test_df.drop([col for col in not_used_cols if col in test_df.columns], axis=1)

prepare model ...


In [17]:
print("Number of unique visitors in train set : ",train_df.fullVisitorId.nunique(), " out of rows : ",train_df.shape[0])
print("Number of unique visitors in train set : ",test_df.fullVisitorId.nunique(), " out of rows : ",test_df.shape[0])
print("Number of common visitors in train and test set : ",len(set(train_df.fullVisitorId.unique()).intersection(set(test_df.fullVisitorId.unique())) ))

Number of unique visitors in train set :  714167  out of rows :  903653
Number of unique visitors in train set :  617242  out of rows :  804684
Number of common visitors in train and test set :  7679


In [84]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=1)

print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(768105, 49)
(768105,)
(135548, 49)
(135548,)


In [82]:
params = {'objective': 'reg:linear',
          'eval_metric': 'rmse',
          'eta': 0.001,
          'max_depth': 10,
          'subsample': 0.6,
          'colsample_bytree': 0.8,
          'alpha':0.001,
           "num_leaves" : 12**2,
           'random_state': 42,
           'silent': True,
          }

In [83]:
folds = KFold(n_splits=10, random_state=42)

In [None]:
for fold_n1, (train_index, test_index) in enumerate(folds.split(X)):
    print('Fold:', fold_n1)
    xgb_train_data = xgb.DMatrix(X_train, y_train)
    xgb_val_data = xgb.DMatrix(X_val, y_val)
    xgb_submit_data = xgb.DMatrix(X_test)

    model = xgb.train(params, xgb_train_data, 
                      # Note: I disabled XGB to make the notebook run faster
                      # Set to 2000 to obtain the results reported in Conclusion
                      num_boost_round=2000, 
                      evals= [(xgb_train_data, 'train'), (xgb_val_data, 'valid')],
                      early_stopping_rounds=70, 
                      verbose_eval=500
                     )

Fold: 0
[0]	train-rmse:2.02795	valid-rmse:1.98447
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 70 rounds.


In [None]:
y_pred = model.predict(xgb_submit_data, ntree_limit=model.best_ntree_limit)
prediction += y_pred
prediction /= 10

In [23]:
y_conversion = y.apply(lambda x: 1 if x > 0 else 0)

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y_conversion, test_size=0.15, random_state=1)

In [68]:
"""
param_cv = {
    'objective': 'binary:logistic',
    'base_score': y_conversion.mean(),
    #'n_estimators':150,
    'max_depth': 2,
    #'colsample_bytree': 1.0,
    'learning_rate': 0.3,
    #'eval_metric': 'rmse',
    'subsample': 1.0,
    'lambda': 1, # L2正則化 default:1
    'alpha': 0, # L1正則化 default:0
}
"""
param_cv = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'n_estimators':150,
    'base_score': y_conversion.mean(),
    'scale_pos_weight': 1/ y_conversion.mean(),
    'max_depth': 5,
    'colsample_bytree': 1.0,
    'learning_rate': 0.3,
    'subsample': 1.0,
    'reg_lambda': 100, # L2正則化 default:1
    'reg_alpha': 0.5, # L1正則化 default:0
}

model = xgb.XGBClassifier(**param_cv)

In [69]:
model

XGBClassifier(base_score=0.0127427231470487, booster='gbtree',
       colsample_bylevel=1, colsample_bytree=1.0, eval_metric='auc',
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=150, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0.5, reg_lambda=100, scale_pos_weight=78.47616152844117,
       seed=None, silent=True, subsample=1.0)

In [70]:
model.fit(X_train, y_train)
print('train score')
print(model.score(X=X_train, y=y_train))
print('val score')
print(model.score(X=X_val, y=y_val))

train score


  if diff:


0.9445648706882522
val score
0.9446247823649186


  if diff:


In [30]:
y_conversion.mean()

0.0127427231470487

In [71]:
train_pred = model.predict(X_train)

  if diff:


In [72]:
train_pred.mean()

0.06755066039148294

In [36]:
X_train[y_conversion == 1]

  if __name__ == '__main__':


Unnamed: 0,channelGrouping,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_source,year,month,day,weekday,weekofyear,month_unique_user_count,day_unique_user_count,weekday_unique_user_count,browser_category,browser_operatingSystem,source_country,mean_hits_per_day,sum_hits_per_day,max_hits_per_day,min_hits_per_day,var_hits_per_day,sum_pageviews_per_network_domain,count_pageviews_per_network_domain,mean_pageviews_per_network_domain,sum_hits_per_network_domain,count_hits_per_network_domain,mean_hits_per_network_domain
273008,3,35,0,0,3,579,2,218,75,28692,280,12,1,3.970292,1,3.663562,62,3,59008,1,11,3,4,1,3327,4,171,1,0,3,3,11,53041,26847,126091,42,39,1209,1.306278,38494.701584,6.156979,0.693147,0.654788,81.286022,53,1.533699,87.090424,53,1.643216
472526,4,35,0,0,7,94,2,218,100,7652,73,12,1,3.091042,1,3.091042,62,3,59008,1,11,3,4,0,11,5,208,0,3,6,3,45,63839,27997,126091,42,42,2012,1.272690,38974.858677,5.680173,0.693147,0.621029,44505.836512,28743,1.548406,46757.778392,28743,1.626754
451077,2,35,0,0,7,579,2,218,75,0,280,12,1,2.944439,1,2.772589,62,3,59008,1,11,3,4,1,3327,0,0,1,8,29,3,15,52067,26646,126091,42,42,203,1.277934,37210.881415,5.556828,0.693147,0.642582,346181.198469,244881,1.413671,362835.303771,244881,1.481680
621917,6,35,0,0,7,463,2,218,61,0,73,12,1,3.218876,1,3.091042,62,3,59008,1,11,3,4,1,3327,6,358,1,4,6,2,49,51364,27997,129296,42,42,3027,1.272690,38974.858677,5.680173,0.693147,0.621029,346181.198469,244881,1.413671,362835.303771,244881,1.481680
225173,6,35,0,0,7,829,2,218,100,0,73,12,1,2.833213,0,2.708050,62,3,59008,1,11,3,4,0,3327,6,358,0,3,29,3,43,63839,26646,126091,42,42,3027,1.277934,37210.881415,5.556828,0.693147,0.642582,346181.198469,244881,1.413671,362835.303771,244881,1.481680
823686,4,35,0,0,3,955,2,218,122,38725,482,12,1,4.248495,1,3.850148,62,3,59008,1,11,3,4,1,11,5,208,0,11,17,6,31,59121,26416,91591,42,39,2012,1.250982,35714.286076,6.216606,0.693147,0.605039,153203.767963,146034,1.049097,158201.063662,146034,1.083317
762381,2,35,0,0,20,955,2,218,122,31811,482,12,1,4.852030,1,4.521789,62,3,59008,1,11,3,4,1,3327,0,0,0,10,15,1,27,63984,26363,129510,42,45,203,1.249908,35904.853498,5.389072,0.693147,0.599694,7850.072569,5388,1.456955,8253.809599,5388,1.531887
48397,6,35,0,0,20,955,5,146,122,33826,482,1,1,2.564949,1,2.484907,62,3,59008,1,11,3,4,1,3327,6,358,1,0,2,2,11,53041,25295,129296,42,45,3000,1.265928,34892.763387,5.805135,0.693147,0.607562,59.077730,55,1.074141,60.502034,55,1.100037
818618,2,35,0,0,3,955,2,218,122,24620,482,12,1,3.850148,1,3.713572,62,3,59008,1,11,3,4,1,3327,0,0,1,4,13,1,50,51364,27542,129510,42,39,203,1.262789,38038.985863,6.216606,0.693147,0.606688,2018.781909,1473,1.370524,2118.023393,1473,1.437898
494696,4,35,0,0,7,749,2,218,100,0,73,12,1,3.401197,1,3.218876,62,3,59008,1,11,3,4,1,11,5,208,0,11,20,2,32,59121,27739,129296,42,42,2012,1.260642,37957.930814,6.082219,0.693147,0.607742,346181.198469,244881,1.413671,362835.303771,244881,1.481680


In [73]:
cv_pred = model.predict(X_train[y_conversion == 1])

  if __name__ == '__main__':
  if diff:


In [75]:
cv_pred

array([1, 1, 1, ..., 1, 1, 1])

In [74]:
cv_pred.sum() / len(cv_pred)

0.9726737098740349

In [41]:
y_conversion[y_conversion == 1]

539414    1
539486    1
539526    1
539515    1
539442    1
539528    1
539461    1
537925    1
537900    1
537902    1
537903    1
537908    1
537911    1
537893    1
539554    1
539541    1
539538    1
539579    1
539578    1
539574    1
539570    1
539567    1
539557    1
537928    1
537966    1
537954    1
537953    1
537952    1
537931    1
537932    1
         ..
63924     1
63940     1
63931     1
64045     1
64044     1
64041     1
64046     1
64035     1
64031     1
64047     1
64059     1
64048     1
64058     1
64055     1
64053     1
64052     1
64028     1
64002     1
64006     1
63990     1
63997     1
64009     1
64024     1
64020     1
64008     1
64011     1
64010     1
63121     1
63109     1
63018     1
Name: totals_transactionRevenue, Length: 11515, dtype: int64

In [43]:
len(cv_pred)

9844

In [53]:
len(X_train)

768105

In [54]:
param_reg = {
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'n_estimators':150,
    'max_depth': 5,
    'colsample_bytree': 1.0,
    'learning_rate': 0.3,
    'subsample': 1.0,
    'reg_lambda': 100, # L2正則化 default:1
    'reg_alpha': 0.5, # L1正則化 default:0
}
xgb_reg = xgb.XGBRegressor(**param_reg)

In [55]:
xgb_reg.fit(X_train, y_train)
print('train score')
print(xgb_reg.score(X=X_train, y=y_train))
print('val score')
print(xgb_reg.score(X=X_val, y=y_val))

train score
0.4003719112916899
val score
0.32983060782212786


In [56]:
submission = test_df[['fullVisitorId']].copy()
submission['pred_cv'] = model.predict(X_test)
submission['pred_Revenue'] = xgb_reg.predict(X_test)

  if diff:


In [57]:
submission.head()

Unnamed: 0,fullVisitorId,pred_cv,pred_Revenue
0,6167871330617112363,0,0.000158
1,643697640977915618,0,9.6e-05
2,6059383810968229466,0,-5.3e-05
3,2376720078563423631,0,-9.5e-05
4,2314544520795440038,0,-0.000903


In [58]:
submission["PredictedLogRevenue"] = submission['pred_cv'] * submission['pred_Revenue']
submission.head()

Unnamed: 0,fullVisitorId,pred_cv,pred_Revenue,PredictedLogRevenue
0,6167871330617112363,0,0.000158,0.0
1,643697640977915618,0,9.6e-05,0.0
2,6059383810968229466,0,-5.3e-05,-0.0
3,2376720078563423631,0,-9.5e-05,-0.0
4,2314544520795440038,0,-0.000903,-0.0


In [59]:
submission["PredictedLogRevenue"] = submission["PredictedLogRevenue"].apply(lambda x : 0.0 if x < 0 else x)
submission["PredictedLogRevenue"] = submission["PredictedLogRevenue"].fillna(0.0)
grouped_test = submission[['fullVisitorId', 'PredictedLogRevenue']].groupby('fullVisitorId').sum().reset_index()
grouped_test.to_csv('cv_classfier.csv',index=False)

In [76]:
from sklearn.metrics import mean_squared_log_error

pred_val = xgb_reg.predict(X_val)

In [79]:
score = mean_squared_log_error(y_pred=pred_val,y_true=y_val)
print(score)

0.004056152453793755


In [80]:
pred_val_cv = model.predict(X_val) * pred_val
score = mean_squared_log_error(y_pred=pred_val_cv,y_true=y_val)
print(score)

0.004045809005182342


  if diff:
