In [254]:
import numpy as np
import lightgbm as lgb
import os

In [255]:
# Project root directory, taken from the GACRP_HOME environment variable.
# os.environ[...] raises KeyError when the variable is not set.
gacrp_home=os.environ['GACRP_HOME']

In [256]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import gc
import time
from pandas.core.common import SettingWithCopyWarning
import warnings
import lightgbm as lgb
from sklearn.model_selection import GroupKFold

In [257]:
# Escalate pandas' SettingWithCopyWarning to an exception so that an assignment
# on a possible copy stops the notebook instead of only emitting a warning.
warnings.simplefilter('error', SettingWithCopyWarning)
# Make sure automatic garbage collection is switched on.
gc.enable()
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [258]:
# Load <GACRP_HOME>/dat/train_flat_100.csv into `train`.
# 'date', 'fullVisitorId', 'sessionId' and 'adContent' are forced to str
# (presumably so that id-like values are not parsed as numbers -- TODO confirm).
train= pd.read_csv(gacrp_home+"/dat/train_flat_100.csv"
                   ,dtype={'date':str,'fullVisitorId':str,'sessionId':str,'adContent':str})

In [259]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
print(train.shape)

(99, 55)


In [260]:
def get_folds(df=None,n_splits=5):
    """Build visitor-grouped cross-validation folds as row positions.

    The sorted unique values of df['fullVisitorId'] are split with GroupKFold,
    each visitor id acting as its own group, so all rows of one visitor land
    on the same side of a split. The visitor-level split is then mapped back
    to positional row numbers of `df`.

    Parameters
    ----------
    df : DataFrame with a 'fullVisitorId' column. The default of None is not
        usable; a frame must be passed.
    n_splits : number of folds handed to GroupKFold.

    Returns
    -------
    list of [train_positions, validation_positions] pairs, one per fold, each
    element being an integer numpy array of row positions (0 .. len(df)-1).

    Side effect: prints the number of unique visitors.
    """
    visitor_col = df['fullVisitorId']
    unique_vis = np.array(sorted(visitor_col.unique()))
    print(len(unique_vis))

    splitter = GroupKFold(n_splits=n_splits)
    # Positional row numbers of df; the boolean masks below select from these.
    row_positions = np.arange(df.shape[0])

    return [
        [
            row_positions[visitor_col.isin(unique_vis[trn_vis])],
            row_positions[visitor_col.isin(unique_vis[val_vis])],
        ]
        for trn_vis, val_vis in splitter.split(
            X=unique_vis, y=unique_vis, groups=unique_vis
        )
    ]
        

In [261]:
# Move transactionRevenue out of the feature frame and use it as the target.
# pop() returns the column and removes it from `train` in one step; missing
# revenue values are replaced with 0.
y_reg = train.pop('transactionRevenue').fillna(0)


In [262]:
# Derive calendar features from the session start time, in place on `train`.
# Note that this overwrites the 'date' column that was read from the CSV as str.
for df in [train]:
    # visitStartTime is parsed as a number of seconds (unit='s') into a datetime.
    df['date']=pd.to_datetime(df['visitStartTime'],unit='s')
    # Day of week, hour of day and day of month of the session start.
    df['sess_date_dow']=df['date'].dt.dayofweek
    df['sess_date_hours']=df['date'].dt.hour
    df['sess_date_dom']=df['date'].dt.day

In [263]:
# Columns that are never used as model inputs (identifiers, raw timestamps,
# the target, and a set of device / geo / adwords columns).
excluded_features = [
    'date', 'fullVisitorId', 'sessionId', 'transactionRevenue', 'visitId', 'visitStartTime',
    'adwordsClickInfo.adNetworkType',
    'browserSize',
    'browserVersion',
    'campaignCode',
    'cityId',
    'adwordsClickInfo.criteriaParameters',
    'flashVersion',
    'adwordsClickInfo.gclId',
    'adwordsClickInfo.isVideoAd',
    'language',
    'latitude',
    'longitude',
    'mobileDeviceBranding',
    'mobileDeviceInfo',
    'mobileDeviceMarketingName',
    'mobileDeviceModel',
    'mobileInputSelector',
    'networkLocation',
    'operatingSystemVersion',
    'adwordsClickInfo.page',
    'screenColors',
    'screenResolution',
    'adwordsClickInfo.slot',
    'socialEngagementType',
    'visits',
]

# Every remaining column whose dtype is object is treated as categorical.
categorical_features = [
    _f
    for _f in train.columns
    if _f not in excluded_features and train[_f].dtype == 'object'
]
print(categorical_features)

['channelGrouping', 'browser', 'operatingSystem', 'deviceCategory', 'continent', 'subContinent', 'country', 'region', 'metro', 'city', 'networkDomain', 'campaign', 'source', 'medium', 'keyword', 'isTrueDirect', 'referralPath', 'adContent']


In [264]:
# Replace each categorical column with the integer codes from pd.factorize.
# `indexer` holds the unique values of the column processed last.
for f in categorical_features:
    codes, indexer = pd.factorize(train[f])
    train[f] = codes
    

In [265]:
#print(train['continent'][0:100])

In [269]:
# prediction
# Build the visitor-grouped cross-validation folds over `train`.
folds = get_folds(df=train,n_splits=5)
# folds is a list of index arrays for the split data (e.g. 2, 3, 5, 7, ...).
# NOTE(review): the captured output under this cell still shows a 'folds:' dump,
# which seems to come from an earlier run with the line below enabled.
#print("folds:",folds)
# Model inputs: every column of train that is not explicitly excluded.
train_features=[_f for _f in train.columns if _f not in excluded_features]
print(train_features)
print(len(train_features))

98
('folds:', [[array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 19, 21, 22,
       23, 24, 26, 27, 28, 29, 30, 32, 33, 36, 37, 39, 40, 41, 43, 44, 45,
       46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63,
       65, 67, 68, 69, 70, 71, 72, 73, 75, 76, 77, 79, 80, 82, 83, 85, 86,
       87, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98]), array([ 0,  1, 16, 17, 18, 20, 25, 31, 34, 35, 38, 42, 59, 64, 66, 74, 78,
       81, 84, 92])], [array([ 0,  1,  2,  3,  5,  6,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
       38, 39, 40, 41, 42, 43, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
       59, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80,
       81, 82, 84, 85, 86, 88, 91, 92, 94, 95, 98]), array([ 4,  7, 22, 27, 44, 46, 49, 54, 60, 61, 68, 73, 79, 83, 87, 89, 90,
       93, 96, 97])], [array([ 0,  1,  3,  4,  5,  6,  7,  8,  9, 10, 13, 14, 15, 16, 17, 18, 19,
       

In [None]:
# Empty frame, by its name meant to collect feature importances.
importances = pd.DataFrame()
# One zero-initialised slot per training row, by its name meant to hold
# out-of-fold regression predictions.
oof_reg_preds = np.zeros(len(train))

In [None]:
# Fit one LightGBM regressor per fold; each model is trained on the fold's
# training rows and monitored on its validation rows.
for fold_, (trn_, val_) in enumerate(folds):
    # trn_ / val_ are positional row numbers, hence .iloc.
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]

    reg = lgb.LGBMRegressor(
        num_leaves=31,
        # Spelled `learning_rage` before, so 0.03 never reached the
        # learning rate and the library default was used instead.
        learning_rate=0.03,
        n_estimators=1000,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=1
    )
    # Both the training and the validation target are transformed with log1p,
    # so the reported rmse is on the log1p scale.
    # NOTE(review): early_stopping_rounds / verbose as fit() arguments belong to
    # older LightGBM releases; newer ones expect callbacks -- confirm the
    # installed version before upgrading.
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(val_x, np.log1p(val_y))],
        early_stopping_rounds=50,
        verbose=100,  # the comma here was missing, making the cell a SyntaxError
        eval_metric='rmse'
    )