In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pandas.io.json import json_normalize
from datetime import datetime
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
import os

In [64]:
def load_df(csv_path='train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [68]:
train_df = load_df('train.csv',1_00_000)
test_df = load_df("test.csv",1_00_000)

Loaded train.csv. Shape: (100000, 55)
Loaded test.csv. Shape: (100000, 53)


In [69]:
train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')

In [70]:
const_cols = [c for c in train_df.columns if train_df[c].nunique(dropna=False)==1 ]
const_cols

['socialEngagementType',
 'device.browserSize',
 'device.browserVersion',
 'device.flashVersion',
 'device.language',
 'device.mobileDeviceBranding',
 'device.mobileDeviceInfo',
 'device.mobileDeviceMarketingName',
 'device.mobileDeviceModel',
 'device.mobileInputSelector',
 'device.operatingSystemVersion',
 'device.screenColors',
 'device.screenResolution',
 'geoNetwork.cityId',
 'geoNetwork.latitude',
 'geoNetwork.longitude',
 'geoNetwork.networkLocation',
 'totals.visits',
 'trafficSource.adwordsClickInfo.criteriaParameters']

In [71]:
cols_to_drop = const_cols + ['sessionId']

train_df = train_df.drop(cols_to_drop + ["trafficSource.campaignCode"], axis=1)
test_df = test_df.drop(cols_to_drop, axis=1)

In [72]:
train_df.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20160902,1131660440785968503,1472830385,1,1472830385,Chrome,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,1472880147,1,1472880147,Firefox,desktop,False,Macintosh,...,,,,,(not set),,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,1472865386,1,1472865386,Chrome,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,1472881213,1,1472881213,UC Browser,desktop,False,Linux,...,,,,,(not set),,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,1472822600,2,1472822600,Chrome,mobile,True,Android,...,,,,,(not set),True,(not provided),organic,,google


In [75]:
train_df["totals.transactionRevenue"].fillna(0, inplace=True)
train_y = train_df["totals.transactionRevenue"].values
train_id = train_df["fullVisitorId"].values
test_id = test_df["fullVisitorId"].values


# label encode the categorical variables and convert the numerical variables to float
cat_cols = ["channelGrouping", "device.browser", 
            "device.deviceCategory", "device.operatingSystem", 
            "geoNetwork.city", "geoNetwork.continent", 
            "geoNetwork.country", "geoNetwork.metro",
            "geoNetwork.networkDomain", "geoNetwork.region", 
            "geoNetwork.subContinent", "trafficSource.adContent", 
            "trafficSource.adwordsClickInfo.adNetworkType", 
            "trafficSource.adwordsClickInfo.gclId", 
            "trafficSource.adwordsClickInfo.page", 
            "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
            "trafficSource.keyword", "trafficSource.medium", 
            "trafficSource.referralPath", "trafficSource.source",
            'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']
for col in cat_cols:
    print(col)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))


num_cols = ["totals.hits", "totals.pageviews", "visitNumber", "visitStartTime", 'totals.bounces',  'totals.newVisits']    
for col in num_cols:
    train_df[col] = train_df[col].astype(float)
    test_df[col] = test_df[col].astype(float)

# Split the train dataset into development and valid based on time 
dev_df = train_df[train_df['date'] <= pd.to_datetime(2017531,format="%Y%m%d")]
val_df = train_df[train_df['date']> pd.to_datetime(2017531,format="%Y%m%d")]
dev_y = np.log1p(dev_df["totals.transactionRevenue"].values)
val_y = np.log1p(val_df["totals.transactionRevenue"].values)

dev_X = dev_df[cat_cols + num_cols] 
val_X = val_df[cat_cols + num_cols] 
test_X = test_df[cat_cols + num_cols]

channelGrouping
device.browser
device.deviceCategory
device.operatingSystem
geoNetwork.city
geoNetwork.continent
geoNetwork.country
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.region
geoNetwork.subContinent
trafficSource.adContent
trafficSource.adwordsClickInfo.adNetworkType
trafficSource.adwordsClickInfo.gclId
trafficSource.adwordsClickInfo.page
trafficSource.adwordsClickInfo.slot
trafficSource.campaign
trafficSource.keyword
trafficSource.medium
trafficSource.referralPath
trafficSource.source
trafficSource.adwordsClickInfo.isVideoAd
trafficSource.isTrueDirect


In [83]:
test_X.info()
train_df['date'] = pd.to_datetime(train_df['date'],format="%Y%m%d")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 29 columns):
channelGrouping                                 100000 non-null int64
device.browser                                  100000 non-null int64
device.deviceCategory                           100000 non-null int64
device.operatingSystem                          100000 non-null int64
geoNetwork.city                                 100000 non-null int64
geoNetwork.continent                            100000 non-null int64
geoNetwork.country                              100000 non-null int64
geoNetwork.metro                                100000 non-null int64
geoNetwork.networkDomain                        100000 non-null int64
geoNetwork.region                               100000 non-null int64
geoNetwork.subContinent                         100000 non-null int64
trafficSource.adContent                         100000 non-null int64
trafficSource.adwordsClickInfo.adNetworkType    10

In [19]:
train_df.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,4,2016-09-02,1131660440785968503,1472830385,1.0,1472830000.0,29,0,False,12,...,2683,1,4,2,6,1,162,5,965,110
1,4,2016-09-02,377306020877927890,1472880147,1.0,1472880000.0,3,0,False,3,...,2683,1,4,2,6,1,162,5,965,110
2,4,2016-09-02,3895546263509774583,1472865386,1.0,1472865000.0,29,0,False,12,...,2683,1,4,2,6,1,162,5,965,110
3,4,2016-09-02,4763447161404445595,1472881213,1.0,1472881000.0,26,0,False,2,...,2683,1,4,2,6,1,251,5,965,110
4,4,2016-09-02,27294437909732085,1472822600,2.0,1472823000.0,29,1,True,1,...,2683,1,4,2,6,0,162,5,965,110


In [77]:
# custom function to run light gbm model
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse", 
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 2000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, model, pred_val_y

In [81]:
#folds = get_folds(df=train_df, n_splits=100)
#y_reg = train_df['totals.transactionRevenue'].fillna(0)
#train_features = [_f for _f in train_df.columns if _f not in excluded_features]

#importances = pd.DataFrame()
#oof_reg_preds = np.zeros(train_df.shape[0])
#sub_reg_preds = np.zeros(test_df.shape[0])
#for fold_, (trn_, val_) in enumerate(folds):
#    dev_X, dev_y = train_df[train_features].iloc[trn_], y_reg.iloc[trn_]
#    val_X, val_y = train_df[train_features].iloc[val_], y_reg.iloc[val_]
    
pred_test, model, pred_val = run_lgb(dev_X, dev_y, val_X, val_y, test_X)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 1.74379
Early stopping, best iteration is:
[67]	valid_0's rmse: 1.73711


In [80]:
import lightgbm as lgb

In [15]:
import lightgbm as lgb

In [26]:
from sklearn import metrics
pred_val[pred_val<0] = 0
val_pred_df = pd.DataFrame({"fullVisitorId":val_df["fullVisitorId"].values})
val_pred_df["transactionRevenue"] = val_df["totals.transactionRevenue"].values
val_pred_df["PredictedRevenue"] = np.expm1(pred_val)
#print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_df["transactionRevenue"].values), np.log1p(val_pred_df["PredictedRevenue"].values))))
val_pred_df = val_pred_df.groupby("fullVisitorId")["transactionRevenue", "PredictedRevenue"].sum().reset_index()
#print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_df["transactionRevenue"].values), np.log1p(val_pred_df["PredictedRevenue"].values))))

  """


ValueError: Length of values does not match length of index

In [20]:
sub_df = pd.DataFrame({"fullVisitorId":test_id})
pred_test[pred_test<0] = 0
sub_df["PredictedLogRevenue"] = np.expm1(pred_test)
sub_df = sub_df.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
sub_df.columns = ["fullVisitorId", "PredictedLogRevenue"]
sub_df["PredictedLogRevenue"] = np.log1p(sub_df["PredictedLogRevenue"])
sub_df.to_csv("baseline_lgb.csv", index=False)

  This is separate from the ipykernel package so we can avoid doing imports until


In [22]:
def get_folds(df=None, n_splits=5):
    """Returns dataframe indices corresponding to Visitors Group KFold"""
    # Get sorted unique visitors
    unique_vis = np.array(sorted(df['fullVisitorId'].unique()))

    # Get folds
    folds = GroupKFold(n_splits=n_splits)
    fold_ids = []
    ids = np.arange(df.shape[0])
    for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
        fold_ids.append(
            [
                ids[df['fullVisitorId'].isin(unique_vis[trn_vis])],
                ids[df['fullVisitorId'].isin(unique_vis[val_vis])]
            ]
        )

    return fold_ids

In [23]:
excluded_features = [
    'date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 
    'visitId', 'visitStartTime'
]

In [24]:
from sklearn.model_selection import GroupKFold

In [27]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 5)

In [39]:
target = train_df['totals.transactionRevenue'].fillna(0).astype(float)
target = target.apply(lambda x: np.log(x) if x > 0 else x)
del train_df['totals.transactionRevenue']

columns = [col for col in train_df.columns if train_df[col].nunique() > 1]

train_df = train_df[columns].copy()
test_df = test_df[columns].copy()

train_df =  feature_design(train_df)
test_df =  feature_design(test_df)

KeyError: 'totals.transactionRevenue'

In [29]:
def lgb_train(X, y, test_X, params):
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    pred_test = 0
    pred_train = 0
    for dev_index, val_index in kf.split(X):
        train_x, valid_x = X.iloc[dev_index,:], X.iloc[val_index,:]
        train_y, valid_y = y[dev_index], y[val_index]
        lgtrain = lightgbm.Dataset(train_x, train_y,categorical_feature=list(cat_cols))
        lgvalid = lightgbm.Dataset(valid_x, valid_y,categorical_feature=list(cat_cols))
        model = lightgbm.train(params, lgtrain, 2000, valid_sets=[lgvalid], early_stopping_rounds=100, verbose_eval=100)
        pred_test_iter = model.predict(test_X, num_iteration=model.best_iteration)
        pred_test_iter[pred_test_iter<0]=0
        pred_test+=pred_test_iter
        pred_train_iter = model.predict(X, num_iteration=model.best_iteration)
        pred_train_iter[pred_train_iter<0]=0
        pred_train+=pred_train_iter
    pred_test /= 5.
    pred_train  /= 5.
    return pred_test, pred_train

In [33]:
params_lgb = {'objective': 'regression', 
          'metric': 'rmse', 
          'num_leaves': 49, 
          'max_depth': 14, 
          'lambda_l2': 0.01931081461346337, 
          'lambda_l1': 0.007163878762237125, 
          'num_threads': 4, 
          'min_child_samples': 40, 
          'learning_rate': 0.01, 
          'bagging_fraction': 0.7910460446769023, 
          'feature_fraction': 0.5046791892199741, 
          'subsample_freq': 5, 
          'bagging_seed': 42, 
          'verbosity': -1}

In [36]:
sub_lgb_test, sub_lgb_train = lgb_train(train_df, target, test_df, params_lgb)



ValueError: DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields date, fullVisitorId

In [35]:
import lightgbm

In [1]:
def feature_design(df):
    df['date'] = pd.to_datetime(df['date'].apply(lambda x: str(x)[:4] + '-' + str(x)[4:6] + '-' + str(x)[6:]))
    for col in ['visitNumber', 'totals.hits', 'totals.pageviews']:
        df[col] = df[col].astype(float)
        
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.weekday
    df['weekofyear'] = df['date'].dt.weekofyear
    
    df['month_unique_user_count'] = df.groupby('month')['fullVisitorId'].transform('nunique')
    df['day_unique_user_count'] = df.groupby('day')['fullVisitorId'].transform('nunique')
    df['weekday_unique_user_count'] = df.groupby('weekday')['fullVisitorId'].transform('nunique')
    df['weekofyear_unique_user_count'] = df.groupby('weekofyear')['fullVisitorId'].transform('nunique')
    
    df['browser_category'] = df['device.browser'] + '_' + df['device.deviceCategory']
    df['browser_operatingSystem'] = df['device.browser'] + '_' + df['device.operatingSystem']
    df['source_country'] = df['trafficSource.source'] + '_' + df['geoNetwork.country']
    
    df['visitNumber'] = np.log1p(df['visitNumber'])
    df['totals.hits'] = np.log1p(df['totals.hits'])
    df['totals.pageviews'] = np.log1p(df['totals.pageviews'].fillna(0))
    
    df['sum_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('sum')
    df['count_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('count')
    df['mean_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('mean')
    df['sum_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('sum')
    df['count_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('count')
    df['mean_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('mean')
    
    df['mean_hits_per_day'] = df.groupby(['day'])['totals.hits'].transform('mean')
    df['sum_hits_per_day'] = df.groupby(['day'])['totals.hits'].transform('median')

    df['sum_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('sum')
    df['count_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('median')
    df['mean_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('mean')

    df['sum_pageviews_per_region'] = df.groupby('geoNetwork.region')['totals.pageviews'].transform('sum')
    df['count_pageviews_per_region'] = df.groupby('geoNetwork.region')['totals.pageviews'].transform('median')
    df['mean_pageviews_per_region'] = df.groupby('geoNetwork.region')['totals.pageviews'].transform('mean')

    df['sum_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('sum')
    df['count_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('median')
    df['mean_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('mean')

    df['sum_hits_per_region'] = df.groupby('geoNetwork.region')['totals.hits'].transform('sum')
    df['count_hits_per_region'] = df.groupby('geoNetwork.region')['totals.hits'].transform('median')
    df['mean_hits_per_region'] = df.groupby('geoNetwork.region')['totals.hits'].transform('mean')

    df['sum_hits_per_country'] = df.groupby('geoNetwork.country')['totals.hits'].transform('sum')
    df['count_hits_per_country'] = df.groupby('geoNetwork.country')['totals.hits'].transform('median')
    df['mean_hits_per_country'] = df.groupby('geoNetwork.country')['totals.hits'].transform('mean')

    df['user_pageviews_sum'] = df.groupby('fullVisitorId')['totals.pageviews'].transform('sum')
    df['user_hits_sum'] = df.groupby('fullVisitorId')['totals.hits'].transform('sum')

    df['user_pageviews_count'] = df.groupby('fullVisitorId')['totals.pageviews'].transform('count')
    df['user_hits_count'] = df.groupby('fullVisitorId')['totals.hits'].transform('count')

    df['user_pageviews_sum_to_mean'] = df['user_pageviews_sum'] / df['user_pageviews_sum'].mean()
    df['user_hits_sum_to_mean'] = df['user_hits_sum'] / df['user_hits_sum'].mean()

    df['user_pageviews_to_region'] = df['user_pageviews_sum'] / df['mean_pageviews_per_region']
    df['user_hits_to_region'] = df['user_hits_sum'] / df['mean_hits_per_region']
    
    useless_columns = ['sessionId', 'visitId', 'fullVisitorId', 'date', 'visitStartTime','user_pageviews_sum', 'user_hits_sum',
                      'user_pageviews_count', 'user_hits_count']
    df = df.drop(useless_columns, axis = 1)
    
    return df

In [27]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import json
%matplotlib inline
from pandas.io.json import json_normalize

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
import os

In [28]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 5)

In [29]:
def load_df(csv_path='train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [30]:
train_df = load_df('train.csv',1_00_000)
test_df = load_df('test.csv')
train_ind = train_df['fullVisitorId'].copy()
test_ind = test_df['fullVisitorId'].copy()

Loaded train.csv. Shape: (100000, 55)
Loaded test.csv. Shape: (804684, 53)


In [31]:
def feature_design(df):
    df['date'] = pd.to_datetime(df['date'].apply(lambda x: str(x)[:4] + '-' + str(x)[4:6] + '-' + str(x)[6:]))
    for col in ['visitNumber', 'totals.hits', 'totals.pageviews']:
        df[col] = df[col].astype(float)
        
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.weekday
    df['weekofyear'] = df['date'].dt.weekofyear
    
    df['month_unique_user_count'] = df.groupby('month')['fullVisitorId'].transform('nunique')
    df['day_unique_user_count'] = df.groupby('day')['fullVisitorId'].transform('nunique')
    df['weekday_unique_user_count'] = df.groupby('weekday')['fullVisitorId'].transform('nunique')
    df['weekofyear_unique_user_count'] = df.groupby('weekofyear')['fullVisitorId'].transform('nunique')
    
    df['browser_category'] = df['device.browser'] + '_' + df['device.deviceCategory']
    df['browser_operatingSystem'] = df['device.browser'] + '_' + df['device.operatingSystem']
    df['source_country'] = df['trafficSource.source'] + '_' + df['geoNetwork.country']
    
    df['visitNumber'] = np.log1p(df['visitNumber'])
    df['totals.hits'] = np.log1p(df['totals.hits'])
    df['totals.pageviews'] = np.log1p(df['totals.pageviews'].fillna(0))
    
    df['sum_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('sum')
    df['count_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('count')
    df['mean_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('mean')
    df['sum_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('sum')
    df['count_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('count')
    df['mean_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('mean')
    
    df['mean_hits_per_day'] = df.groupby(['day'])['totals.hits'].transform('mean')
    df['sum_hits_per_day'] = df.groupby(['day'])['totals.hits'].transform('median')

    df['sum_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('sum')
    df['count_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('median')
    df['mean_pageviews_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.pageviews'].transform('mean')

    df['sum_pageviews_per_region'] = df.groupby('geoNetwork.region')['totals.pageviews'].transform('sum')
    df['count_pageviews_per_region'] = df.groupby('geoNetwork.region')['totals.pageviews'].transform('median')
    df['mean_pageviews_per_region'] = df.groupby('geoNetwork.region')['totals.pageviews'].transform('mean')

    df['sum_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('sum')
    df['count_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('median')
    df['mean_hits_per_network_domain'] = df.groupby('geoNetwork.networkDomain')['totals.hits'].transform('mean')

    df['sum_hits_per_region'] = df.groupby('geoNetwork.region')['totals.hits'].transform('sum')
    df['count_hits_per_region'] = df.groupby('geoNetwork.region')['totals.hits'].transform('median')
    df['mean_hits_per_region'] = df.groupby('geoNetwork.region')['totals.hits'].transform('mean')

    df['sum_hits_per_country'] = df.groupby('geoNetwork.country')['totals.hits'].transform('sum')
    df['count_hits_per_country'] = df.groupby('geoNetwork.country')['totals.hits'].transform('median')
    df['mean_hits_per_country'] = df.groupby('geoNetwork.country')['totals.hits'].transform('mean')

    df['user_pageviews_sum'] = df.groupby('fullVisitorId')['totals.pageviews'].transform('sum')
    df['user_hits_sum'] = df.groupby('fullVisitorId')['totals.hits'].transform('sum')

    df['user_pageviews_count'] = df.groupby('fullVisitorId')['totals.pageviews'].transform('count')
    df['user_hits_count'] = df.groupby('fullVisitorId')['totals.hits'].transform('count')

    df['user_pageviews_sum_to_mean'] = df['user_pageviews_sum'] / df['user_pageviews_sum'].mean()
    df['user_hits_sum_to_mean'] = df['user_hits_sum'] / df['user_hits_sum'].mean()

    df['user_pageviews_to_region'] = df['user_pageviews_sum'] / df['mean_pageviews_per_region']
    df['user_hits_to_region'] = df['user_hits_sum'] / df['mean_hits_per_region']
    
    useless_columns = ['sessionId', 'visitId', 'fullVisitorId', 'date', 'visitStartTime','user_pageviews_sum', 'user_hits_sum',
                      'user_pageviews_count', 'user_hits_count']
    df = df.drop(useless_columns, axis = 1)
    
    return df

In [32]:
target = train_df['totals.transactionRevenue'].fillna(0).astype(float)
target = target.apply(lambda x: np.log(x) if x > 0 else x)
del train_df['totals.transactionRevenue']

columns = [col for col in train_df.columns if train_df[col].nunique() > 1]

train_df = train_df[columns].copy()
test_df = test_df[columns].copy()

train_df =  feature_design(train_df)
test_df =  feature_design(test_df)

In [33]:
cat_cols = train_df.select_dtypes(exclude=['float64', 'int64']).columns

for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))

In [34]:
import lightgbm

In [35]:
def lgb_train(X, y, test_X, params):
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    pred_test = 0
    pred_train = 0
    for dev_index, val_index in kf.split(X):
        train_x, valid_x = X.iloc[dev_index,:], X.iloc[val_index,:]
        train_y, valid_y = y[dev_index], y[val_index]
        lgtrain = lightgbm.Dataset(train_x, train_y,categorical_feature=list(cat_cols))
        lgvalid = lightgbm.Dataset(valid_x, valid_y,categorical_feature=list(cat_cols))
        model = lightgbm.train(params, lgtrain, 2000, valid_sets=[lgvalid], early_stopping_rounds=100, verbose_eval=100)
        pred_test_iter = model.predict(test_X, num_iteration=model.best_iteration)
        pred_test_iter[pred_test_iter<0]=0
        pred_test+=pred_test_iter
        pred_train_iter = model.predict(X, num_iteration=model.best_iteration)
        pred_train_iter[pred_train_iter<0]=0
        pred_train+=pred_train_iter
    pred_test /= 5.
    pred_train  /= 5.
    return pred_test, pred_train

In [36]:
params_lgb = {'objective': 'regression', 
          'metric': 'rmse', 
          'num_leaves': 49, 
          'max_depth': 14, 
          'lambda_l2': 0.01931081461346337, 
          'lambda_l1': 0.007163878762237125, 
          'num_threads': 4, 
          'min_child_samples': 40, 
          'learning_rate': 0.01, 
          'bagging_fraction': 0.7910460446769023, 
          'feature_fraction': 0.5046791892199741, 
          'subsample_freq': 5, 
          'bagging_seed': 42, 
          'verbosity': -1}

In [37]:
sub_lgb_test, sub_lgb_train = lgb_train(train_df, target, test_df, params_lgb)



Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 1.85555
[200]	valid_0's rmse: 1.76001
[300]	valid_0's rmse: 1.72696
[400]	valid_0's rmse: 1.71393
[500]	valid_0's rmse: 1.70927
[600]	valid_0's rmse: 1.70764
[700]	valid_0's rmse: 1.70578
Early stopping, best iteration is:
[680]	valid_0's rmse: 1.70551




Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 1.83376
[200]	valid_0's rmse: 1.76197
[300]	valid_0's rmse: 1.74169
[400]	valid_0's rmse: 1.7385
[500]	valid_0's rmse: 1.73861
Early stopping, best iteration is:
[468]	valid_0's rmse: 1.73775




Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 1.84154
[200]	valid_0's rmse: 1.76516
[300]	valid_0's rmse: 1.73877
[400]	valid_0's rmse: 1.73245
[500]	valid_0's rmse: 1.72864
[600]	valid_0's rmse: 1.72755
[700]	valid_0's rmse: 1.72853
Early stopping, best iteration is:
[612]	valid_0's rmse: 1.72704




Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 1.70541
[200]	valid_0's rmse: 1.66353
[300]	valid_0's rmse: 1.66037
Early stopping, best iteration is:
[278]	valid_0's rmse: 1.65929




Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 1.85816
[200]	valid_0's rmse: 1.78492
[300]	valid_0's rmse: 1.76424
[400]	valid_0's rmse: 1.7577
[500]	valid_0's rmse: 1.7533
[600]	valid_0's rmse: 1.75347
Early stopping, best iteration is:
[526]	valid_0's rmse: 1.75237


In [38]:
import xgboost as xgb

In [39]:
def xgb_train(X, y, test_X, params):
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    pred_test_xgb = 0
    pred_train_xgb = 0
    for dev_index, val_index in kf.split(train_df):
        train_x, valid_x = X.loc[dev_index,:], X.loc[val_index,:]
        train_y, valid_y = y[dev_index], y[val_index]
        xgb_train_data = xgb.DMatrix(train_x, train_y)
        xgb_val_data = xgb.DMatrix(valid_x, valid_y)
        xgb_submit_data = xgb.DMatrix(test_X)
        xgb_submit_data_train = xgb.DMatrix(X)
        xgb_model = xgb.train(params, xgb_train_data, 
                          num_boost_round=2000, 
                          evals= [(xgb_train_data, 'train'), (xgb_val_data, 'valid')],
                          early_stopping_rounds=100, 
                          verbose_eval=500
                         )
        pred_test = xgb_model.predict(xgb_submit_data, ntree_limit=xgb_model.best_ntree_limit)
        pred_train = xgb_model.predict(xgb_submit_data_train, ntree_limit=xgb_model.best_ntree_limit)
        pred_test[pred_test<0]=0
        pred_train[pred_train<0]=0
        pred_test_xgb += pred_test
        pred_train_xgb += pred_train
    pred_test_xgb /= 5.
    pred_train_xgb /= 5.
    return pred_test_xgb, pred_train_xgb

In [40]:
params_xgb = {
            'objective': 'reg:linear',
            'eval_metric': 'rmse',
            'eta': 0.001,
            'max_depth': 7,
            'gamma': 1.3250360141843498, 
            'min_child_weight': 13.0958516960316, 
            'max_delta_step': 8.88492863796954, 
            'subsample': 0.9864199446951019, 
            'colsample_bytree': 0.8376539278239742,
            'subsample': 0.6,
            'colsample_bytree': 0.8,
            'alpha':0.001,
            "num_leaves" : 40,
            'random_state': 42,
            'silent': True,
            }

In [41]:
sub_xgb_test, sub_xgb_train = xgb_train(train_df, target, test_df, params_xgb)

[0]	train-rmse:2.09567	valid-rmse:2.18168
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[500]	train-rmse:1.84302	valid-rmse:1.93879
[1000]	train-rmse:1.70914	valid-rmse:1.82236
[1500]	train-rmse:1.62918	valid-rmse:1.76446
[1999]	train-rmse:1.57669	valid-rmse:1.73554
[0]	train-rmse:2.10909	valid-rmse:2.12908
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[500]	train-rmse:1.84209	valid-rmse:1.90542
[1000]	train-rmse:1.70112	valid-rmse:1.80999
[1500]	train-rmse:1.61636	valid-rmse:1.76734
[1999]	train-rmse:1.56121	valid-rmse:1.74948
[0]	train-rmse:2.11038	valid-rmse:2.12415
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[500]	train-rmse:1.84589	valid-rmse:1.91257
[1000]	train-rmse:1.70755	valid-rm

In [42]:
from catboost import CatBoostRegressor

In [43]:
def cat_train(X, y, test_X):
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    pred_test_cat = 0
    pred_train_cat = 0
    for dev_index, val_index in kf.split(train_df):
        train_x, valid_x = X.loc[dev_index,:], X.loc[val_index,:]
        train_y, valid_y = y[dev_index], y[val_index]
        model = CatBoostRegressor(iterations=1000,
                             learning_rate=0.05,
                             depth=10,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=20)
        model.fit(train_x, train_y, eval_set=(valid_x, valid_y),use_best_model=True,verbose=True, 
                  cat_features= [i for i in range(len(train_df.columns)) if train_df.columns[i] in cat_cols])
        pred_test = model.predict(test_X)
        pred_test[pred_test<0]=0
        pred_test_cat += pred_test
        pred_train = model.predict(X)
        pred_train[pred_train<0]=0
        pred_train_cat += pred_train
    pred_test_cat /= 5.
    pred_train_cat /= 5.
    return pred_test_cat, pred_train_cat

In [44]:
sub_cat_test, sub_cat_train = cat_train(train_df, target, test_df)



0:	learn: 2.0643013	test: 2.1530379	best: 2.1530379 (0)	total: 452ms	remaining: 7m 31s
50:	learn: 1.5761052	test: 1.7337848	best: 1.7337848 (50)	total: 14.5s	remaining: 4m 30s
100:	learn: 1.4722913	test: 1.7095136	best: 1.7095136 (100)	total: 29.3s	remaining: 4m 20s
150:	learn: 1.4231400	test: 1.7057400	best: 1.7057400 (150)	total: 44.3s	remaining: 4m 9s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.704778764
bestIteration = 167

Shrink model to first 168 iterations.




0:	learn: 2.0760022	test: 2.1005059	best: 2.1005059 (0)	total: 300ms	remaining: 4m 59s
50:	learn: 1.5724637	test: 1.7548935	best: 1.7548935 (50)	total: 13.5s	remaining: 4m 11s
100:	learn: 1.4823396	test: 1.7375350	best: 1.7375350 (100)	total: 30.8s	remaining: 4m 34s
150:	learn: 1.4346918	test: 1.7343586	best: 1.7334117 (142)	total: 46.9s	remaining: 4m 23s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.733411706
bestIteration = 142

Shrink model to first 143 iterations.




0:	learn: 2.0778478	test: 2.0986115	best: 2.0986115 (0)	total: 259ms	remaining: 4m 18s
50:	learn: 1.5758241	test: 1.7473544	best: 1.7473544 (50)	total: 14.2s	remaining: 4m 23s
100:	learn: 1.4817920	test: 1.7238623	best: 1.7238623 (100)	total: 30.8s	remaining: 4m 34s
150:	learn: 1.4264160	test: 1.7198796	best: 1.7198796 (150)	total: 46.6s	remaining: 4m 22s
200:	learn: 1.3762002	test: 1.7173682	best: 1.7170864 (197)	total: 1m 2s	remaining: 4m 9s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.717086439
bestIteration = 197

Shrink model to first 198 iterations.




0:	learn: 2.1143870	test: 1.9385267	best: 1.9385267 (0)	total: 255ms	remaining: 4m 14s
50:	learn: 1.5729240	test: 1.6572487	best: 1.6561516 (41)	total: 14.1s	remaining: 4m 22s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.656151559
bestIteration = 41

Shrink model to first 42 iterations.




0:	learn: 2.0701203	test: 2.1220897	best: 2.1220897 (0)	total: 260ms	remaining: 4m 19s
50:	learn: 1.5509562	test: 1.7789080	best: 1.7789080 (50)	total: 13.9s	remaining: 4m 18s
100:	learn: 1.4500967	test: 1.7696012	best: 1.7696012 (100)	total: 30.1s	remaining: 4m 28s
150:	learn: 1.4007065	test: 1.7658879	best: 1.7652302 (147)	total: 46.2s	remaining: 4m 19s
200:	learn: 1.3526236	test: 1.7631661	best: 1.7631661 (200)	total: 1m 2s	remaining: 4m 9s
250:	learn: 1.3085713	test: 1.7622297	best: 1.7617352 (244)	total: 1m 20s	remaining: 3m 59s
300:	learn: 1.2705788	test: 1.7610982	best: 1.7595657 (283)	total: 1m 40s	remaining: 3m 53s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.759565736
bestIteration = 283

Shrink model to first 284 iterations.


In [45]:
last = pd.DataFrame()
last['fullVisitorId'] = train_ind
last['lgbm'] = sub_lgb_train
last['xbm'] = sub_xgb_train
last['cat'] = sub_cat_train

In [46]:
last_test = pd.DataFrame()
last_test['fullVisitorId'] = test_ind
last_test['lgbm'] = sub_lgb_test
last_test['xbm'] = sub_xgb_test
last_test['cat'] = sub_cat_test

In [47]:
from sklearn.linear_model import Ridge
model = Ridge().fit(last, target)
pred = model.predict(last_test)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.340088e-39
  overwrite_a=True).T


In [48]:
#pred[pred<0] = 0
submission = pd.DataFrame()
submission['fullVisitorId'] = test_ind
submission['PredictedLogRevenue'] = pred
submission = submission.groupby('fullVisitorId').sum()['PredictedLogRevenue'].fillna(0).reset_index()
submission.to_csv('submit_stack1.csv', index=False)