In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import lightgbm as lgb
import gc
import os
# Show which files are present in the preprocessed-input directory that the
# following cells read from.
print(os.listdir("../input/cs534-preprocessing-3/"))

## 1. Read Input

In [None]:
# fullVisitorId is forced to a string in both files instead of letting pandas
# infer a numeric dtype for it.
id_as_text = {'fullVisitorId': 'str'}

df_train = pd.read_csv('../input/cs534-preprocessing-3/train_P3.csv', dtype=id_as_text)
print(df_train.shape)
print('read train done')

df_test = pd.read_csv('../input/cs534-preprocessing-3/test_P3.csv', dtype=id_as_text)
print(df_test.shape)
print('read test done.')

In [None]:
# Preview the first rows of the test frame to check the columns that were loaded.
df_test.head()

In [None]:
# Columns that are never fed to the model: the visitor key and the target.
non_feature_cols = ['fullVisitorId', 'totals.transactionRevenue']

# Order-preserving hold-out split (no shuffling): 33% of the rows become the
# validation part, the rest the training part.
df_trn, df_val = train_test_split(df_train, test_size=0.33, shuffle=False)

# --- targets ---------------------------------------------------------------
y_train = df_train['totals.transactionRevenue']
y_trn = df_trn['totals.transactionRevenue']
y_val = df_val['totals.transactionRevenue']
# NOTE(review): only the test target is NaN-filled and log1p-transformed here;
# presumably the train file already stores log revenue -- confirm against the
# preprocessing notebook.
y_test = np.log1p(df_test['totals.transactionRevenue'].fillna(0))

# --- visitor ids (used later for per-visitor aggregation) -------------------
Id_train = df_train['fullVisitorId']
Id_trn = df_trn['fullVisitorId']
Id_val = df_val['fullVisitorId']
Id_test = df_test['fullVisitorId']

# --- feature matrices -------------------------------------------------------
X_train = df_train.drop(columns=non_feature_cols)
X_trn = df_trn.drop(columns=non_feature_cols)
X_val = df_val.drop(columns=non_feature_cols)
X_test = df_test.drop(columns=non_feature_cols)

# The source frames are no longer referenced once the pieces above exist.
del df_train, df_test, df_trn, df_val

#Backup_X_train = X_train.copy()
#Backup_X_test = X_test.copy()

## 2. Train

In [None]:
#X_train = Backup_X_train.copy()
#X_test = Backup_X_test.copy()

In [None]:
# Starting hyper-parameters, before any of the sweeps below.
baseline_params = dict(
    learning_rate=0.1,
    n_estimators=50,
    max_depth=20,
    num_leaves=100,
    feature_fraction=0.6,
    min_data_in_leaf=350,
    subsample=0.6,
    metric='rmse',
)
reg = lgb.LGBMRegressor(**baseline_params)

# Fit on the full training set and score on the test set.
reg.fit(X_train, y_train)
yhat_test = reg.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(yhat_test, y_test))
print('RMSE on test set:{:.6f}'.format(test_rmse))

In [None]:
# Sweep the number of boosting rounds on the hold-out split; every other
# hyper-parameter stays at its baseline value.
List_n_estimators = range(10, 101, 5)

for n_estimators in List_n_estimators:
    reg = lgb.LGBMRegressor(learning_rate=0.1,
                            n_estimators=n_estimators,
                            max_depth=20,
                            num_leaves=100,
                            feature_fraction=0.6,
                            min_data_in_leaf=350,
                            subsample=0.6,
                            metric='rmse')

    # No `verbose=` here: LightGBM >= 4.0 rejects that keyword in fit(), and
    # without an eval_set it never produced any output in older versions.
    reg.fit(X_trn, y_trn)
    yhat_trn = reg.predict(X_trn)
    yhat_val = reg.predict(X_val)

    # Report train vs. validation RMSE side by side for this setting.
    print('n_estimators:{:5d} RMSE on train set:{:.6f}, val set:{:.6f}'.format(
        n_estimators,
        np.sqrt(mean_squared_error(yhat_trn, y_trn)),
        np.sqrt(mean_squared_error(yhat_val, y_val)),
        ))

In [None]:
# Same as the baseline except n_estimators=35 (presumably the value picked
# from the n_estimators sweep).
params_n_estimators = {
    'learning_rate': 0.1,
    'n_estimators': 35,
    'max_depth': 20,
    'num_leaves': 100,
    'feature_fraction': 0.6,
    'min_data_in_leaf': 350,
    'subsample': 0.6,
    'metric': 'rmse',
}
reg = lgb.LGBMRegressor(**params_n_estimators)

# Refit on the full training set and score on the test set.
reg.fit(X_train, y_train)
yhat_test = reg.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(yhat_test, y_test))
print('RMSE on test set:{:.6f}'.format(test_rmse))

In [None]:
# Sweep the maximum tree depth on the hold-out split with n_estimators fixed at 35.
List_max_depth = range(2, 31, 2)

for max_depth in List_max_depth:
    reg = lgb.LGBMRegressor(learning_rate=0.1,
                            n_estimators=35,
                            max_depth=max_depth,
                            num_leaves=100,
                            feature_fraction=0.6,
                            min_data_in_leaf=350,
                            subsample=0.6,
                            #random_state=42,
                            metric='rmse')

    # No `verbose=` here: LightGBM >= 4.0 rejects that keyword in fit(), and
    # without an eval_set it never produced any output in older versions.
    reg.fit(X_trn, y_trn)
    yhat_trn = reg.predict(X_trn)
    yhat_val = reg.predict(X_val)

    # Report train vs. validation RMSE side by side for this setting.
    print('max_depth:{:5d} RMSE on train set:{:.6f}, val set:{:.6f}'.format(
        max_depth,
        np.sqrt(mean_squared_error(yhat_trn, y_trn)),
        np.sqrt(mean_squared_error(yhat_val, y_val)),
        ))

In [None]:
# max_depth is now 12 (presumably the value picked from the max_depth sweep);
# the remaining settings are unchanged.
params_max_depth = dict(
    learning_rate=0.1,
    n_estimators=35,
    max_depth=12,
    num_leaves=100,
    feature_fraction=0.6,
    min_data_in_leaf=350,
    subsample=0.6,
    metric='rmse',
)
reg = lgb.LGBMRegressor(**params_max_depth)

# Refit on the full training set and score on the test set.
reg.fit(X_train, y_train)
yhat_test = reg.predict(X_test)
test_mse = mean_squared_error(yhat_test, y_test)
print('RMSE on test set:{:.6f}'.format(np.sqrt(test_mse)))

In [None]:
# Sweep the number of leaves per tree on the hold-out split
# (n_estimators=35, max_depth=12, fixed seed).
List_num_leaves = range(50, 151, 10)

for num_leaves in List_num_leaves:
    reg = lgb.LGBMRegressor(learning_rate=0.1,
                            n_estimators=35,
                            max_depth=12,
                            num_leaves=num_leaves,
                            feature_fraction=0.6,
                            min_data_in_leaf=350,
                            subsample=0.6,
                            metric='rmse',
                            seed=42)

    # No `verbose=` here: LightGBM >= 4.0 rejects that keyword in fit(), and
    # without an eval_set it never produced any output in older versions.
    reg.fit(X_trn, y_trn)
    yhat_trn = reg.predict(X_trn)
    yhat_val = reg.predict(X_val)

    # Report train vs. validation RMSE side by side for this setting.
    print('num_leaves:{:5d} RMSE on train set:{:.6f}, val set:{:.6f}'.format(
        num_leaves,
        np.sqrt(mean_squared_error(yhat_trn, y_trn)),
        np.sqrt(mean_squared_error(yhat_val, y_val)),
        ))

In [None]:
# num_leaves is now 70 (presumably the value picked from the num_leaves sweep).
params_num_leaves = {
    'learning_rate': 0.1,
    'n_estimators': 35,
    'max_depth': 12,
    'num_leaves': 70,
    'feature_fraction': 0.6,
    'min_data_in_leaf': 350,
    'subsample': 0.6,
    'metric': 'rmse',
}
reg = lgb.LGBMRegressor(**params_num_leaves)

# Refit on the full training set and score on the test set.
reg.fit(X_train, y_train)
yhat_test = reg.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(yhat_test, y_test))
print('RMSE on test set:{:.6f}'.format(test_rmse))

In [None]:
# Sweep the minimum number of samples per leaf (LightGBM's min_data_in_leaf)
# on the hold-out split.
List_min_samples_leaf = range(50, 1001, 50)

for min_samples_leaf in List_min_samples_leaf:
    reg = lgb.LGBMRegressor(learning_rate=0.1,
                            n_estimators=35,
                            max_depth=12,
                            num_leaves=70,
                            feature_fraction=0.6,
                            min_data_in_leaf=min_samples_leaf,
                            subsample=0.6,
                            metric='rmse')

    # No `verbose=` here: LightGBM >= 4.0 rejects that keyword in fit(), and
    # without an eval_set it never produced any output in older versions.
    reg.fit(X_trn, y_trn)
    yhat_trn = reg.predict(X_trn)
    yhat_val = reg.predict(X_val)

    # Report train vs. validation RMSE side by side for this setting.
    print('min_samples_leaf:{:5d} RMSE on train set:{:.6f}, val set:{:.6f}'.format(
        min_samples_leaf,
        np.sqrt(mean_squared_error(yhat_trn, y_trn)),
        np.sqrt(mean_squared_error(yhat_val, y_val)),
        ))

In [None]:
# min_data_in_leaf is now 200 (presumably the value picked from the
# min_samples_leaf sweep).
params_min_leaf = dict(
    learning_rate=0.1,
    n_estimators=35,
    max_depth=12,
    num_leaves=70,
    feature_fraction=0.6,
    min_data_in_leaf=200,
    subsample=0.6,
    metric='rmse',
)
reg = lgb.LGBMRegressor(**params_min_leaf)

# Refit on the full training set and score on the test set.
reg.fit(X_train, y_train)
yhat_test = reg.predict(X_test)
test_mse = mean_squared_error(yhat_test, y_test)
print('RMSE on test set:{:.6f}'.format(np.sqrt(test_mse)))

In [None]:
# Sweep the per-tree column sampling fraction on the hold-out split.
# linspace pins both endpoints (0.1 and exactly 1.0); a float-step arange can
# drift at the upper end, and LightGBM rejects feature_fraction > 1.
List_feature_fraction = np.linspace(0.1, 1.0, 10)

for feature_fraction in List_feature_fraction:
    reg = lgb.LGBMRegressor(learning_rate=0.1,
                            n_estimators=35,
                            max_depth=12,
                            num_leaves=70,
                            feature_fraction=feature_fraction,
                            min_data_in_leaf=200,
                            subsample=0.6,
                            metric='rmse')

    # No `verbose=` here: LightGBM >= 4.0 rejects that keyword in fit(), and
    # without an eval_set it never produced any output in older versions.
    reg.fit(X_trn, y_trn)
    yhat_trn = reg.predict(X_trn)
    yhat_val = reg.predict(X_val)

    # Report train vs. validation RMSE side by side for this setting.
    print('feature_fraction:{:.2f} RMSE on train set:{:.6f}, val set:{:.6f}'.format(
        feature_fraction,
        np.sqrt(mean_squared_error(yhat_trn, y_trn)),
        np.sqrt(mean_squared_error(yhat_val, y_val)),
        ))

In [None]:
# feature_fraction is now 0.7 (presumably the value picked from the
# feature_fraction sweep).
params_feature_fraction = {
    'learning_rate': 0.1,
    'n_estimators': 35,
    'max_depth': 12,
    'num_leaves': 70,
    'feature_fraction': 0.7,
    'min_data_in_leaf': 200,
    'subsample': 0.6,
    'metric': 'rmse',
}
reg = lgb.LGBMRegressor(**params_feature_fraction)

# Refit on the full training set and score on the test set.
reg.fit(X_train, y_train)
yhat_test = reg.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(yhat_test, y_test))
print('RMSE on test set:{:.6f}'.format(test_rmse))

In [None]:
# Re-check the tuned tree settings with a 10x smaller learning rate and 10x
# more boosting rounds (0.1 x 35 -> 0.01 x 350), scored on the hold-out split.
slow_learning_rate = 0.01
slow_n_estimators = 350

reg = lgb.LGBMRegressor(learning_rate=slow_learning_rate,
                        n_estimators=slow_n_estimators,
                        max_depth=12,
                        num_leaves=70,
                        feature_fraction=0.7,
                        min_data_in_leaf=200,
                        subsample=0.6,
                        metric='rmse')

# No `verbose=` here: LightGBM >= 4.0 rejects that keyword in fit(), and
# without an eval_set it never produced any output in older versions.
reg.fit(X_trn, y_trn)
yhat_trn = reg.predict(X_trn)
yhat_val = reg.predict(X_val)

# Label the line with the settings this cell actually changes. The previous
# label printed `min_samples_leaf`, a loop variable left over from an earlier
# sweep cell that has nothing to do with this model (and is undefined on a
# fresh kernel if that cell was skipped).
print('learning_rate:{:.2f} n_estimators:{:5d} RMSE on train set:{:.6f}, val set:{:.6f}'.format(
        slow_learning_rate,
        slow_n_estimators,
        np.sqrt(mean_squared_error(yhat_trn, y_trn)),
        np.sqrt(mean_squared_error(yhat_val, y_val)),
))

### Final model:

In [None]:
# Final hyper-parameters: tuned tree shape with the slow learning-rate schedule.
final_params = dict(
    learning_rate=0.01,
    n_estimators=350,
    max_depth=12,
    num_leaves=70,
    feature_fraction=0.7,
    min_data_in_leaf=200,
    subsample=0.6,
    metric='rmse',
)
reg = lgb.LGBMRegressor(**final_params)

# Validation predictions come from a fit on the training part of the split only.
reg.fit(X_trn, y_trn)
yhat_val = reg.predict(X_val)

# Refit on the full training set; `reg` stays in this state for later cells.
reg.fit(X_train, y_train)
yhat_train = reg.predict(X_train)
yhat_test = reg.predict(X_test)

# Negative predictions are replaced by zero, in place, in all three arrays.
for preds in (yhat_val, yhat_train, yhat_test):
    preds[preds < 0] = 0

# RMSE in the order expected by the message: training, validation, test.
rmse_by_set = [
    np.sqrt(mean_squared_error(predicted, actual))
    for predicted, actual in ((yhat_train, y_train), (yhat_val, y_val), (yhat_test, y_test))
]
print('RMSE on \ntraining set::{:.6f}\nval set::{:.6f}\ntest set:{:.6f}'.format(*rmse_by_set))

In [None]:
# Tabulate the fitted model's importances, one row per feature, largest first.
importance_table = pd.DataFrame(
    [reg.feature_importances_],
    columns=X_train.columns,
    index=['feature importance'],
).transpose()
print(importance_table.sort_values(by='feature importance', ascending=False))

# Horizontal bar chart of the same importances, scaled so the largest is 100.
raw_importance = reg.feature_importances_
relative_importance = 100.0 * (raw_importance / raw_importance.max())
order = np.argsort(relative_importance)
# Bar centres sit at 0.5, 1.5, ... so each bar is centred on its tick label.
bar_positions = np.arange(order.shape[0]) + .5

plt.figure(figsize=(6, 12))
plt.barh(bar_positions, relative_importance[order], align='center')
plt.yticks(bar_positions, X_train.columns[order])
plt.title('Feature Importance')
plt.xlabel('Relative Importance')
plt.show()

In [None]:
# Disabled experiment: the whole cell is one bare string literal, so none of
# the code inside it runs (the cell only evaluates to that string).
# It sketches a 5-fold GridSearchCV over n_estimators.
# NOTE(review): if re-enabled, the `n_estimators=n_estimators` argument relies
# on a variable left over from an earlier sweep cell, and `verbose=500` is
# passed to fit() -- both would need fixing first.
'''
List_n_estimators = np.arange(100,1101,200)

reg = lgb.LGBMRegressor(learning_rate=0.01, 
                            n_estimators=n_estimators,
                            max_depth=10,
                            num_leaves=120,
                            feature_fraction=0.8,
                            min_data_in_leaf=150,
                            subsample=0.6,
                           metric='rmse')

rf_cv = GridSearchCV(estimator=reg, param_grid={'n_estimators':List_n_estimators}, 
                   cv=5, scoring='neg_mean_squared_error', return_train_score=True,
                    n_jobs=-1, verbose=10)

rf_cv.fit(X_train, y_train, verbose=500)
cv_results = rf_cv.cv_results_ 
pd.DataFrame(cv_results)
'''

In [None]:
#pd.DataFrame(cv_results).plot(x='param_n_estimators', y=['mean_train_score', 'mean_test_score'])

## 3. Postprocessing

In [None]:
# Disabled export: the whole cell is one bare string literal, so none of the
# code inside it runs. It sketches the per-visitor aggregation of the test
# predictions (expm1 -> sum per fullVisitorId -> log1p) and a CSV dump to
# 'pred_Linear.csv'.
'''
df_final = pd.DataFrame({"fullVisitorId":Id_test})
df_final['PredictedLogRevenue'] = np.expm1(yhat_test)

print(df_final.shape)
df_final = df_final.groupby('fullVisitorId')['PredictedLogRevenue'].sum().reset_index()
df_final.columns = ['fullVisitorId', 'PredictedLogRevenue']
df_final['PredictedLogRevenue'] = np.log1p(df_final['PredictedLogRevenue'])
df_final.to_csv('pred_Linear.csv', index=False)

print(df_final.shape)
print(Id_test.nunique())

df_final
'''

In [None]:
def RMSE_group(Id, y, yhat, setName):
    """Print and return the RMSE of per-visitor log revenue.

    Row-level values are taken to be on the log1p scale: they are mapped back
    with expm1, summed per visitor id, and mapped to the log1p scale again
    before the error is computed.

    Parameters
    ----------
    Id : array-like
        Visitor id of each row; rows with the same id are aggregated together.
    y : array-like
        True row-level target (log1p scale), same length as ``Id``.
    yhat : array-like
        Predicted row-level target (log1p scale), same length as ``Id``.
    setName : str
        Label of the data set, used only in the printed message.

    Returns
    -------
    float
        RMSE between the aggregated predicted and true log revenue.
        (When this call is the last expression of a notebook cell, the value
        is also echoed as the cell output.)

    Raises
    ------
    ValueError
        Raised by pandas if the three inputs do not have the same length.
    """
    # Strip any pandas index so the three inputs are matched row by row.
    # Assigning a Series would align on index labels instead and silently
    # yield NaN when `y` and `Id` carry different indexes.
    per_row = pd.DataFrame({
        'fullVisitorId': np.asarray(Id),
        'PredictedLogRevenue': np.expm1(np.asarray(yhat, dtype=float)),
        'TrueLogRevenue': np.expm1(np.asarray(y, dtype=float)),
    })

    # A single group-by keeps prediction and truth on the same visitor row,
    # instead of relying on two separate aggregations lining up by position.
    per_visitor = np.log1p(
        per_row.groupby('fullVisitorId')[['PredictedLogRevenue', 'TrueLogRevenue']].sum()
    )

    residual = per_visitor['PredictedLogRevenue'] - per_visitor['TrueLogRevenue']
    rmse = float(np.sqrt(np.mean(np.square(residual))))

    print('RMSE on ', setName, ' set:', rmse)
    return rmse

In [None]:
# Summary statistics of the training-set predictions (count, mean, spread,
# quartiles) as a quick look at their distribution.
pd.Series(yhat_train).describe()

In [None]:
# Per-visitor RMSE for each data set, reported in train / validation / test order.
grouped_eval_sets = (
    (Id_train, y_train, yhat_train, 'train'),
    (Id_val, y_val, yhat_val, 'validation'),
    (Id_test, y_test, yhat_test, 'test'),
)
for set_ids, set_truth, set_preds, set_label in grouped_eval_sets:
    RMSE_group(set_ids, set_truth, set_preds, set_label)