In [1]:
# Mount Google Drive into the Colab filesystem so the dataset files under
# /content/drive are readable by the cells below.
from google.colab import drive

# Interactive step: on a fresh runtime this prompts for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import gc
import re

from tqdm import tqdm

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import scale, minmax_scale
from scipy.stats import norm

import lightgbm as lgb

import warnings
# Silence all warnings for the rest of the session. Both calls install a
# catch-all "ignore" filter, so one of them would suffice.
# NOTE(review): this also hides pandas / LightGBM deprecation warnings.
warnings.simplefilter(action='ignore')
warnings.filterwarnings('ignore')

# Project root on the mounted Drive. The working directory is switched to it;
# note that PATH already ends with a slash.
PATH='/content/drive/My Drive/Colab Notebooks/nba/'
os.chdir(PATH)

In [27]:
# Read the labelled training posts and the holdout posts.
# os.path.join is used because PATH already ends with '/', so the previous
# f'{PATH}/dataset/...' form produced a doubled slash in the file name.
train = pd.read_csv(os.path.join(PATH, 'dataset', 'training_set.csv'), encoding='ISO-8859-1')
test = pd.read_csv(os.path.join(PATH, 'dataset', 'holdout_set.csv'), encoding='ISO-8859-1')

train.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description
0,502093,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...
1,603380,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...
2,603380,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!
3,725100,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!
4,661446,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...


In [28]:
# Row counts of the training and holdout frames.
train.shape[0], test.shape[0]

(7766, 1000)

In [0]:
# Single seed shared by NumPy's global RNG here and by the cross-validation
# splitter and LightGBM parameters defined in later cells.
random_state = 42
np.random.seed(random_state)

In [0]:
def str_to_date(s, split):
    """Return the piece of ``s`` at position ``split`` after splitting on single spaces.

    For a timestamp such as ``'2019-05-21 23:30:51 EDT'`` position 0 is the
    date, 1 the time and 2 the timezone abbreviation.
    """
    tokens = s.split(' ')
    return tokens[split]
  
def add_datepart(df, fldname, drop=True, time=False):
    """Expand the date column ``fldname`` of ``df`` into calendar features, in place.

    The column is parsed with ``pd.to_datetime`` if it is not already a
    datetime column. New columns are named ``<prefix><Attribute>``, where the
    prefix is ``fldname`` with a trailing ``date``/``Date`` removed (so the
    column ``'date'`` yields ``'Year'``, ``'Month'``, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    fldname : str
        Name of the date column to expand.
    drop : bool, default True
        Remove ``fldname`` from ``df`` once the features are added.
    time : bool, default False
        Also add ``Hour``, ``Minute`` and ``Second`` columns.

    Added columns
    -------------
    Year, Month, Week (ISO week number), Day, Dayofweek, Dayofyear, the six
    ``Is_*`` boundary flags, optionally Hour/Minute/Second, and ``Elapsed``
    (whole seconds since 1970-01-01 00:00:00 UTC).
    """
    fld = df[fldname]

    # Covers naive and timezone-aware datetime columns. Unlike np.issubdtype,
    # it does not choke on pandas extension dtypes.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)

    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']

    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar() is its
            # replacement. Cast to the dtype of the other integer parts so the
            # column does not come back as the nullable UInt32 type.
            values = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + n] = values

    # Epoch seconds by timedelta division: correct for any datetime resolution,
    # whereas astype(int64) // 10**9 is only right for nanosecond data.
    utc_naive = fld.dt.tz_convert(None) if fld.dt.tz is not None else fld
    df[targ_pre + 'Elapsed'] = (utc_naive - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

    if drop:
        df.drop(fldname, axis=1, inplace=True)

In [0]:
def preprocess(df):
    """Derive model features from the raw post data and return the widened frame.

    ``df['Created']`` is expected to look like ``'2019-05-21 23:30:51 EDT'``
    (date, time and timezone abbreviation separated by single spaces).

    Steps:
      1. Split ``Created`` into ``date``, ``time`` and ``tz`` columns.
      2. Expand ``date`` into calendar features with ``add_datepart`` (which
         also removes the ``date`` column).
      3. One-hot encode ``Type`` and ``tz`` and append the indicator columns.

    Note that steps 1-2 add columns to the ``df`` passed in; the one-hot
    columns exist only on the returned frame.

    NOTE(review): the indicator columns come from the values present in
    ``df``, so two frames with different ``Type``/``tz`` values end up with
    different columns.
    """
    # One vectorised split instead of three row-by-row ``apply`` passes.
    created_parts = df['Created'].str.split(' ', expand=True)
    df['date'] = pd.to_datetime(created_parts[0])
    df['time'] = created_parts[1]
    df['tz'] = created_parts[2]

    add_datepart(df, 'date')

    type_dummies = pd.get_dummies(df['Type'], prefix='Type')
    tz_dummies = pd.get_dummies(df['tz'], prefix='tz')

    return pd.concat([df, type_dummies, tz_dummies], axis=1)

In [0]:
# Run the same feature pipeline over both frames (train first, then holdout).
train, test = preprocess(train), preprocess(test)

In [33]:
# Both frames should report the same number of columns after preprocessing.
tuple(frame.shape for frame in (train, test))

((7766, 25), (1000, 25))

## Adversarial validation

In [0]:
# Model inputs: every column except raw text/timestamp fields, the columns
# already one-hot encoded, Year/Elapsed, and the Engagements column.
features = [
    column
    for column in train.columns
    if column not in {'Created', 'Description', 'time', 'Year',
                      'Type', 'tz', 'Elapsed', 'Engagements'}
]

In [0]:
# Build the adversarial-validation frame: training rows are labelled 1 and
# holdout rows 0, stacked into a single frame that replaces `train`.
# NOTE: this cell is not idempotent; re-running it stacks `test` a second time.
len_train = len(train)
train['target'] = 1
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
train = pd.concat([train, test], ignore_index=True, sort=False)
# Holdout rows have no 'target' yet, so they come through as NaN; mark them 0.
train['target'] = train['target'].fillna(0)

In [0]:
# LightGBM settings for the binary train-vs-holdout classifier, scored by AUC.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'verbose': 1,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.7,
    'min_data_in_leaf': 200,
    'bagging_fraction': 0.8,
    'bagging_freq': 20,
    'min_hessian': 0.01,
    # Fixed seeds so repeated runs give the same model.
    'feature_fraction_seed': 2,
    'bagging_seed': 3,
    'seed': random_state,
}

In [0]:
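# Disabled experiment: overwrite 'Engagements' for the appended holdout rows
# (train.iloc[len_train:]) with simulated values.
# NOTE: do not re-enable as written. np.random.rand draws from uniform [0, 1),
# not a normal distribution (np.random.randn would), the row count 1000 is
# hard-coded, and train.iloc[len_train:]['Engagements'] = ... is chained
# indexing that may assign to a temporary copy; use
# train.loc[len_train:, 'Engagements'] = ... instead.
# eng_mean = train['Engagements'].mean()
# eng_std = train['Engagements'].std()

# eng_simulated = np.random.rand(1000) * eng_std + eng_mean

# train.iloc[len_train:]['Engagements'] = eng_simulated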

In [38]:
# First rows of the model inputs (the frame starts with the training rows).
train.loc[:, features].head()

Unnamed: 0,Followers at Posting,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Type_Album,Type_Photo,Type_Video,tz_EDT,tz_EST
0,36984682,5,21,21,1,141,False,False,False,False,False,False,0,0,1,1,0
1,36984682,5,21,21,1,141,False,False,False,False,False,False,0,0,1,1,0
2,36984682,5,21,21,1,141,False,False,False,False,False,False,0,0,1,1,0
3,36984682,5,21,21,1,141,False,False,False,False,False,False,0,0,1,1,0
4,36984682,5,21,21,1,141,False,False,False,False,False,False,0,0,1,1,0


In [39]:
# Last rows of the model inputs (the holdout rows were stacked at the end).
train.loc[:, features].tail()

Unnamed: 0,Followers at Posting,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Type_Album,Type_Photo,Type_Video,tz_EDT,tz_EST
8761,24893189,10,40,3,1,276,False,False,False,False,False,False,0,0,1,1,0
8762,24893189,10,40,3,1,276,False,False,False,False,False,False,0,1,0,1,0
8763,24885537,10,40,2,0,275,False,False,False,False,False,False,0,0,1,1,0
8764,24885537,10,40,2,0,275,False,False,False,False,False,False,0,0,1,1,0
8765,24875542,10,39,1,6,274,False,True,False,True,False,False,0,0,1,1,0


In [40]:
# (rows, columns) of the combined frame, which now carries the 'target' label.
len(train.index), len(train.columns)

(8766, 26)

In [0]:
# Stratified 5-fold split so each fold keeps the train/holdout label ratio.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# Out-of-fold frame: the true label plus a slot for each row's prediction.
# The slot is created as float (0.0, not 0) because it later receives
# probabilities; writing floats into an integer column relies on silent
# upcasting, which newer pandas versions deprecate.
oof = train[['target']].copy()
oof['predict'] = 0.0

# Best validation AUC of each fold.
val_aucs = []

In [42]:
# Adversarial validation: train a classifier to tell training rows (target=1)
# from holdout rows (target=0). A validation AUC close to 0.5 means the model
# cannot separate the two sets using `features`.

# Start from a clean slate so re-running this cell does not pile up scores.
val_aucs = []

# Out-of-fold predictions are collected in a float array and written to `oof`
# once after the loop. The previous `oof['predict'][val_idx] = p_valid` was
# chained indexing, which pandas does not guarantee to write back into `oof`.
oof_predictions = np.zeros(len(train), dtype=float)

# Early stopping, logging and metric recording are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` / `evals_result` keyword arguments
# were removed from `lgb.train` in LightGBM 4. Older releases expose the
# logging callback as `print_evaluation` instead of `log_evaluation`.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

for fold, (trn_idx, val_idx) in enumerate(skf.split(train, train['target'])):
    X_train, y_train = train.iloc[trn_idx][features], train.iloc[trn_idx]['target']
    X_valid, y_valid = train.iloc[val_idx][features], train.iloc[val_idx]['target']
    trn_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    evals_result = {}
    lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        1000,
                        valid_sets=[val_data],
                        callbacks=[lgb.early_stopping(100),
                                   log_evaluation(50),
                                   lgb.record_evaluation(evals_result)])

    # X_valid already holds only the feature columns.
    p_valid = lgb_clf.predict(X_valid, num_iteration=lgb_clf.best_iteration)

    oof_predictions[val_idx] = p_valid
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)

oof['predict'] = oof_predictions

Training until validation scores don't improve for 100 rounds.
[50]	valid_0's auc: 0.499459
[100]	valid_0's auc: 0.499815
Early stopping, best iteration is:
[3]	valid_0's auc: 0.509749
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's auc: 0.501978
[100]	valid_0's auc: 0.500481
Early stopping, best iteration is:
[3]	valid_0's auc: 0.520576
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's auc: 0.50797
[100]	valid_0's auc: 0.513136
[150]	valid_0's auc: 0.507978
Early stopping, best iteration is:
[93]	valid_0's auc: 0.513949
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's auc: 0.496175
[100]	valid_0's auc: 0.49567
[150]	valid_0's auc: 0.496655
Early stopping, best iteration is:
[80]	valid_0's auc: 0.501788
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's auc: 0.488764
[100]	valid_0's auc: 0.49459
Early stopping, best iteration is:
[6]	valid_0's auc: 0.502659
